diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 5bb2fe4ac384add95b0177ba3ae1d192742481ce..88840910a586346f4f962e594e02fa6e2e8179d0 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -156,7 +156,6 @@ class ActivationFunctor { T relux_max_limit_; cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::string tuning_key_prefix_; std::vector input_shape_; }; diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index e2d875e9a73e9d12668e6f11388060a35454e8ec..c8bb601620f1965b126bc39d2ef2259d26e91e68 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -91,7 +91,6 @@ struct AddNFunctor { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index f17db80a48295d1bf7a24e5775fae4a17f9a81f0..5e8ae34f9a9aaef596090cc7113c440b425021cf 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -158,7 +158,6 @@ struct BatchNormFunctor : BatchNormFunctorBase { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index f2f917f4f5073c3e585e70260678c95f9f13f59c..a3d1ff61b1c0fa0be7c7eb0506c495fc1331476c 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -65,7 +65,6 @@ struct BiasAddFunctor { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index b93e657837a50f658aa9c3444b99e3a0d65cf761..69332e0182e4623a7ffb9e1e87fd05fbd9ca5b75 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -57,7 +57,6 @@ struct ChannelShuffleFunctor { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; const int groups_; std::vector input_shape_; }; diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 7186bde6e452983b3bc5620e3b620086907e19ab..6c803b5f2a32544cac7d79145b691b3166faeb9b 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -86,7 +86,6 @@ struct ConcatFunctor : ConcatFunctorBase { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index f2d3dfbb53c40ca5ff5e7753333c88300ac8b535..4cd05a65d527a45caf8c34486be2696511406589 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -402,7 +402,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 6b439db67ecb2c5c2f6ee2390e7900adfc90a307..6d30673976df6399351ad50c871d2f9e578cea22 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; const int block_size_; bool d2s_; std::vector input_shape_; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index ba4f74c8a12132b1780467b38e35f52a8e127063..90c17b19e71553424c5f68eee1cc3bc9ffa2b279 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 11d52bc97e8802b04058589c6eb3bdb057607f00..9c7f0a901a5968f1d0f4cf5c7af8ceeebb465f7e 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -98,7 +98,6 @@ struct EltwiseFunctor : EltwiseFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 1ce9b6fd07f4a377664b03b821cf1b170dadea19..b025cbfebe29efa20d65838328458eb73befb823 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -242,7 +242,6 @@ struct MatMulFunctor { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index f41513c54dd66b38d28e9c0f21b21998711d0d0a..368a78bc9594da81e84831136fbfcfe93a4f295f 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -27,16 +27,14 @@ void ActivationFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } switch (activation_) { case RELU: @@ -63,6 +61,9 @@ void ActivationFunctor::operator()(const Tensor *input, LOG(FATAL) << "Unknown activation type: " << activation_; } kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const uint32_t gws[3] = {static_cast(channel_blocks), @@ -71,7 +72,7 @@ void ActivationFunctor::operator()(const Tensor *input, if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -85,9 +86,6 @@ void ActivationFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index c2c19fa780a19e05aa43702322be799f97412417..dc2aabeb949f9c6e54b41ebc8c2eb1324f1e4694 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -35,8 +35,6 @@ void AddNFunctor::operator()( } if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); if (input_tensors.size() > 4) { MACE_NOT_IMPLEMENTED; } @@ -47,11 +45,14 @@ void AddNFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector output_shape = input_tensors[0]->shape(); @@ -70,7 +71,7 @@ void AddNFunctor::operator()( output_tensor->ResizeImage(output_shape, output_image_shape); uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); } @@ -80,9 +81,6 @@ void AddNFunctor::operator()( kernel_.setArg(idx++, *(output_tensor->opencl_image())); input_shape_ = input_tensors[0]->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {kwg_size_ / 16, 16, 1}; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 8065acba80374af51a2f1368e45f1e46ca4fc869..09be320fc680cb0e34306762a4672cd774f1d5ea 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -38,16 +38,14 @@ void BatchNormFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } if (folded_constant_) { built_options.emplace("-DFOLDED_CONSTANT"); @@ -72,10 +70,13 @@ void BatchNormFunctor::operator()(const Tensor *input, } kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -92,9 +93,6 @@ void BatchNormFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, relux_max_limit_); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index a518f074209a05b2676356ff6bc3d50bf890abd7..1197a3590599b4aac96ac97ce0c8fc58915d0046 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -30,22 +30,23 @@ void BiasAddFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -54,16 +55,13 @@ void BiasAddFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8}; cl::Event event; cl_int error; - if (is_non_uniform_work_groups_supported_) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 5d7ae4c1b47d7af2a0897ba17bf158148bdf5b12..8b570b3369aca067d53bb0286c3bf9c354f8f74a 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -62,16 +62,13 @@ void BufferToImageFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); @@ -87,7 +84,7 @@ void BufferToImageFunctor::operator()( obfuscated_kernel_name, built_options); uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[1]); } @@ -123,7 +120,7 @@ void BufferToImageFunctor::operator()( cl::Event event; cl_int error; - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 29097345417aa0a1e30f532a25c8735b2831dd05..59f2c9518061a7e1e6f0e8071a18cec699814be4 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -37,24 +37,25 @@ void ChannelShuffleFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -65,9 +66,6 @@ void ChannelShuffleFunctor::operator()( kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 7976dd38c811a082c186507c2f7a5f446aadd6c9..42afc7012528242475b3fc61a8a9bdfdb5623772 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,11 +1,7 @@ #include __kernel void activation( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, #ifdef USE_PRELU __read_only image2d_t alpha, @@ -16,7 +12,7 @@ __kernel void activation( const int w = get_global_id(1); const int hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 09dd5c388de0bf5b30a873d6a20a55181e341d06..d0604f9ed074c5a5d2729fd8c66751d9ab7b751b 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,10 +1,7 @@ #include __kernel void addn( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #if INPUT_NUM > 2 @@ -17,7 +14,7 @@ __kernel void addn( const int w = get_global_id(0); const int hb = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || hb >= global_size_dim1) return; #endif diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 0e592fdcbd26bfa3b274ea222d7d7cb3070b132d..0075932dbac599780803ac7041da293dfbbc1447 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,11 +1,7 @@ #include // Supported data types: half/float __kernel void batch_norm( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, @@ -20,7 +16,7 @@ __kernel void batch_norm( const int w = get_global_id(1); const int hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index ee7b60786a57466178e3517face1ee7900ac639c..a2d99abcc8e21e19e0710db8f752df3a6032d56f 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -1,11 +1,7 @@ #include // Supported data types: half/float __kernel void bias_add( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t output) { @@ -13,7 +9,7 @@ __kernel void bias_add( const int w = get_global_id(1); const int hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 7e764503656fbf7d4ed41c245753663899c85fcb..86071708117efe6a7d4f0580d0324e2ad0701962 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,10 +1,7 @@ #include __kernel void filter_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, /* h, w, oc, ic */ __private const int input_offset, __private const int filter_h, @@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image( } __kernel void filter_image_to_buffer( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global DATA_TYPE *output, /* h, w, oc, ic */ __private const int filter_h, __private const int filter_w, @@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer( } __kernel void dw_filter_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, /* h, w, ic, m */ __private const int input_offset, __private const int filter_w, @@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image( const int w = get_global_id(0); const int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image( } __kernel void in_out_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image( } __kernel void in_out_image_to_buffer( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global DATA_TYPE *output, /* nhwc */ __private const int height, __private const int width, @@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer( } __kernel void arg_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int count, @@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image( } __kernel void arg_image_to_buffer( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global DATA_TYPE *output, /* nhwc */ __private const int count, __read_only image2d_t input) { int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer( __kernel void in_out_height_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, //nhwc __private const int input_offset, __private const int height, @@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image( } __kernel void in_out_height_image_to_buffer( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global DATA_TYPE *output, //nhwc __private const int height, __private const int width, @@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer( __kernel void in_out_width_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image( // only support 3x3 now __kernel void winograd_filter_buffer_to_image( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, __private const int in_channels, @@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image( int w = get_global_id(0); int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } @@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image( // only support 3x3 now __kernel void winograd_filter_image_to_buffer( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, __private const int width, @@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer( const int w = get_global_id(0); const int h = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || h >= global_size_dim1) { return; } diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index 5bf0e067a30d4248eff32858ec11fe171fff53b5..3fa2894e8bf60b8e7528ccd2562fc179afd9f46e 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -2,11 +2,7 @@ // assume channes_per_group mod 4 = 0 && groups mod 4 == 0 __kernel void channel_shuffle( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int groups, __private const int channels_per_group, @@ -15,7 +11,7 @@ __kernel void channel_shuffle( const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index dd61012cebdb887639f132d63e48d9521898e443..30aad065ac12da29c3eb661bcf17069a023b16e6 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -18,6 +18,23 @@ #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) +#ifndef NON_UNIFORM_WORK_GROUP + +#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \ + __private const int global_size_dim0, \ + __private const int global_size_dim1, +#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \ + __private const int global_size_dim0, \ + __private const int global_size_dim1, \ + __private const int global_size_dim2, + +#else + +#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 +#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 + +#endif + __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 2658025df79333f23c063a99310c1acecb3cd014..3b7370a8a30ba21a0c22305d1ef84e66314d7153 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, // Supported data type: half/float __kernel void concat_channel( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input0, __read_only image2d_t input1, __private const int input0_chan, @@ -36,7 +32,7 @@ __kernel void concat_channel( const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; @@ -89,11 +85,7 @@ __kernel void concat_channel( // Required: All input channels are divisible by 4 __kernel void concat_channel_multi( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { @@ -101,7 +93,7 @@ __kernel void concat_channel_multi( const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index f40f31dab55dd7da07385004185ea5d9e84b9bfc..1383557d89d96b4a282773f16dabbae59b7b798b 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,11 +1,7 @@ #include __kernel void conv_2d( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ #ifdef BIAS @@ -29,7 +25,7 @@ __kernel void conv_2d( const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 96d9a2c06aae8fd81e420eb1b0b407149bc13bdd..e993a159e3e82fa5c110881647ffb290b75c4832 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,11 +1,7 @@ #include __kernel void conv_2d_1x1( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ #ifdef BIAS @@ -23,7 +19,7 @@ __kernel void conv_2d_1x1( const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index b159fd6af0fdcdecd2f9586f9992f1412548398c..8bc27b33569109fad1e9207c910299c6ebcaac0b 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,11 +1,7 @@ #include __kernel void conv_2d_3x3( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ #ifdef BIAS @@ -27,7 +23,7 @@ __kernel void conv_2d_3x3( const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 2a5a8893ea2c86ba6a13cc1d9dc24dfa728088ac..a52617c87367635f697fc29f7c56315b6347bf13 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,11 +1,7 @@ #include __kernel void depth_to_space( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int block_size, __private const int output_depth, @@ -14,7 +10,7 @@ __kernel void depth_to_space( const int out_w = get_global_id(1); const int out_h = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_d >= global_size_dim0 || out_w >= global_size_dim1 || out_h >= global_size_dim2) { return; @@ -43,11 +39,7 @@ __kernel void depth_to_space( } __kernel void space_to_depth( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int block_size, __private const int input_depth, @@ -57,7 +49,7 @@ __kernel void space_to_depth( const int w = get_global_id(1); const int h = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (d >= global_size_dim0 || w >= global_size_dim1 || h >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 1974d8db50935f2abc96e6a309e8fc41f448deb3..fff19613c9dfad3f3e4a80fed57c60e99d1ec43f 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -2,11 +2,7 @@ // Only multiplier = 1 is supported __kernel void depthwise_conv2d( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -29,7 +25,7 @@ __kernel void depthwise_conv2d( const short out_w_blk = get_global_id(1); const short out_hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; @@ -143,11 +139,7 @@ __kernel void depthwise_conv2d( } __kernel void depthwise_conv2d_s1( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1( const short out_w_blk = get_global_id(1) << 2; const short out_hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1 || out_hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 9a69af1af46cacff54c20c1c5583f25bfdec4907..def21f0a993b75d321729e5c89b080555c1dcdf7 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,10 +1,7 @@ #include __kernel void eltwise( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #ifdef COEFF_SUM @@ -15,7 +12,7 @@ __kernel void eltwise( const int w = get_global_id(0); const int hb = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (w >= global_size_dim0 || hb >= global_size_dim1) return; #endif diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index c3efc9f25a8869af9e10f27d74d6ea113d94826e..fe260e7a22477ea958936b30378b439c8c94fb2f 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -2,10 +2,7 @@ // C = A * B __kernel void matmul( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __read_only image2d_t A, __read_only image2d_t B, __write_only image2d_t C, @@ -17,7 +14,7 @@ __kernel void matmul( const int gx = get_global_id(0) << 2; const int hb = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return; #endif diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index 0a28b745d5f78cbd8b73181dff2b960c664aa6ae..ead839940e5081e6d90e841f3eda569339a2ffa1 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size, // Supported data type: half/float __kernel void pooling( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int in_height, __private const int in_width, @@ -39,7 +35,7 @@ __kernel void pooling( const int out_width_idx = get_global_id(1); const int out_hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 || out_hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 8c5b7a33b561c7bbd37366630725546f64a9d72d..83e6df85c5c7c1c4b1ee9facf62d73c7cd0f5a58 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,11 +1,7 @@ #include __kernel void resize_bilinear_nocache( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __write_only image2d_t output, __private const float height_scale, @@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache( const int w = get_global_id(1); const int hb = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index 4517ec99a0f23c58377de0d784cb59f39bf604c1..eccdd882c75a809804d61599b5288a432d2d432e 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -1,11 +1,7 @@ #include __kernel void slice( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { @@ -13,7 +9,7 @@ __kernel void slice( const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 11ff80bf9f601d874ea81df7fac2f12a8001fa2e..628d71cbb3a22b4ca240446c7e7c889dc1fc55ab 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -1,11 +1,7 @@ #include __kernel void softmax( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t input, __private const int channels, __private const int remain_channels, @@ -14,7 +10,7 @@ __kernel void softmax( const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 0a54601269284a5038d733154a836515618e7443..1e2024043f97f835c03c41ecc03fea7b86617ed2 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,11 +1,7 @@ #include __kernel void space_to_batch( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t space_data, __write_only image2d_t batch_data, __private const int block_height, @@ -20,7 +16,7 @@ __kernel void space_to_batch( const int batch_w_idx = get_global_id(1); const int batch_hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 || batch_hb_idx >= global_size_dim2) { return; @@ -53,11 +49,7 @@ __kernel void space_to_batch( } __kernel void batch_to_space( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 __read_only image2d_t batch_data, __write_only image2d_t space_data, __private const int block_height, @@ -72,7 +64,7 @@ __kernel void batch_to_space( const int batch_w_idx = get_global_id(1); const int batch_hb_idx = get_global_id(2); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 || batch_hb_idx >= global_size_dim2) { return; diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index f3f99cfa567514ae4ba3a99c73930dd8e13efa6f..d447001e6ccd09f44f2d60c658be778c0e1fbff9 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -1,10 +1,7 @@ #include __kernel void winograd_transform_2x2( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __read_only image2d_t input, __write_only image2d_t output, __private const int in_height, @@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2( int out_width_idx = get_global_id(0); int chan_blk_idx = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } @@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2( } __kernel void winograd_inverse_transform_2x2( -#ifndef USE_QUALCOMM_OPENCL_2_0 - __private const int global_size_dim0, - __private const int global_size_dim1, -#endif + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 __read_only image2d_t input, #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ @@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2( const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); -#ifndef USE_QUALCOMM_OPENCL_2_0 +#ifndef NON_UNIFORM_WORK_GROUP if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 56449d14188d778fc25cc5e906f2b083f9e9ce42..bccc8623997c7b356190ecc4818fc3394eb45d89 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } if (input0->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); @@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel, built_options.emplace("-DDIVISIBLE_FOUR"); } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } if (!IsVecEqual(*prev_input_shape, input0->shape())) { uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel, *(static_cast(output->opencl_image()))); *prev_input_shape = input0->shape(); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const std::vector lws = {8, *kwg_size / 64, 8, 1}; @@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel, const DataType dt, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const int inputs_count = input_list.size(); @@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel, }; uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel, kernel->setArg(idx++, *(output->opencl_image())); chan_blk_offset += input_channel_blk; - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" @@ -178,13 +172,12 @@ void ConcatFunctor::operator()( switch (inputs_count) { case 2: Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, - &input_shape_, output, future, - &is_non_uniform_work_groups_supported_, &kwg_size_); + &input_shape_, output, future, &kwg_size_); break; default: if (divisible_four) { ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future, - &is_non_uniform_work_groups_supported_, &kwg_size_); + &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index b9fa2d4c86b259bf9f9691654a92746071cad545..5a002666320dfbfbea8263d1693bae53231da952 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); extern void Conv2dOpenclK3x3(cl::Kernel *kernel, @@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); extern void Conv2dOpencl(cl::Kernel *kernel, @@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); template @@ -68,7 +65,7 @@ void Conv2dFunctor::operator()(const Tensor *input, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, std::vector *input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); + uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; @@ -109,12 +106,12 @@ void Conv2dFunctor::operator()(const Tensor *input, conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &is_non_uniform_work_groups_supported_, &kwg_size_); + &kwg_size_); } else { Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &is_non_uniform_work_groups_supported_, &kwg_size_); + &kwg_size_); } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 34055189421eeb7fb583b4e610f51cd01731b1ed..5cdf8e5608386ea7547f20d239e5126d69d50efc 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); MACE_CHECK(input_batch == batch); std::set built_options; @@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } if (bias != nullptr) { built_options.emplace("-DBIAS"); @@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, } *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const uint32_t gws[3] = {static_cast(channel_blocks), @@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, kernel->setArg(idx++, stride); *prev_input_shape = input->shape(); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const std::vector lws = {8, *kwg_size / 64, 8, 1}; diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 88793dacb06dc5e75ca6bee7a8cb57c8dd0c6775..d0f587452579a7dfe6dbd64d29fa02ab9bf73297 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { @@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, } *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const uint32_t gws[3] = {static_cast(channel_blocks), @@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, kernel->setArg(idx++, dilations[1]); *prev_input_shape = input->shape(); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const std::vector lws = {4, *kwg_size / 32, 8, 1}; diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 191322095748de1b7a8ee2c4f6ca7049fca5a8b2..b9b2fec56a13dc55ee97649098f0674425ddcd4e 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { @@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, } *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const uint32_t gws[3] = {static_cast(channel_blocks), @@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel, kernel->setArg(idx++, dilations[1]); *prev_input_shape = input->shape(); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const std::vector lws = {8, *kwg_size / 64, 8, 1}; diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 83cff27354a563d63094d1541a4735e51b136396..8fc0924704badcf1f37d9a55b8c0188e65b295de 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("depth_to_space", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t gws[3]; @@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor::operator()( } uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor::operator()( kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 11bb38b3a603055b738e4ed2a6ea609a12e1d24b..5e0b99ba1e5a00e25eec931f209c0a28c8ed85b2 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { - *is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { @@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - if (*is_non_uniform_work_groups_supported) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); @@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t input_batch = input->dim(0); @@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, input_channels); uint32_t idx = 0; - if (!(*is_non_uniform_work_groups_supported)) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); @@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, } *prev_input_shape = input->shape(); - - *kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const std::vector lws = {8, *kwg_size / 64, 8, 1}; @@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor::operator()( DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &is_non_uniform_work_groups_supported_, &kwg_size_); + &kwg_size_); } template struct DepthwiseConv2dFunctor; diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index 38a231da40224da36eb0a9e972764b0a1915ead9..c23534bbce5ca423314d23bd470a5cbc2289ae1e 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -30,8 +30,6 @@ void EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); @@ -39,15 +37,18 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); } @@ -60,9 +61,6 @@ void EltwiseFunctor::operator()(const Tensor *input0, kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input0->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {kwg_size_ / 16, 16, 1}; diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index b8b8d6a3cc51de74c9cab7a8f85f86f61d218abd..2ab8cde257de73b8dee80300afa89122dd3cf126 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; std::vector roundup_gws(3); - if (!is_non_uniform_work_groups_supported) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 3; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); @@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool is_non_uniform_work_groups_supported = - runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; std::vector roundup_gws(2); - if (!is_non_uniform_work_groups_supported) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 2; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (is_non_uniform_work_groups_supported) { + if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 79dcc40d67b0c2a855fb68ab632409e0d868aa6f..7a4822096309297d8c11e80c1f7eb6ea5069b48f 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -34,21 +34,22 @@ void MatMulFunctor::operator()(const Tensor *A, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); } @@ -61,8 +62,6 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(height_blocks)); kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {kwg_size_ / 64, 64, 1}; std::stringstream ss; ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 9b2f96c8f1e92d7c64897bd3d5c3c7f83bcd63ea..774fd5ee2ac60875491deb00f1ce4fcbef8ba97e 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -21,8 +21,6 @@ void PoolingFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); const DataType dt = DataTypeToEnum::value; std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); @@ -39,10 +37,13 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector gws; @@ -81,7 +82,7 @@ void PoolingFunctor::operator()(const Tensor *input, }; uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -97,9 +98,6 @@ void PoolingFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } else { index_t batch = output->dim(0); index_t out_height = output->dim(1); diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index ce2fe7bfbc28c9d4563e6e900d87ec753179af84..ac2733e9e511c8a32dee3371c9ed404be964cb90 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -31,19 +31,20 @@ void ResizeBilinearFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { MACE_CHECK(out_height > 0 && out_width > 0); @@ -60,7 +61,7 @@ void ResizeBilinearFunctor::operator()( CalculateResizeScale(in_width, out_width, align_corners_); uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -74,9 +75,6 @@ void ResizeBilinearFunctor::operator()( kernel_.setArg(idx++, static_cast(out_height)); input_shape_ = input->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index d610e1e14b6bfd49d61ed93fc95c150afd1a665e..850f08f6df5b091750d0b6ac203bf3e72e00099c 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -32,18 +32,19 @@ void SliceFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const index_t channel_blk = RoundUpDiv4(output_channels); @@ -53,8 +54,6 @@ void SliceFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "slice_opencl_kernel_" @@ -65,7 +64,7 @@ void SliceFunctor::operator()( << outputs_count; for (int i = 0; i < outputs_count; ++i) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 61ea022890a30875a845b6034e37ef8b80081e9b..ea4f0b3e13d2d023a9dc98691ce47cb269f97714 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -30,22 +30,23 @@ void SoftmaxFunctor::operator()(const Tensor *logits, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, logits->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -56,9 +57,6 @@ void SoftmaxFunctor::operator()(const Tensor *logits, kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = logits->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 38da0548b974269a114022d71b6e792493e2c71e..31b5013b737335c40255d9d4163e1d2fb8572d68 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -39,8 +39,6 @@ void SpaceToBatchFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; @@ -49,15 +47,18 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(space_shape_, space_tensor->shape())) { uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); @@ -79,9 +80,6 @@ void SpaceToBatchFunctor::operator()( kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); space_shape_ = space_tensor->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {8, kwg_size_ / 64, 8, 1}; diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 092a60cd805acd2b28cb909395846ed8ec3bd1e2..f4fd5525960706e31ae3a303d00c50e534bfeaec 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -18,8 +18,6 @@ void WinogradTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); std::set built_options; @@ -28,11 +26,14 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector output_shape(4); std::vector filter_shape = {3, 3, input_tensor->dim(3), 1}; @@ -61,7 +62,7 @@ void WinogradTransformFunctor::operator()( output_tensor->ResizeImage(output_shape, image_shape); uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); } @@ -76,9 +77,6 @@ void WinogradTransformFunctor::operator()( kernel_.setArg(idx++, static_cast(paddings[1] / 2)); input_shape_ = input_tensor->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {kwg_size_ / 8, 8, 1}; @@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - is_non_uniform_work_groups_supported_ = - runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); std::set built_options; @@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_non_uniform_work_groups_supported_) { - built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation_) { @@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor::operator()( kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const uint32_t gws[2] = { @@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor::operator()( const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2; uint32_t idx = 0; - if (!is_non_uniform_work_groups_supported_) { + if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); } @@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor::operator()( kernel_.setArg(idx++, relux_max_limit_); input_shape_ = input_tensor->shape(); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const std::vector lws = {kwg_size_ / 8, 8, 1}; diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 52dd12342ec360c07de992d413eac509b8f5778b..ac5c7987fbeda1e76c7e5a13f4a5166c3a2f6c07 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -186,7 +186,6 @@ struct PoolingFunctor : PoolingFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 09ae3ba5075bc959e7b571db40d06dc548b0bdd4..4e0c5ae3cb04def05794faab46cfc5dc90727e3c 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -174,7 +174,6 @@ struct ResizeBilinearFunctor cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index ce7431da3da8d0f2b39d6c5c38b694867c866365..1bde41e539b0aa9bd8b458261913a9957da2ec0a 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -62,7 +62,6 @@ struct SliceFunctor { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index b491e2ad39249f1e66233375aaa3c904951f2b84..e2b8efc8d8673f26b01a2124dcc3fb64730ad0d5 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -62,7 +62,6 @@ struct SoftmaxFunctor { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 6bd66cbb3e721beb254b06486b12ebb52ab184cd..4e0d4c591e9b6cfe8544c1bfc4d98936c921dee1 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -57,7 +57,6 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector space_shape_; }; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index df12ab36227eab19372c53e02f0f4110c937bd00..e3169541dd197764d284d35c7eabbb61a2ba38b6 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -52,7 +52,6 @@ struct WinogradTransformFunctor cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; @@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor cl::Kernel kernel_; uint32_t kwg_size_; - bool is_non_uniform_work_groups_supported_; std::vector input_shape_; };