diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 798409abf683c489104a96a80bf9938b906e3d4f..dcd7fab4a019c5a49772b7794c1aa17e0bbd4e26 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -142,17 +142,17 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   bool gpu_detected = false;
-  bool is_adreno_gpu = false;
   device_ = std::make_shared<cl::Device>();
   for (auto device : all_devices) {
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
       *device_ = device;
       gpu_detected = true;
+
       const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        is_adreno_gpu = true;
-      }
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
+
+      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+      opencl_version_ = device_version.substr(7, 3);
 
       VLOG(1) << "Using device: " << device_name;
       break;
@@ -171,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (is_adreno_gpu) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -350,4 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
+}
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 59d3a4cf81caf25825066d564ed0cf408fa37d0c..414fa7ed91fa205cbbb5b3d3b06d6d7c91d59fcf 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -19,6 +19,13 @@
 
 namespace mace {
 
+enum GPUType {
+  QUALCOMM_ADRENO,
+  MALI,
+  PowerVR,
+  UNKNOWN,
+};
+
 class OpenCLProfilingTimer : public Timer {
  public:
   explicit OpenCLProfilingTimer(const cl::Event *event)
@@ -50,6 +57,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
+  const bool IsNonUniformWorkgroupsSupported();
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -75,6 +84,8 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
+  GPUType gpu_type_;
+  std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
   static GPUPriorityHint gpu_priority_hint_;
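Note: the version parsing above relies on the format the OpenCL specification mandates for CL_DEVICE_VERSION, "OpenCL<space><major>.<minor><space><vendor-specific information>", so substr(7, 3) always yields the "major.minor" token. A minimal host-side sketch of that assumption (the device string is an illustrative example, not taken from the patch):

    #include <cassert>
    #include <string>

    int main() {
      // Example CL_DEVICE_VERSION value; real devices append vendor text.
      const std::string device_version = "OpenCL 2.0 Adreno(TM) 530";
      // Characters 7..9 hold "<major>.<minor>" per the OpenCL spec.
      assert(device_version.substr(7, 3) == "2.0");
      return 0;
    }

Also note that IsNonUniformWorkgroupsSupported() returns true only for Adreno devices reporting OpenCL 2.0; Mali and PowerVR GPUs take the padded uniform-workgroup path introduced below even if they report 2.0.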
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 55368c3ca83c8aa7dd9e8d76efb47bde568ec4ce..88840910a586346f4f962e594e02fa6e2e8179d0 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -155,6 +155,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   ActivationType activation_;
   T relux_max_limit_;
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::string tuning_key_prefix_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 70d9583ba798babd3a27737c9ed7487913441bf6..c8bb601620f1965b126bc39d2ef2259d26e91e68 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -90,6 +90,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 28b8d776c967e48a4af835ee55913c437aa3d3ea..5e8ae34f9a9aaef596090cc7113c440b425021cf 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -157,6 +157,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index d5372850bcf604b0f1e01e630c0c30b59e95abc0..a3d1ff61b1c0fa0be7c7eb0506c495fc1331476c 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -64,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index f1e258337a2d9a871bbb3ac4aec70faf1a18edf9..69332e0182e4623a7ffb9e1e87fd05fbd9ca5b75 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -56,6 +56,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   const int groups_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index de34ed69fa5803f61e9f6785b9d4b7185be2cccc..6c803b5f2a32544cac7d79145b691b3166faeb9b 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -85,6 +85,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 47516291d14ec21ba2202e2089bee03d6387c433..4cd05a65d527a45caf8c34486be2696511406589 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 3f6577f32159309bba931eaef58011902ecc2045..6d30673976df6399351ad50c871d2f9e578cea22 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -108,6 +108,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   const int block_size_;
   bool d2s_;
   std::vector<index_t> input_shape_;
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index 166ea18a644ead1d53af2a7c3b83c73c617554d6..90c17b19e71553424c5f68eee1cc3bc9ffa2b279 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -437,6 +437,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 0f9e9b40061890a62e36104746bcaf0120bfab0f..9c7f0a901a5968f1d0f4cf5c7af8ceeebb465f7e 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -97,6 +97,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 62590400bf038773c9f16fae68f4c42de4ee9130..b025cbfebe29efa20d65838328458eb73befb823 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -241,6 +241,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 9792cae56889275053362ed6e7d230ff744fd4ac..368a78bc9594da81e84831136fbfcfe93a4f295f 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -24,15 +24,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
+  auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
     built_options.emplace("-Dactivation=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     switch (activation_) {
       case RELU:
         tuning_key_prefix_ = "relu_opencl_kernel_";
@@ -58,10 +61,22 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
   if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
@@ -73,10 +88,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
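The local-work-size change above swaps the fixed {8, 16, 8, 1} for {8, kwg_size_ / 64, 8, 1}. Because 8 * (kwg_size_ / 64) * 8 == kwg_size_ whenever the kernel's max workgroup size is a multiple of 64, the default workgroup now exactly fills the per-kernel limit reported by GetKernelMaxWorkGroupSize(), and it reduces to the old constant when that limit is 1024. A sketch of the relationship (DefaultLws3D is a hypothetical name, not a MACE function):

    #include <cstdint>
    #include <vector>

    // Default 3-D local work size derived from the kernel's max workgroup
    // size; the fourth element is carried over from the original
    // {8, 16, 8, 1} and is presumably consumed by MACE's tuning code rather
    // than by the OpenCL launch itself.
    std::vector<uint32_t> DefaultLws3D(uint32_t kwg_size) {
      return {8, kwg_size / 64, 8, 1};  // 8 * (kwg_size / 64) * 8 == kwg_size
    }

The same substitution recurs in batch_norm, bias_add, channel_shuffle, and concat below.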
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index e7869bb2fba3959c0fc810cbeb81f44f8f6ab00b..dc2aabeb949f9c6e54b41ebc8c2eb1324f1e4694 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t width = input_tensors[0]->dim(2);
   const index_t channels = input_tensors[0]->dim(3);
 
+  auto runtime = OpenCLRuntime::Global();
+
   for (int i = 1; i < size; ++i) {
     MACE_CHECK_NOTNULL(input_tensors[i]);
     MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     if (input_tensors.size() > 4) {
       MACE_NOT_IMPLEMENTED;
     }
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
@@ -44,7 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
+
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -53,6 +61,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t width_pixels = channel_blocks * width;
   const index_t batch_height_pixels = batch * height;
 
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+
   if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
@@ -60,6 +71,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     for (auto input : input_tensors) {
       kernel_.setArg(idx++, *(input->opencl_image()));
     }
@@ -68,9 +83,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensors[0]->shape();
   }
 
-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
     << "_" << output_shape[2] << "_" << output_shape[3];
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index d9dfb8254d0bea67c0eb78c673579e5f57301fd5..09be320fc680cb0e34306762a4672cd774f1d5ea 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -30,14 +30,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = OpenCLRuntime::Global();
+
   if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     if (folded_constant_) {
       built_options.emplace("-DFOLDED_CONSTANT");
     }
@@ -61,9 +70,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
+
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
     kernel_.setArg(idx++, *(offset->opencl_image()));
@@ -78,10 +95,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 3d4c4ec5c7a64406ead61439a52d155689236240..1197a3590599b4aac96ac97ce0c8fc58915d0046 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -23,7 +23,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
   auto runtime = OpenCLRuntime::Global();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -31,25 +36,46 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(bias->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange,
+        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
   MACE_CHECK(error == CL_SUCCESS);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
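The else-branch above is the uniform-workgroup fallback: when the device lacks OpenCL 2.0 non-uniform workgroups, the global work size handed to enqueueNDRangeKernel must be a multiple of the local size, so the host pads it with RoundUp and the kernels' new #ifndef NON_UNIFORM_WORK_GROUP bounds checks make the padding work-items return immediately. RoundUp comes from MACE's utility headers; a sketch of its presumable behavior:

    #include <cstdint>

    // Presumed semantics of MACE's RoundUp: the smallest multiple of factor
    // that is greater than or equal to v.
    uint32_t RoundUp(uint32_t v, uint32_t factor) {
      return (v + factor - 1) / factor * factor;
    }
    // e.g. RoundUp(100u, 16u) == 112, so a gws dimension of 100 with lws 16
    // launches 12 extra work-items that exit early inside the kernel.

The same two-way enqueue appears again in buffer_to_image below.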
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 999c631712daeb5d3de6a84c5ce6b6fb6ab07a5f..8b570b3369aca067d53bb0286c3bf9c354f8f74a 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     buffer->Resize(image->shape());
   }
 
-  size_t gws[2] = {image_shape[0], image_shape[1]};
+  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
+                     static_cast<uint32_t>(image_shape[1])};
   std::string kernel_name;
   switch (type) {
     case CONV2D_FILTER:
@@ -58,11 +59,17 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
           : "winograd_filter_buffer_to_image";
       break;
   }
+
+  auto runtime = OpenCLRuntime::Global();
+
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
   std::set<std::string> built_options;
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+  }
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -73,11 +80,14 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
-  auto runtime = OpenCLRuntime::Global();
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name, built_options);
 
   uint32_t idx = 0;
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
   b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
     MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
@@ -103,13 +113,28 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
   }
   b2f_kernel.setArg(idx++, *(image->opencl_image()));
-  const std::vector<uint32_t> lws = {16, 64};
+
+  const uint32_t kwg_size =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
+  const std::vector<uint32_t> lws = {16, kwg_size / 16};
+
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-      cl::NDRange(lws[0], lws[1]), nullptr, &event);
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 78d855e2088c292cc15468c00a6730870a69f740..59f2c9518061a7e1e6f0e8071a18cec699814be4 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -30,20 +30,36 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
              "groups must be multiple of 4");
   const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
 
-  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
+  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } + if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, groups_); kernel_.setArg(idx++, static_cast(channels_per_group)); @@ -51,10 +67,8 @@ void ChannelShuffleFunctor::operator()( input_shape_ = input->shape(); } - const uint32_t gws[3] = {static_cast(group_channel_blocks), - static_cast(width), - static_cast(height * batch)}; - const std::vector lws = {8, 16, 8, 1}; + + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "channel_shuffle_opencl_kernel_" << output->dim(0) << "_" diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index bee0b0e35313b4129fe6741cd9575f88b60e1431..42afc7012528242475b3fc61a8a9bdfdb5623772 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,6 +1,8 @@ #include -__kernel void activation(__read_only image2d_t input, +__kernel void activation( + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 + __read_only image2d_t input, #ifdef USE_PRELU __read_only image2d_t alpha, #endif @@ -9,7 +11,16 @@ __kernel void activation(__read_only image2d_t input, const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (ch_blk >= global_size_dim0 || w >= global_size_dim1 + || hb >= global_size_dim2) { + return; + } + const int width = global_size_dim1; +#else const int width = get_global_size(1); +#endif const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 9504d12a385fcc68b749f31d7394d27d15f62cf4..d0604f9ed074c5a5d2729fd8c66751d9ab7b751b 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,6 +1,8 @@ #include -__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ +__kernel void addn( + UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 + __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #if INPUT_NUM > 2 __read_only image2d_t input2, @@ -12,6 +14,10 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ const int w = get_global_id(0); const int hb = get_global_id(1); +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || hb >= global_size_dim1) return; +#endif + DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); DATA_TYPE4 out = in0 + in1; diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 773b59c44e0021ab68a4d621514056d5327b5427..0075932dbac599780803ac7041da293dfbbc1447 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,6 +1,8 @@ #include // Supported data types: half/float 
-__kernel void batch_norm(__read_only image2d_t input,
+__kernel void batch_norm(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
 #ifndef FOLDED_CONSTANT
@@ -13,7 +15,16 @@ __kernel void batch_norm(__read_only image2d_t input,
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
   const int width = get_global_size(1);
+#endif
 
 #ifdef FOLDED_CONSTANT
   DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index f5180a3c0d58b478d81d08e65743c4af1f77c189..a2d99abcc8e21e19e0710db8f752df3a6032d56f 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -1,12 +1,23 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(__read_only image2d_t input,
+__kernel void bias_add(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                        __read_only image2d_t bias,
                        __write_only image2d_t output) {
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
   const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index ece729b0ccdc3383b452090bd286a7309d90bafd..86071708117efe6a7d4f0580d0324e2ad0701962 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+__kernel void filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
                                      __private const int filter_w,
@@ -9,6 +11,13 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int in_channel_idx = w;
   const int hw_size = filter_w * filter_h;
   const int out_channel_idx = h / hw_size * 4;
@@ -44,7 +53,9 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+__kernel void filter_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
@@ -52,6 +63,13 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
                                      __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int in_channel_idx = w;
   const int hw_size = filter_w * filter_h;
   const int out_channel_idx = h / hw_size * 4;
@@ -84,7 +102,9 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
     }
   }
 }
 
-__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+__kernel void dw_filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
                                         __private const int in_channel,
@@ -93,6 +113,12 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   DATA_TYPE4 values = 0;
   if (multiplier == 1) {
     const int in_channel_idx = h << 2;
@@ -134,7 +160,9 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
                                      __private const int width,
@@ -142,6 +170,13 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int batch_idx = h / height;
   const int height_idx = h % height;
   const int width_idx = w % width;
@@ -167,13 +202,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void in_out_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
                                      __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int batch_idx = h / height;
   const int height_idx = h % height;
   const int width_idx = w % width;
@@ -198,13 +242,21 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   }
 }
 
-__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void arg_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
                                   __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int offset = input_offset + w * 4;
   const int size = count - w * 4;
 
@@ -226,11 +278,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void arg_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
                                   __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int offset = w * 4;
 
   int2 coord = (int2)(w, h);
@@ -251,7 +312,9 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }
 
-__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+__kernel void in_out_height_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
                                             __private const int width,
@@ -259,6 +322,13 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
                                             __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int wc = width * channels;
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
@@ -285,13 +355,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
+__kernel void in_out_height_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
                                             __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
   const int height_idx = (h % height_blks) << 2;
@@ -315,7 +394,9 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
 }
 
-__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_width_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
@@ -323,6 +404,13 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                            __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int width_blks = (width + 3) / 4;
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -349,7 +437,9 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+__kernel void winograd_filter_buffer_to_image(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
                                               __private const int height,
@@ -357,7 +447,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+  const int out_channels = global_size_dim1;
+#else
   const int out_channels = get_global_size(1);
+#endif
+
   const int out_channel_idx = h;
   const int in_channel_idx = w << 2;
   const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
@@ -430,13 +529,22 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
+__kernel void winograd_filter_image_to_buffer(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
                                               __private const int channel,
                                               __read_only image2d_t input) {
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
   const int width_idx = w << 2;
   const int size = width - width_idx;
   int offset = h * width + width_idx;
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 2a193a23148b2b79e210adea5a967a84413d26e9..3fa2894e8bf60b8e7528ccd2562fc179afd9f46e 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -1,14 +1,26 @@
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(__read_only image2d_t input,
-                              __private const int groups,
-                              __private const int channels_per_group,
-                              __write_only image2d_t output) {
+__kernel void channel_shuffle(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
+    __private const int groups,
+    __private const int channels_per_group,
+    __write_only image2d_t output) {
   const int group_chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int group_blks = groups / 4;
   const int groups_blks_width = group_blks * width;
   const int channels_per_group_blks = channels_per_group / 4;
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index dd61012cebdb887639f132d63e48d9521898e443..30aad065ac12da29c3eb661bcf17069a023b16e6 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,6 +18,23 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
 
+#ifndef NON_UNIFORM_WORK_GROUP
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,       \
+    __private const int global_size_dim2,
+
+#else
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+
+#endif
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
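With these macros, kernels compiled without NON_UNIFORM_WORK_GROUP receive the true global sizes as extra leading arguments (which is why the host sides above only set those arguments when IsNonUniformWorkgroupsSupported() is false), while OpenCL 2.0 builds compile the parameters away entirely. Every call site then repeats the same two-way enqueue; the shared pattern, factored as a hypothetical helper purely for illustration (this function does not exist in MACE, and it assumes the OpenCLRuntime and RoundUp declarations from this patch):

    // Hypothetical 2-D launch helper mirroring the repeated pattern above.
    cl_int EnqueueWithWorkgroupFallback(OpenCLRuntime *runtime,
                                        cl::Kernel *kernel,
                                        const uint32_t gws[2],
                                        const std::vector<uint32_t> &lws,
                                        cl::Event *event) {
      if (runtime->IsNonUniformWorkgroupsSupported()) {
        // OpenCL 2.0 path: launch the exact global size.
        return runtime->command_queue().enqueueNDRangeKernel(
            *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
            cl::NDRange(lws[0], lws[1]), nullptr, event);
      }
      // Uniform path: pad the global size up to a multiple of the local
      // size; the kernel-side NON_UNIFORM_WORK_GROUP guard discards the
      // padding work-items.
      return runtime->command_queue().enqueueNDRangeKernel(
          *kernel, cl::NullRange,
          cl::NDRange(RoundUp(gws[0], lws[0]), RoundUp(gws[1], lws[1])),
          cl::NDRange(lws[0], lws[1]), nullptr, event);
    }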
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index af13422d7178f518163454cbdb8042848dd4611b..3b7370a8a30ba21a0c22305d1ef84e66314d7153 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,14 +22,26 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }
 
 // Supported data type: half/float
-__kernel void concat_channel(__read_only image2d_t input0,
+__kernel void concat_channel(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
                              __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int input0_chan_blk = (input0_chan + 3) >> 2;
 
   DATA_TYPE4 data = 0;
@@ -72,13 +84,25 @@ __kernel void concat_channel(__read_only image2d_t input0,
 }
 
 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(__read_only image2d_t input,
+__kernel void concat_channel_multi(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
                                    __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   DATA_TYPE4 data = 0;
   data = READ_IMAGET(input, SAMPLER,
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 42d79807ccb402b70a1d5e24a209f490cbadb77e..1383557d89d96b4a282773f16dabbae59b7b798b 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
                       __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -21,9 +23,18 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int dilation_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
   DATA_TYPE4 out0 =
       READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index b695165e1c3398ad333f2e52f307cd91e3eb4f59..e993a159e3e82fa5c110881647ffb290b75c4832 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_1x1(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -15,9 +17,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __private const int stride) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
   DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
   DATA_TYPE4 out1 = out0;
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 7f7fd367d1aa4019a2a2009b4ed61ca179e23ac7..8bc27b33569109fad1e9207c910299c6ebcaac0b 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_3x3(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __private const int dilation_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
   DATA_TYPE4 out0 =
       READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl
index 824f82665542975da3b000d2e0b1865ceabf4a3c..a52617c87367635f697fc29f7c56315b6347bf13 100644
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -1,13 +1,24 @@
 #include <common.h>
 
-__kernel void depth_to_space(__read_only image2d_t input,
+__kernel void depth_to_space(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                              __private const int block_size,
                              __private const int output_depth,
                              __write_only image2d_t output) {
   const int out_d = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_h = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
+      || out_h >= global_size_dim2) {
+    return;
+  }
+  const int output_width = global_size_dim1;
+#else
   const int output_width = get_global_size(1);
+#endif
 
   const int out_pos = mad24(out_d, output_width, out_w);
 
@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input,
   WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
 }
 
-__kernel void space_to_depth(__read_only image2d_t input,
+__kernel void space_to_depth(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                              __private const int block_size,
                              __private const int input_depth,
                              __write_only image2d_t output) {
+
   const int d = get_global_id(0);
   const int w = get_global_id(1);
   const int h = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (d >= global_size_dim0 || w >= global_size_dim1
+      || h >= global_size_dim2) {
+    return;
+  }
+  const int input_width = global_size_dim1;
+#else
   const int input_width = get_global_size(1);
+#endif
+
   const int in_pos = mad24(d, input_width, w);
   const int output_width = input_width / block_size;
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 792c0934a4f7af5774b3065ecd349300a1f18854..fff19613c9dfad3f3e4a80fed57c60e99d1ec43f 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -1,7 +1,9 @@
 #include <common.h>
 
 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
                                __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __private const short dilation_w) {
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1);
-  const short out_w_blks = get_global_size(1);
   const short out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const short out_w_blks = global_size_dim1;
+#else
+  const short out_w_blks = get_global_size(1);
+#endif
+
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk;  // multiplier = 1
@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }
 
-__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d_s1(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
                                   __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1) << 2;
   const short out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+#endif
+
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk;  // multiplier = 1
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index 735bc96e0149b5716230c092f5f3716598c53116..def21f0a993b75d321729e5c89b080555c1dcdf7 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void eltwise(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t input1,
 #ifdef COEFF_SUM
                       __private const float coeff0,
@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 out;
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index cb71f21d1c78951b8c8de3c17e252cad2394dd3d..fe260e7a22477ea958936b30378b439c8c94fb2f 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -1,7 +1,9 @@
 #include <common.h>
 
 // C = A * B
-__kernel void matmul(__read_only image2d_t A,
+__kernel void matmul(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __read_only image2d_t A,
                      __read_only image2d_t B,
                      __write_only image2d_t C,
                      __private const int M,
@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A,
                      __private const int k_blocks) {
   const int gx = get_global_id(0) << 2;
   const int hb = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
   const int batch = hb / height_blocks;
   const int ty = (hb % height_blocks);
   const int gy = mad24(batch, height_blocks, ty);
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index f2298a93264e5a3ad79ac3977d226ef7dbb3058a..ead839940e5081e6d90e841f3eda569339a2ffa1 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size,
 }
 
 // Supported data type: half/float
-__kernel void pooling(__read_only image2d_t input,
+__kernel void pooling(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                       __private const int in_height,
                       __private const int in_width,
                       __private const int out_height,
@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input,
                       __private const int stride,
                       __private const int pooling_size,
                       __write_only image2d_t output) {
+
   const int out_chan_idx = get_global_id(0);
   const int out_width_idx = get_global_id(1);
-  const int out_width = get_global_size(1);
   const int out_hb_idx = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
+      || out_hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int out_width = global_size_dim1;
+#else
+  const int out_width = get_global_size(1);
+#endif
+
   const int batch_idx = mul24((out_hb_idx / out_height), in_height);
   const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
   const int in_width_start = mul24(out_width_idx, stride) - pad_left;
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index e0b4b83dcf2e6cc4610d664408db05550a58f0de..83e6df85c5c7c1c4b1ee9facf62d73c7cd0f5a58 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,17 +1,31 @@
 #include <common.h>
 
-__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void resize_bilinear_nocache(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
                                       __private const float height_scale,
                                       __private const float width_scale,
                                       __private const int in_height,
                                       __private const int in_width,
                                       __private const int out_height) {
+
   const int ch_blk = get_global_id(0);
-  const int ch_blks = get_global_size(0);
   const int w = get_global_id(1);
-  const int out_width = get_global_size(1);
   const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int ch_blks = global_size_dim0;
+  const int out_width = global_size_dim1;
+#else
+  const int ch_blks = get_global_size(0);
+  const int out_width = get_global_size(1);
+#endif
+
   const int b = hb / out_height;
   const int h = hb % out_height;
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index d8d45bcbcfa4fd6416ab6ea417841e379082af50..eccdd882c75a809804d61599b5288a432d2d432e 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -1,12 +1,24 @@
 #include <common.h>
 
-__kernel void slice(__read_only image2d_t input,
+__kernel void slice(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                     __private const int chan_blk_offset,
                     __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
                                 (int2)(mad24(chan_blk_idx + chan_blk_offset,
                                              width, width_idx), hb_idx));
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index 6830b50801aff517f0dfeda9868c983721df65dc..628d71cbb3a22b4ca240446c7e7c889dc1fc55ab 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -1,14 +1,26 @@
 #include <common.h>
 
-__kernel void softmax(__read_only image2d_t input,
+__kernel void softmax(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
                       __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int chan_blks = global_size_dim0 - 1;
+  const int width = global_size_dim1;
+#else
   const int chan_blks = get_global_size(0) - 1;
   const int width = get_global_size(1);
+#endif
 
   int pos = width_idx;
   DATA_TYPE max_value = -FLT_MAX;
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 9ad635099ea3caa249a6b1f49b4eb206553219f3..1e2024043f97f835c03c41ecc03fea7b86617ed2 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void space_to_batch(__read_only image2d_t space_data,
+__kernel void space_to_batch(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
                              __private const int block_height,
                              __private const int block_width,
@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
+      || batch_hb_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
 
@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
 
-__kernel void batch_to_space(__read_only image2d_t batch_data,
+__kernel void batch_to_space(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+    __read_only image2d_t batch_data,
                              __write_only image2d_t space_data,
                              __private const int block_height,
                              __private const int block_width,
@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
+      || batch_hb_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index cbcd3b193a92e8e135a55014ad5e62b5545ed57e..d447001e6ccd09f44f2d60c658be778c0e1fbff9 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -1,6 +1,8 @@
 #include <common.h>
 
-__kernel void winograd_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_transform_2x2(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __read_only image2d_t input,
                                      __write_only image2d_t output,
                                      __private const int in_height,
                                      __private const int in_width,
@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                      __private const int padding_left) {
   int out_width_idx = get_global_id(0);
   int chan_blk_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
+    return;
+  }
+  const int chan_blk_size = global_size_dim1;
+#else
   const int chan_blk_size = get_global_size(1);
+#endif
 
   const int batch_idx = out_width_idx / round_hw;
   const int t_idx = out_width_idx % round_hw;
@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
   }
 }
 
-__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_inverse_transform_2x2(
+    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+    __read_only image2d_t input,
 #ifdef BIAS
                                              __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                              __private const float relux_max_limit) {
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
+    return;
+  }
+  const int out_channel = global_size_dim1;
+#else
   const int out_channel = get_global_size(1);
+#endif
+
   int width = width_idx;
   int height = height_idx;
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index da8671db72ec89ebdc93ae43f64049ea0bcd41ee..bccc8623997c7b356190ecc4818fc3394eb45d89 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel,
                     const DataType dt,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
-                    StatsFuture *future) {
+                    StatsFuture *future,
+                    uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
   const index_t channel = output->dim(3);
 
   const int channel_blk = RoundUpDiv4(channel);
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };
+
+  auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
     built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     if (input0->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
DtToCLCMDDt(dt)); @@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel, built_options.emplace("-DDIVISIBLE_FOUR"); } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } if (!IsVecEqual(*prev_input_shape, input0->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(static_cast(input0->opencl_image()))); kernel->setArg(idx++, @@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(input0->dim(3))); kernel->setArg(idx++, *(static_cast(output->opencl_image()))); + *prev_input_shape = input0->shape(); } - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(width), - static_cast(batch * height), - }; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); @@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); const index_t channel = output->dim(3); + auto runtime = OpenCLRuntime::Global(); + if (kernel->get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } const int inputs_count = input_list.size(); index_t chan_blk_offset = 0; for (int i = 0; i < inputs_count; ++i) { const Tensor *input = input_list[i]; + index_t input_channel_blk = input->dim(3) / 4; + const uint32_t gws[3] = { + static_cast(input_channel_blk), static_cast(width), + static_cast(batch * height), + }; + uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, static_cast(chan_blk_offset)); kernel->setArg(idx++, *(output->opencl_image())); - index_t input_channel_blk = input->dim(3) / 4; chan_blk_offset += input_channel_blk; - const uint32_t gws[3] = { - static_cast(input_channel_blk), static_cast(width), - static_cast(batch * height), - }; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" << batch * height; @@ -145,11 +172,12 @@ void ConcatFunctor::operator()( switch (inputs_count) { case 2: Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, - &input_shape_, output, future); + &input_shape_, output, future, &kwg_size_); break; default: if (divisible_four) { - ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, 
future); + ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future, + &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 468d80f09c60bd9584225d2c263766cef6c790e5..5a002666320dfbfbea8263d1693bae53231da952 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + uint32_t *kwg_size); extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, @@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + uint32_t *kwg_size); extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + uint32_t *kwg_size); template void Conv2dFunctor::operator()(const Tensor *input, @@ -61,7 +64,8 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future); + std::vector *input_shape, Tensor *output, StatsFuture *future, + uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; @@ -101,11 +105,13 @@ void Conv2dFunctor::operator()(const Tensor *input, auto conv2d_func = selector[kernel_h - 1]; conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &kwg_size_); } else { Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &kwg_size_); } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 62f8b09acc3458784cb3506f31dbbbdad51ef7ae..5cdf8e5608386ea7547f20d239e5126d69d50efc 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); + auto runtime = OpenCLRuntime::Global(); + if (kernel->get() == nullptr) { MACE_CHECK(input_batch == batch); @@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if 
(runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - auto runtime = OpenCLRuntime::Global(); *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - const std::vector lws = {8, 15, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index ba047cdfad9e6280020d98d92170ea3c8820aa9d..d0f587452579a7dfe6dbd64d29fa02ab9bf73297 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv(width); + auto runtime = OpenCLRuntime::Global(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); switch (activation) { case NOOP: @@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - auto runtime = OpenCLRuntime::Global(); *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - const std::vector lws = {4, 15, 8, 1}; + const std::vector lws = {4, *kwg_size / 32, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index fd48605f2cfee1827a559af03a799120b9561e52..b9b2fec56a13dc55ee97649098f0674425ddcd4e 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv4(width); + auto runtime = OpenCLRuntime::Global(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); switch (activation) { case NOOP: @@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - auto runtime = OpenCLRuntime::Global(); *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index c39c1a342c837e7aef4e9b5da03e401b012fc5e2..8fc0924704badcf1f37d9a55b8c0188e65b295de 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); output->ResizeImage(output_shape, image_shape); + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("depth_to_space", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } + + uint32_t gws[3]; + std::stringstream ss; if (!IsVecEqual(input_shape_, input->shape())) { + if (d2s_) { + gws[0] = static_cast(depth_blocks); + gws[1] = static_cast(output_width); + gws[2] = static_cast(output_height * batch); + ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_" + << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); + } else { + gws[0] = static_cast(depth_blocks); + gws[1] = static_cast(input_width); + gws[2] = static_cast(input_height * batch); + ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" + << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); + } + uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, block_size_); kernel_.setArg(idx++, depth_blocks); kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); } - if (d2s_) { - const 
uint32_t gws[3] = {static_cast(depth_blocks), - static_cast(output_width), - static_cast(output_height * batch)}; - const std::vector lws = {8, 16, 8, 1}; - std::stringstream ss; - ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_" - << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); - } else { - const uint32_t gws[3] = {static_cast(depth_blocks), - static_cast(input_width), - static_cast(input_height * batch)}; - const std::vector lws = {8, 16, 8, 1}; - std::stringstream ss; - ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" - << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); - } + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } template struct DepthToSpaceOpFunctor; diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index ecb109d1fbc456f8e9cefebcc6d29c35604770c1..5e0b99ba1e5a00e25eec931f209c0a28c8ed85b2 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel, const index_t channel_blocks = RoundUpDiv4(channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv4(width); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + auto runtime = OpenCLRuntime::Global(); + if (kernel->get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { @@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } if (!IsVecEqual(*prev_input_shape, input->shape())) { const index_t input_batch = input->dim(0); @@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel, input_channels); uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel->setArg(idx++, gws[0]); + kernel->setArg(idx++, gws[1]); + kernel->setArg(idx++, gws[2]); + } kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel, kernel->setArg(idx++, static_cast(dilations[0])); kernel->setArg(idx++, static_cast(dilations[1])); } + *prev_input_shape = input->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); @@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor::operator()( DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &kwg_size_); } template struct DepthwiseConv2dFunctor; diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index 548d907de08ba8d25c884a5098f4da8b82db70ee..c23534bbce5ca423314d23bd470a5cbc2289ae1e 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -24,8 +24,12 @@ void EltwiseFunctor::operator()(const Tensor *input0, const index_t width_pixels = channel_blocks * width; const index_t batch_height_pixels = batch * height; + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; + + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); @@ -33,11 +37,21 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image())); if (!coeff_.empty()) { @@ -45,12 +59,11 @@ void EltwiseFunctor::operator()(const Tensor *input0, kernel_.setArg(idx++, coeff_[1]); } kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input0->shape(); } - const uint32_t gws[2] = 
{static_cast(width_pixels), - static_cast(batch_height_pixels)}; - const std::vector lws = {64, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index e7dfb641cb5eec81ec6f83971645be8ea3dc33bb..2ab8cde257de73b8dee80300afa89122dd3cf126 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -200,6 +200,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); + auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -226,12 +227,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, {4, kwg_size / 28, 7, 1}, {4, kwg_size / 32, 8, 1}, {4, kwg_size / 56, 14, 1}, - {3, 15, 9, 1}, - {7, 15, 9, 1}, - {9, 7, 15, 1}, - {15, 7, 9, 1}, {1, kwg_size, 1, 1}, - {4, 15, 8, 1}, }; }; cl::Event event; @@ -240,6 +236,13 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; + std::vector roundup_gws(3); + if (!runtime->IsNonUniformWorkgroupsSupported()) { + for (size_t i = 0; i < 3; ++i) { + roundup_gws[i] = RoundUp(gws[i], params[i]); + } + } + if (timer == nullptr) { uint32_t num_blocks = params[3]; const uint32_t block_size = gws[2] / num_blocks; @@ -247,17 +250,32 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(gws[0], gws[1], gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + uint32_t roundup_gws2 = RoundUp(gws2, params[2]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; } } else { timer->ClearTiming(); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -273,10 +291,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == 
num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(gws[0], gws[1], gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + uint32_t roundup_gws2 = RoundUp(gws2, params[2]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); } @@ -304,6 +330,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); + auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -318,7 +345,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, {kwg_size / 64, 64, 1}, {kwg_size / 128, 128, 1}, {kwg_size / 256, 256, 1}, - {kwg_size / 512, 512, 1}, {kwg_size, 1, 1}, {1, kwg_size, 1}}; }; @@ -328,6 +354,13 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; + std::vector roundup_gws(2); + if (!runtime->IsNonUniformWorkgroupsSupported()) { + for (size_t i = 0; i < 2; ++i) { + roundup_gws[i] = RoundUp(gws[i], params[i]); + } + } + if (timer == nullptr) { uint32_t num_blocks = params[2]; const uint32_t block_size = gws[1] / num_blocks; @@ -335,16 +368,30 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? 
(gws[1] - (i * block_size)) : block_size; - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } else { + uint32_t roundup_gws1 = RoundUp(gws1, params[1]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; } } else { timer->ClearTiming(); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + } else { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -360,9 +407,18 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), + cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), + nullptr, &event); + } else { + uint32_t roundup_gws1 = RoundUp(gws1, params[1]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); } diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index c5bd2b0ba3f789f28992a49e10ffa7b4a357a8c5..7a4822096309297d8c11e80c1f7eb6ea5069b48f 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -26,18 +26,33 @@ void MatMulFunctor::operator()(const Tensor *A, const index_t height_blocks = RoundUpDiv4(height); const index_t width_blocks = RoundUpDiv4(width); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * batch), + }; + + auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = 
runtime->BuildKernel("matmul", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(B->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image())); @@ -47,11 +62,7 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(height_blocks)); kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); - const uint32_t gws[2] = { - static_cast(width_blocks), - static_cast(height_blocks * batch), - }; - const std::vector lws = {16, 64, 1}; + const std::vector lws = {kwg_size_ / 64, 64, 1}; std::stringstream ss; ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 5b52a0934facd4b4f14affb9bafb819d258fa444..774fd5ee2ac60875491deb00f1ce4fcbef8ba97e 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -18,12 +18,14 @@ void PoolingFunctor::operator()(const Tensor *input, MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); built_options.emplace("-Dpooling=" + kernel_name); + if (pooling_type_ == MAX && input->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); @@ -35,8 +37,16 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } + + std::vector gws; if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape(4); std::vector filter_shape = {kernels_[0], kernels_[1], @@ -59,7 +69,24 @@ void PoolingFunctor::operator()(const Tensor *input, &output_image_shape); output->ResizeImage(output_shape, output_image_shape); + index_t batch = output->dim(0); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); + + index_t channel_blocks = (channels + 3) / 4; + + gws = { + static_cast(channel_blocks), static_cast(out_width), + static_cast(batch * out_height), + }; + uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(input->dim(1))); kernel_.setArg(idx++, static_cast(input->dim(2))); @@ -71,25 +98,25 @@ void PoolingFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); - } - - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); + } else { + index_t batch = output->dim(0); + index_t out_height = 
output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); - index_t channel_blocks = (channels + 3) / 4; + index_t channel_blocks = (channels + 3) / 4; + gws = { + static_cast(channel_blocks), static_cast(out_width), + static_cast(batch * out_height), + }; + } - const uint32_t gws[3] = { - static_cast(channel_blocks), static_cast(out_width), - static_cast(batch * out_height), - }; - std::vector lws = {8, 16, 8, 1}; + std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); } template struct PoolingFunctor; diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 373709168f190a6122d29bbaee457a2b356b4833..ac2733e9e511c8a32dee3371c9ed404be964cb90 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -24,16 +24,27 @@ void ResizeBilinearFunctor::operator()( const index_t out_height = out_height_; const index_t out_width = out_width_; + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { MACE_CHECK(out_height > 0 && out_width > 0); @@ -50,6 +61,11 @@ void ResizeBilinearFunctor::operator()( CalculateResizeScale(in_width, out_width, align_corners_); uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, height_scale); @@ -61,10 +77,7 @@ void ResizeBilinearFunctor::operator()( input_shape_ = input->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 6bc9ae3bf57d8c4f3df9ea41cad9bf5f283ce01a..850f08f6df5b091750d0b6ac203bf3e72e00099c 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -29,15 +29,22 @@ void SliceFunctor::operator()( output_list[i]->ResizeImage(output_shape, image_shape); } + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set 
built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } const index_t channel_blk = RoundUpDiv4(output_channels); @@ -46,7 +53,8 @@ void SliceFunctor::operator()( static_cast(input->dim(2)), static_cast(input->dim(0) * input->dim(1)), }; - const std::vector lws = {8, 16, 8, 1}; + + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "slice_opencl_kernel_" << input->dim(0) << "_" @@ -56,6 +64,11 @@ void SliceFunctor::operator()( << outputs_count; for (int i = 0; i < outputs_count; ++i) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(channel_blk * i)); kernel_.setArg(idx++, *(output_list[i]->opencl_image())); diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 077db9ddc1ecf2d72f71511349945ea53fe0eb73..ea4f0b3e13d2d023a9dc98691ce47cb269f97714 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -23,29 +23,43 @@ void SoftmaxFunctor::operator()(const Tensor *logits, const index_t channel_blocks = RoundUpDiv4(channels); const int remain_channels = channel_blocks * 4 - channels; - if (kernel_.get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, logits->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } kernel_.setArg(idx++, *(logits->opencl_image())); kernel_.setArg(idx++, static_cast(channels)); kernel_.setArg(idx++, remain_channels); kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = logits->shape(); } - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - const std::vector lws = {8, 16, 8, 1}; + + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 
fe911fbddb49687c74edf1e29f0276c86a249ccc..31b5013b737335c40255d9d4163e1d2fb8572d68 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -31,9 +31,15 @@ void SpaceToBatchFunctor::operator()( batch_tensor->ResizeImage(output_shape, output_image_shape); kernel_name = "space_to_batch"; } + const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - auto runtime = OpenCLRuntime::Global(); std::set built_options; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -41,11 +47,22 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(space_shape_, space_tensor->shape())) { uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + kernel_.setArg(idx++, gws[2]); + } if (b2s_) { kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, *(space_tensor->opencl_image())); @@ -65,11 +82,7 @@ void SpaceToBatchFunctor::operator()( space_shape_ = space_tensor->shape(); } - const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - const std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 3b86640866a307ba97d7b0f064a1df099c021be4..f4fd5525960706e31ae3a303d00c50e534bfeaec 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -15,6 +15,8 @@ template void WinogradTransformFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); @@ -24,9 +26,14 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - auto runtime = OpenCLRuntime::Global(); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } std::vector output_shape(4); std::vector filter_shape = {3, 3, input_tensor->dim(3), 1}; @@ -44,6 +51,9 @@ void WinogradTransformFunctor::operator()( const index_t round_h = (output_shape[1] + 1) / 2; const index_t 
round_w = (output_shape[2] + 1) / 2; const index_t out_width = input_tensor->dim(0) * round_h * round_w; + const uint32_t gws[2] = { + static_cast(out_width), + static_cast(RoundUpDiv4(input_tensor->dim(3)))}; if (!IsVecEqual(input_shape_, input_tensor->shape())) { output_shape = {16, input_tensor->dim(3), out_width, 1}; @@ -52,6 +62,10 @@ void WinogradTransformFunctor::operator()( output_tensor->ResizeImage(output_shape, image_shape); uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg(idx++, *(input_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); @@ -65,10 +79,7 @@ void WinogradTransformFunctor::operator()( input_shape_ = input_tensor->shape(); } - const uint32_t gws[2] = { - static_cast(out_width), - static_cast(RoundUpDiv4(input_tensor->dim(3)))}; - const std::vector lws = {128, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" @@ -82,6 +93,9 @@ void WinogradInverseTransformFunctor::operator()( const Tensor *bias, Tensor *output_tensor, StatsFuture *future) { + + auto runtime = OpenCLRuntime::Global(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); @@ -92,6 +106,9 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsNonUniformWorkgroupsSupported()) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); switch (activation_) { case NOOP: @@ -115,10 +132,16 @@ void WinogradInverseTransformFunctor::operator()( LOG(FATAL) << "Unknown activation type: " << activation_; } - auto runtime = OpenCLRuntime::Global(); kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } + + const uint32_t gws[2] = { + static_cast(input_tensor->dim(2)), + static_cast(RoundUpDiv4(input_tensor->dim(1)))}; if (!IsVecEqual(input_shape_, input_tensor->shape())) { std::vector output_shape = {batch_, height_, width_, input_tensor->dim(1)}; @@ -129,6 +152,10 @@ void WinogradInverseTransformFunctor::operator()( const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2; uint32_t idx = 0; + if (!runtime->IsNonUniformWorkgroupsSupported()) { + kernel_.setArg(idx++, gws[0]); + kernel_.setArg(idx++, gws[1]); + } kernel_.setArg( idx++, *(static_cast(input_tensor->opencl_image()))); @@ -147,10 +174,7 @@ void WinogradInverseTransformFunctor::operator()( input_shape_ = input_tensor->shape(); } - const uint32_t gws[2] = { - static_cast(input_tensor->dim(2)), - static_cast(RoundUpDiv4(input_tensor->dim(1)))}; - const std::vector lws = {128, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 15cc691e71927300bec48224a7666f1468eb74c1..ac5c7987fbeda1e76c7e5a13f4a5166c3a2f6c07 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -185,6 +185,7 @@ struct PoolingFunctor : PoolingFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector input_shape_; }; diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 65e5121211d4d836d6d17809a843e0778defaecb..4e0c5ae3cb04def05794faab46cfc5dc90727e3c 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -173,6 +173,7 @@ struct ResizeBilinearFunctor void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector input_shape_; }; diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 59d9d667b0a63da1e1d3ee471aecec9efd9be1e9..1bde41e539b0aa9bd8b458261913a9957da2ec0a 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -61,6 +61,7 @@ struct SliceFunctor { const std::vector &output_list, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index a1c4ea2f6e5b9200f17d54906316a83cbefaa49a..e2b8efc8d8673f26b01a2124dcc3fb64730ad0d5 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -61,6 +61,7 @@ struct SoftmaxFunctor { void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector input_shape_; }; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 757f784820f90fee842fc385606db4755cb52293..4e0d4c591e9b6cfe8544c1bfc4d98936c921dee1 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -56,6 +56,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; std::vector space_shape_; }; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 
6f483dacb06f920c54b14930dba3fd05ff845e44..e3169541dd197764d284d35c7eabbb61a2ba38b6 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -51,6 +51,7 @@ struct WinogradTransformFunctor
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
@@ -108,6 +109,7 @@ struct WinogradInverseTransformFunctor
                   StatsFuture *future);
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh
index 669918d28247a654a28d7792e24c218c6fd1660e..4606fde6ca4a2299200266873b831a7113134a27 100644
--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -43,6 +43,10 @@ else
     HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
   fi
+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
+
   bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -54,6 +58,7 @@
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
     --define openmp=true \
     --copt="-O3" \
+    $NEON_ENABLE_FLAG \
     $PRODUCTION_MODE_BUILD_FLAGS \
     $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index 4f2b209a700439fffd6f466551c0dffceb555805..2e0ea3fa386c0390202cc2cfe0449c5a58c52637 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -376,7 +376,8 @@ def main(unused_args):
       build_run_throughput_test(target_soc, FLAGS.run_seconds,
                                 merged_lib_file, FLAGS.output_dir)
-  packaging_lib_file(FLAGS.output_dir)
+  if FLAGS.mode == "build" or FLAGS.mode == "all":
+    packaging_lib_file(FLAGS.output_dir)

 if __name__ == "__main__":
diff --git a/tools/packaging_lib.sh b/tools/packaging_lib.sh
index 607514875a022b7a1f18117c15d4efe248b7c349..c6158cd5f9c954211d858526a4e560d9ceab8a0b 100644
--- a/tools/packaging_lib.sh
+++ b/tools/packaging_lib.sh
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
 LIBMACE_BUILD_DIR=$1
+TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
+
 pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
-ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
+if [ -f $TAR_PACKAGE_NAME ]; then
+  rm -f $TAR_PACKAGE_NAME
+fi
+ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
 popd
 echo "Packaging done!"
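
Notes on the recurring patterns in this change follow; none of the snippets below are part of the patch, they are sketches meant to make the conventions explicit.

The kernel-side convention: every .cl kernel above gains a UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 or _DIM_3 macro in its signature and an early-out guard that is compiled in only when NON_UNIFORM_WORK_GROUP is undefined. The macro itself lives in a shared kernel header that this diff does not touch; assuming it simply injects one scalar per global dimension, it would expand roughly as:

    /* Sketch of the assumed macro definition (the real one is in the shared
     * kernel header, which this diff does not show). NON_UNIFORM_WORK_GROUP
     * is defined by the host via build options when the device can launch
     * non-uniform work-groups. */
    #ifdef NON_UNIFORM_WORK_GROUP
    #define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
    #else
    #define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
      __private const int global_size_dim0,         \
      __private const int global_size_dim1,         \
      __private const int global_size_dim2,
    #endif

Passing the true sizes as arguments matters beyond the guard: on the uniform path get_global_size() returns the rounded-up launch size, which is why softmax.cl reads chan_blks and width from global_size_dim0 and global_size_dim1 rather than from get_global_size().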
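
A minimal kernel following the same convention end to end would look like the toy below (fill_zero is illustrative, not a kernel from this patch; the include and the WRITE_IMAGET/DATA_TYPE4 macros are the shared-header plumbing the real kernels also rely on):

    #include <common.h>

    __kernel void fill_zero(UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                            __write_only image2d_t output) {
      const int w_idx = get_global_id(0);
      const int h_idx = get_global_id(1);

    #ifndef NON_UNIFORM_WORK_GROUP
      /* Uniform path: the launch was rounded up to a multiple of the local
       * size, so clip the padding work-items here. */
      if (w_idx >= global_size_dim0 || h_idx >= global_size_dim1) {
        return;
      }
    #endif

      WRITE_IMAGET(output, (int2)(w_idx, h_idx), (DATA_TYPE4)(0));
    }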
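
Host side, every functor received the same four steps: define NON_UNIFORM_WORK_GROUP when the runtime supports non-uniform work-groups, cache GetKernelMaxWorkGroupSize() into the new kwg_size_ member, prepend the three true global sizes as kernel arguments on the uniform path, and derive lws from kwg_size_. Condensed into one hypothetical sketch (LaunchSketch and its argument list are illustrative, repo headers omitted; the runtime calls are the ones used above):

    // Condensed shape of the host-side changes repeated across the .cc files.
    void LaunchSketch(cl::Kernel *kernel, uint32_t *kwg_size,
                      const Tensor *input, Tensor *output,
                      StatsFuture *future) {
      auto runtime = OpenCLRuntime::Global();

      // Global sizes are computed before the shape-cache check so that a
      // cached launch still has valid values.
      const uint32_t gws[3] = {/* channel blocks, width, batch * height */};

      if (kernel->get() == nullptr) {
        std::set<std::string> built_options;
        if (runtime->IsNonUniformWorkgroupsSupported()) {
          built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
        }
        *kernel = runtime->BuildKernel("...", "...", built_options);
        // Cached once per kernel; replaces the old hard-coded local sizes.
        *kwg_size = static_cast<uint32_t>(
            runtime->GetKernelMaxWorkGroupSize(*kernel));
      }

      uint32_t idx = 0;
      if (!runtime->IsNonUniformWorkgroupsSupported()) {
        // On the uniform path the true global sizes become the first three
        // kernel arguments, matching the macro in the .cl signature.
        kernel->setArg(idx++, gws[0]);
        kernel->setArg(idx++, gws[1]);
        kernel->setArg(idx++, gws[2]);
      }
      // ... tensor arguments, then:
      const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
      // TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
    }

Computing gws before the shape-cache check, as most of the rewritten functors do, keeps the launch sizes valid on cache-hit invocations as well.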
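
When non-uniform work-groups are unavailable, TuningOrRun2DKernel and TuningOrRun3DKernel round each enqueued global size up to a multiple of the tuned local size; the kernel-side guard then discards the padding work-items. RoundUp() is not defined in this diff, but its usage implies the usual semantics:

    #include <cassert>
    #include <cstdint>

    // Assumed semantics of the RoundUp() used in helper.cc: the smallest
    // multiple of `factor` that is >= i.
    uint32_t RoundUp(uint32_t i, uint32_t factor) {
      return (i + factor - 1) / factor * factor;
    }

    int main() {
      // gws = {63, 31, 37} with lws = {8, 16, 8} launches as {64, 32, 40};
      // the extra work-items return at the top of the kernel.
      assert(RoundUp(63, 8) == 64);
      assert(RoundUp(31, 16) == 32);
      assert(RoundUp(37, 8) == 40);
      // The tuner also splits the last dimension into params[3] blocks when
      // no timer is attached; only the final (possibly short) block needs
      // its own rounding, which is what the roundup_gws2/roundup_gws1
      // locals above handle.
      return 0;
    }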
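
Finally, the local-size changes: hard-coded defaults such as {8, 16, 8, 1} assume the device accepts 8 * 16 * 8 = 1024 work-items per group, which is exactly what fails on parts with smaller limits. Deriving the middle dimension from the cached per-kernel maximum keeps the product within the reported limit:

    #include <cstdint>
    #include <vector>

    int main() {
      // kwg_size_ as returned by GetKernelMaxWorkGroupSize for this kernel.
      uint32_t kwg_size = 256;

      // Pattern used by most 3D launches above: 8 * (kwg_size / 64) * 8
      // == kwg_size work-items; the 4th element is the tuner's block count
      // (params[3]), not a work-group dimension.
      std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};  // {8, 4, 8, 1}

      // Caveat (an observation, not something the patch guards against): a
      // device reporting kwg_size < 64 would make lws[1] zero, so such
      // values would need a floor before enqueue.
      return 0;
    }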