diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 798409abf683c489104a96a80bf9938b906e3d4f..02fa29a0c64ab3faa2ced1f6a9ada0d9ff0fc44d 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -142,7 +142,6 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, } bool gpu_detected = false; - bool is_adreno_gpu = false; device_ = std::make_shared(); for (auto device : all_devices) { if (device.getInfo() == CL_DEVICE_TYPE_GPU) { @@ -150,10 +149,18 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, gpu_detected = true; const std::string device_name = device.getInfo(); constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)"; + constexpr const char *kMaliGPUStr = "Mali"; if (device_name == kQualcommAdrenoGPUStr) { - is_adreno_gpu = true; + gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO; + } else if (device_name.find(kMaliGPUStr) != std::string::npos) { + gpu_type_ = GPU_TYPE::MALI; + } else { + gpu_type_ = GPU_TYPE::UNKNOWN; } + const std::string device_version = device.getInfo(); + opencl_version_ = device_version.substr(7, 3); + VLOG(1) << "Using device: " << device_name; break; } @@ -171,7 +178,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, } cl_int err; - if (is_adreno_gpu) { + if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) { std::vector context_properties; context_properties.reserve(5); GetAdrenoContextProperties(&context_properties, gpu_perf_hint, @@ -350,4 +357,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { return size; } +const GPU_TYPE OpenCLRuntime::GetGPUType() const { + return gpu_type_; +} + +const std::string &OpenCLRuntime::GetOpenclVersion() { + return opencl_version_; +} + } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index ce375b9aed7bafe47ce7cd97310a76e496008544..1b257e6bdbb06bbe7fccc1e9646ee674228f7e23 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -18,6 +18,12 @@ namespace mace { +enum GPU_TYPE { + QUALCOMM_ADRENO, + MALI, + UNKNOWN, +}; + class OpenCLProfilingTimer : public Timer { public: explicit OpenCLProfilingTimer(const cl::Event *event) @@ -49,6 +55,8 @@ class OpenCLRuntime { uint64_t GetDeviceMaxWorkGroupSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); + const GPU_TYPE GetGPUType() const; + const std::string &GetOpenclVersion(); cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); @@ -74,6 +82,8 @@ class OpenCLRuntime { std::map built_program_map_; std::mutex program_build_mutex_; std::string kernel_path_; + GPU_TYPE gpu_type_; + std::string opencl_version_; static GPUPerfHint gpu_perf_hint_; static GPUPriorityHint gpu_priority_hint_; diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index dfe703dde3bf9610f39c855897b7f38742ec2cb6..d7b89336d196ac701572a76ac23b3eedba4c46a8 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -26,14 +26,18 @@ void ActivationFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - if (kernel_.get() == nullptr) { + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } switch (activation_) { case RELU: tuning_key_prefix_ = "relu_opencl_kernel_"; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 94538fc246427f953c2d12a4fe110be36837a0df..37e6062a989f47baf8e613a9e1847c233d3061dc 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -26,6 +26,8 @@ void AddNFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + for (int i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK(batch == input_tensors[i]->dim(0)); @@ -45,6 +47,10 @@ void AddNFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } + kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); } diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index d79b5c18f0a107eabe63563f347a349bc5544449..10b956de57e9715de1008940ead6a48d60a362f8 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -36,6 +36,8 @@ void BatchNormFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -43,6 +45,9 @@ void BatchNormFunctor::operator()(const Tensor *input, built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (folded_constant_) { built_options.emplace("-DFOLDED_CONSTANT"); } diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index 6932799523da6876a0597cd27ced9e0f97d53d24..ce0e596558ed76ee36dd84e40082aec718555948 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -28,6 +28,9 @@ void BiasAddFunctor::operator()(const Tensor *input, static_cast(height * batch)}; auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -35,6 +38,9 @@ void BiasAddFunctor::operator()(const Tensor *input, built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); } if (!IsVecEqual(input_shape_, input->shape())) { @@ -52,15 +58,22 @@ void BiasAddFunctor::operator()(const Tensor *input, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {8, kwg_size / 64, 8}; - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + cl_int error; + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 9fee7a957662d853b748d68d4f40dd9bb51671e3..0cec970aa48989d4f263c999d1f7da3ad83c7201 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -59,11 +59,19 @@ void BufferToImageFunctor::operator()( : "winograd_filter_buffer_to_image"; break; } + + auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + @@ -74,7 +82,6 @@ void BufferToImageFunctor::operator()( built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); } - auto runtime = OpenCLRuntime::Global(); auto b2f_kernel = runtime->BuildKernel("buffer_to_image", obfuscated_kernel_name, built_options); @@ -105,17 +112,24 @@ void BufferToImageFunctor::operator()( const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); const std::vector lws = {16, kwg_size / 16}; - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + cl_int error; + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 34bc57848fd09d98b4fd6a43dddadf7cc87ab9b4..9d566477eccd4b0349b2a27d3233a1d39518f030 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -36,6 +36,8 @@ void ChannelShuffleFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); @@ -43,6 +45,9 @@ void ChannelShuffleFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options); } diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 23e6d60e1658694d4ae46706452989a6c827f3e0..a02b0e3570e7e27a96d6afab00843e3a6b648060 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -5,19 +5,28 @@ __kernel void activation(__read_only image2d_t input, __read_only image2d_t alpha, #endif __private const float relux_max_limit, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 4279fc23694a9e2ce36993a00c05cf7179d580ff..23e47e50157a8fe2ae1d8f12d3d16e5d7284ed6f 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -8,12 +8,20 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ #if INPUT_NUM > 3 __read_only image2d_t input3, #endif +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + const int w = get_global_id(0); const int hb = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || hb >= global_size_dim1) return; +#endif DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 5899fb006130c5347ce27a180d8b75022207dca1..d36c1e8bc054afeca6c0bda30740c54040bd906b 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -9,19 +9,28 @@ __kernel void batch_norm(__read_only image2d_t input, __private const float epsilon, #endif __write_only image2d_t output, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const float relux_max_limit, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const float relux_max_limit) { +#endif + const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif #ifdef FOLDED_CONSTANT DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index d139652bb2833145b3ab17d64584d01a7b5ca45a..594528ce30ffb1164f1e02fc22ee12e708206ccd 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -2,19 +2,27 @@ // Supported data types: half/float __kernel void bias_add(__read_only image2d_t input, __read_only image2d_t bias, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index faf1f091f2bddcbddd2e2c862c3113f71525f554..8e2f7184d63a3bed64d47aaaf66cc3b01b62943d 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -5,14 +5,22 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o __private const int filter_w, __private const int out_channel, __private const int in_channel, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int out_channel_idx = h * 4; const int rounded_in_channel = ((in_channel + 3) / 4) * 4; @@ -51,14 +59,22 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic __private const int filter_w, __private const int out_channel, __private const int in_channel, +#ifndef USE_QUALCOMM_OPENCL_2_0 __read_only image2d_t input, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __read_only image2d_t input) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int out_channel_idx = h * 4; const int rounded_in_channel = ((in_channel + 3) / 4) * 4; @@ -96,14 +112,22 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w __private const int filter_w, __private const int in_channel, __private const int multiplier, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */ +#else + __write_only image2d_t output) { +#endif + const int w = get_global_id(0); const int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif DATA_TYPE4 values = 0; if (multiplier == 1) { @@ -151,14 +175,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __private const int height, __private const int width, __private const int channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int batch_idx = h / height; const int height_idx = h % height; @@ -189,14 +221,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ __private const int height, __private const int width, __private const int channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __read_only image2d_t input, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __read_only image2d_t input) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int batch_idx = h / height; const int height_idx = h % height; @@ -225,14 +265,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int count, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int offset = input_offset + w * 4; const int size = count - w * 4; @@ -257,14 +305,23 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ __private const int count, +#ifndef USE_QUALCOMM_OPENCL_2_0 __read_only image2d_t input, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __read_only image2d_t input) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif + const int offset = w * 4; int2 coord = (int2)(w, h); @@ -290,14 +347,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n __private const int height, __private const int width, __private const int channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int wc = width * channels; const int height_blks = (height + 3) / 4; @@ -329,14 +394,22 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc __private const int height, __private const int width, __private const int channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __read_only image2d_t input, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __read_only image2d_t input) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int height_blks = (height + 3) / 4; const int batch_idx = h / height_blks; @@ -366,14 +439,22 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n __private const int height, __private const int width, __private const int channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int width_blks = (width + 3) / 4; const int batch_idx = h / height; @@ -406,16 +487,26 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / __private const int in_channels, __private const int height, __private const int width, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + int w = get_global_id(0); int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; +#else + const int out_channels = get_global_size(1); +#endif + const int out_channel_idx = h; const int in_channel_idx = w << 2; const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; @@ -492,14 +583,22 @@ __kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, __private const int height, __private const int width, __private const int channel, +#ifndef USE_QUALCOMM_OPENCL_2_0 __read_only image2d_t input, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __read_only image2d_t input) { +#endif + const int w = get_global_id(0); const int h = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || h >= global_size_dim1) { return; } +#endif const int width_idx = w << 2; const int size = width - width_idx; diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index 6437ee7fb5266563c4b95e9bf4946c1fd1c3ec78..87159784467f4e118f5aced5640d6185ff50f14f 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -4,19 +4,29 @@ __kernel void channel_shuffle(__read_only image2d_t input, __private const int groups, __private const int channels_per_group, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, - __private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2) { + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int group_chan_blk_idx = get_global_id(0); const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif + const int group_blks = groups / 4; const int groups_blks_width = group_blks * width; const int channels_per_group_blks = channels_per_group / 4; diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index ac74f0f22c7e5add65957e9d3fb54a71f33cb161..c8bfebaa1765afed605ee6561fa96fddb24f4bd8 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -25,19 +25,29 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, __kernel void concat_channel(__read_only image2d_t input0, __read_only image2d_t input1, __private const int input0_chan, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int chan_blk_idx = get_global_id(0); const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif + const int input0_chan_blk = (input0_chan + 3) >> 2; DATA_TYPE4 data = 0; @@ -82,19 +92,29 @@ __kernel void concat_channel(__read_only image2d_t input0, // Required: All input channels are divisible by 4 __kernel void concat_channel_multi(__read_only image2d_t input, __private const int chan_blk_offset, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int chan_blk_idx = get_global_id(0); const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif + DATA_TYPE4 data = 0; data = READ_IMAGET(input, SAMPLER, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index 75be47f1eb68bd8f8a5f433d37dd554509852aa8..f85bf1080dae99432d1786fd9828dcf32c3b6d37 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -18,20 +18,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __private const int padding_top, __private const int padding_left, __private const int dilation_h, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int dilation_w, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int dilation_w) { +#endif + const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; +#else + const int out_w_blks = get_global_size(1); +#endif + const int rounded_in_ch = in_ch_blks << 2; #ifdef BIAS diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index a9e4f95f8a5ad30b4b6589e3df21314182e3ac47..70d888670b26f536c451d2852f2b1652f8661a25 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -12,20 +12,28 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __private const int in_ch_blks, __private const int height, __private const int width, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int stride, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int stride) { +#endif + const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; +#else + const int out_w_blks = get_global_size(1); +#endif #ifdef BIAS DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index b2d8eaa474a6d6b9d665cd855af838d02016352f..8f58255ab8100c6597bdb8bb701adbc8406e0537 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -16,20 +16,29 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __private const int padding_top, __private const int padding_left, __private const int dilation_h, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int dilation_w, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int dilation_w) { +#endif + const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); const int out_hb = get_global_id(2); +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; +#else + const int out_w_blks = get_global_size(1); +#endif + const int rounded_in_ch = in_ch_blks << 2; #ifdef BIAS diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 28125a8decebee2a94a6a637a75f6989c6ea4b0c..7d39d3c1375c961039d933c3aa598bb1306fcdfc 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -18,19 +18,29 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h __private const short padding_top, __private const short padding_left, __private const short dilation_h, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const short dilation_w, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const short dilation_w) { +#endif + const short out_ch_blk = get_global_id(0); const short out_w_blk = get_global_id(1); const short out_hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1 || out_hb >= global_size_dim2) { return; } - const short out_w_blks = global_size_dim1; +#else + const short out_w_blks = get_global_size(1); +#endif + const short rounded_in_ch = in_ch_blks << 2; const short in_ch_blk = out_ch_blk; // multiplier = 1 @@ -149,17 +159,25 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4 __private const short filter_height, __private const short filter_width, __private const short padding_top, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const short padding_left, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const short padding_left) { +#endif + const short out_ch_blk = get_global_id(0); const short out_w_blk = get_global_id(1) << 2; const short out_hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1 || out_hb >= global_size_dim2) { return; } +#endif const short rounded_in_ch = in_ch_blks << 2; const short in_ch_blk = out_ch_blk; // multiplier = 1 diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index edfb777d0e222adad8fb6a07965276989e3794e7..d7c90e03924566165fd3665f36cb113d7f52b7fe 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -6,12 +6,20 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __private const float coeff0, __private const float coeff1, #endif +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __write_only image2d_t output) { +#endif + const int w = get_global_id(0); const int hb = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (w >= global_size_dim0 || hb >= global_size_dim1) return; +#endif DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index 90d84c11360bc7ef61c4921df5366b545fb83dac..a474c8cacc6fb44b68ec716a87556ebae94e019f 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -10,14 +10,22 @@ __kernel void fully_connected(__read_only image2d_t input, __private const int input_height, __private const int input_width, __private const int input_channel, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const float relux_max_limit, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __private const float relux_max_limit) { +#endif + const int batch_idx = get_global_id(0); const int out_blk_idx = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) { return; } +#endif const int input_chan_blk = (input_channel + 3) >> 2; @@ -74,19 +82,28 @@ __kernel void fully_connected_width(__read_only image2d_t input, __private const int input_width, __private const int in_chan_blks, __private const int out_blks, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const float relux_max_limit, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const float relux_max_limit) { +#endif + const int inter_out_idx = get_global_id(0); const int width_blk_idx = get_global_id(1); const int batch_out_blk_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (inter_out_idx >= global_size_dim0 || width_blk_idx >= global_size_dim1 || batch_out_blk_idx >= global_size_dim2) { return; } - const int width_blk_count = global_size_dim1; +#else + const int width_blk_count = get_global_size(1); +#endif const int batch_idx = batch_out_blk_idx / out_blks; const int out_blk_idx = batch_out_blk_idx % out_blks; diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index f0c2ee0ebf7446c14067d22dbc0c890b9fd72f41..7107838c894b7050e2d6b3b02bd7e85712a87d4c 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -8,12 +8,20 @@ __kernel void matmul(__read_only image2d_t A, __private const int N, __private const int K, __private const int height_blocks, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int k_blocks, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __private const int k_blocks) { +#endif + const int gx = get_global_id(0) << 2; const int hb = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return; +#endif const int batch = hb / height_blocks; const int ty = (hb % height_blocks); diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index dad48824dfb6cddfb2db2e01a56600294b3b85e5..8cdc4e4625a24e409c38697d9e40a39e57007ab7 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -27,19 +27,29 @@ __kernel void pooling(__read_only image2d_t input, __private const int pad_left, __private const int stride, __private const int pooling_size, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int out_chan_idx = get_global_id(0); const int out_width_idx = get_global_id(1); const int out_hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1 || out_hb_idx >= global_size_dim2) { return; } - const int out_width = global_size_dim1; +#else + const int out_width = get_global_size(1); +#endif + const int batch_idx = mul24((out_hb_idx / out_height), in_height); const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; const int in_width_start = mul24(out_width_idx, stride) - pad_left; diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index b3778cb236aafe40a348d87bd27131407245b258..5369c7625846757556debce41a2835b9fa71651f 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -6,19 +6,30 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * __private const float width_scale, __private const int in_height, __private const int in_width, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int out_height, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int out_height) { +#endif + const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (ch_blk >= global_size_dim0 || w >= global_size_dim1 || hb >= global_size_dim2) { return; } const int ch_blks = global_size_dim0; const int out_width = global_size_dim1; +#else + const int ch_blks = get_global_size(0); + const int out_width = get_global_size(1); +#endif const int b = hb / out_height; const int h = hb % out_height; diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index a626c0de63311614fa1d7c6a9d226e797af18b64..bb5f40cda7f1fdafecab81d0e48c3b3b410b2388 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -2,19 +2,28 @@ __kernel void slice(__read_only image2d_t input, __private const int chan_blk_offset, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int chan_blk_idx = get_global_id(0); const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; +#else + const int width = get_global_size(1); +#endif DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, (int2)(mad24(chan_blk_idx + chan_blk_offset, diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index e702739440e4609511cc240daf568126505792db..3fadd18e1b814e58ef0437e65aef7ea5820c9ca4 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -3,20 +3,30 @@ __kernel void softmax(__read_only image2d_t input, __private const int channels, __private const int remain_channels, +#ifndef USE_QUALCOMM_OPENCL_2_0 __write_only image2d_t output, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __write_only image2d_t output) { +#endif + const int chan_blk_idx = get_global_id(0); const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1 || hb_idx >= global_size_dim2) { return; } - const int chan_blks = global_size_dim0 - 1; const int width = global_size_dim1; +#else + const int chan_blks = get_global_size(0) - 1; + const int width = get_global_size(1); +#endif int pos = width_idx; DATA_TYPE max_value = -FLT_MAX; diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index e36313fe651d9555253bef02e2e875824a4c5bb9..822d09063e2ff3d40bdf4ad954325db9be90cf14 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -9,17 +9,25 @@ __kernel void space_to_batch(__read_only image2d_t space_data, __private const int space_height, __private const int space_width, __private const int batch_height, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int batch_width, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int batch_width) { +#endif + const int chan_idx = get_global_id(0); const int batch_w_idx = get_global_id(1); const int batch_hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 || batch_hb_idx >= global_size_dim2) { return; } +#endif const int batch_b_idx = batch_hb_idx / batch_height; const int batch_h_idx = batch_hb_idx % batch_height; @@ -55,17 +63,25 @@ __kernel void batch_to_space(__read_only image2d_t batch_data, __private const int space_height, __private const int space_width, __private const int batch_height, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int batch_width, __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2) { +#else + __private const int batch_width) { +#endif + const int chan_idx = get_global_id(0); const int batch_w_idx = get_global_id(1); const int batch_hb_idx = get_global_id(2); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1 || batch_hb_idx >= global_size_dim2) { return; } +#endif const int batch_b_idx = batch_hb_idx / batch_height; const int batch_h_idx = batch_hb_idx % batch_height; diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index 3acfc9028b6e92b5af1d9eec3c469fe69dec3fea..098c8e3b2f119f2d55dfe5d0250ddf70086a8554 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -8,16 +8,25 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input, __private const int round_hw, __private const int round_w, __private const int padding_top, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const int padding_left, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __private const int padding_left) { +#endif + int out_width_idx = get_global_id(0); int chan_blk_idx = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } - const int chan_blk_size = global_size_dim1; +#else + const int chan_blk_size = get_global_size(1); +#endif const int batch_idx = out_width_idx / round_hw; const int t_idx = out_width_idx % round_hw; @@ -121,16 +130,26 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input, __private const int out_width, __private const int round_hw, __private const int round_w, +#ifndef USE_QUALCOMM_OPENCL_2_0 __private const float relux_max_limit, __private const int global_size_dim0, __private const int global_size_dim1) { +#else + __private const float relux_max_limit) { +#endif + const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); + +#ifndef USE_QUALCOMM_OPENCL_2_0 if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } - const int out_channel = global_size_dim1; +#else + const int out_channel = get_global_size(1); +#endif + int width = width_idx; int height = height_idx; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index ccb5b6c20041d2b1744ab625dd144eba67560d99..1ddf37bc901e6456e8bc5ace3742fd07f5ee788c 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -31,10 +31,15 @@ static void Concat2(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (input0->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); @@ -83,12 +88,18 @@ static void ConcatN(cl::Kernel *kernel, const index_t channel = output->dim(3); auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 4bfa9ac7f1260cf86920fbc876f5cfb45276e83e..ad2af5a73a1a3e682c1334bbaa92945c0d49df97 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -37,6 +37,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const index_t input_channel_blocks = RoundUpDiv4(input_channels); auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { MACE_CHECK(input_batch == batch); @@ -45,6 +48,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (bias != nullptr) { built_options.emplace("-DBIAS"); } diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 97db8ab998e28f2152fdc9ebb1d9bc9bcd37e8d1..6ac0fa569ebe4ab1d58ca8a9a87cd1cc56564f44 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -37,12 +37,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 4f1b67f631c07bc76670ad10f7b42f4b2a1cd9fa..0fc944422fd1a22c4b37a3cce0123158b7bee1f3 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -37,12 +37,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 18b53853151a5e6c03d0b1abe14df69cd5c5b2f1..c43799db2d96312a63898904f5266bc8528ea810 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -42,6 +42,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); @@ -51,6 +53,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index a2e4e8f1f3a4ef0987a46c4779b8afa51498c423..e2a68396d18045e94c4697295f3f1f6c8e1ec691 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -29,6 +29,8 @@ void EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -37,6 +39,9 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); } diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index 3e17f98fe24ba2bc16e881b221abf830b308ae31..208f402595d638dde161d90159c526615b473f54 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -24,8 +24,11 @@ void FCWXKernel(cl::Kernel *kernel, << "FC width kernel only support input with 4x channel."; MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); + auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -34,6 +37,9 @@ void FCWXKernel(cl::Kernel *kernel, built_options.emplace("-Dfully_connected_width=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -133,14 +139,21 @@ void FCWTXKernel(cl::Kernel *kernel, StatsFuture *future) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); + + auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel->get() == nullptr) { - auto runtime = OpenCLRuntime::Global(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); built_options.emplace("-Dfully_connected=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } if (bias != nullptr) { built_options.emplace("-DBIAS"); } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 2141c65ec1f6074b17e29533965415f20836926d..641abd664c190026314cc4419d8ee240ee031118 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -194,12 +194,25 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { } } +const bool IsQualcommOpenCL200() { + auto runtime = OpenCLRuntime::Global(); + + if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO && + runtime->GetOpenclVersion() == "2.0") { + return true; + } else { + return false; + } +} + void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -236,8 +249,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; std::vector roundup_gws(3); - for (size_t i = 0; i < 3; ++i) { - roundup_gws[i] = RoundUp(gws[i], params[i]); + if(!is_qualcomm_opencl200) { + for (size_t i = 0; i < 3; ++i) { + roundup_gws[i] = RoundUp(gws[i], params[i]); + } } if (timer == nullptr) { @@ -247,18 +262,31 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - uint32_t roundup_gws2 = RoundUp(gws2, params[2]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + uint32_t roundup_gws2 = RoundUp(gws2, params[2]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; } } else { timer->ClearTiming(); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -274,11 +302,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - uint32_t roundup_gws2 = RoundUp(gws2, params[2]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, 0, i * block_size), - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } else { + uint32_t roundup_gws2 = RoundUp(gws2, params[2]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, 0, i * block_size), + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); } @@ -306,6 +341,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -330,8 +367,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; std::vector roundup_gws(2); - for (size_t i = 0; i < 2; ++i) { - roundup_gws[i] = RoundUp(gws[i], params[i]); + if (!is_qualcomm_opencl200) { + for (size_t i = 0; i < 2; ++i) { + roundup_gws[i] = RoundUp(gws[i], params[i]); + } } if (timer == nullptr) { @@ -341,17 +380,29 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - uint32_t roundup_gws1 = RoundUp(gws1, params[1]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } else { + uint32_t roundup_gws1 = RoundUp(gws1, params[1]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; } } else { timer->ClearTiming(); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + } else { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -367,10 +418,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - uint32_t roundup_gws1 = RoundUp(gws1, params[1]); - error = runtime->command_queue().enqueueNDRangeKernel( - kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1), - cl::NDRange(params[0], params[1]), nullptr, &event); + if (is_qualcomm_opencl200) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } else { + uint32_t roundup_gws1 = RoundUp(gws1, params[1]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; timer->AccumulateTiming(); } diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 89712c9b96aa043f5019cde6eae23aa07109f6f7..5b4e028318c1487825f553dce28079d4bc2faccf 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -102,6 +102,8 @@ std::string Concat(Args... args) { return ss.str(); } +const bool IsQualcommOpenCL200(); + } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 3609b1a6bac5f5d0d3588033372033af40b589c4..9e29306186f0714839a7c8f0763c5967bc11e21e 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -33,6 +33,8 @@ void MatMulFunctor::operator()(const Tensor *A, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -40,6 +42,9 @@ void MatMulFunctor::operator()(const Tensor *A, built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); } uint32_t idx = 0; diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 4e97174ea2d1d312a261e9cb8a9ee686c586bda2..f3d4714cd325b48714f5ddf25e1b24d85aecb39b 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -20,11 +20,14 @@ void PoolingFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); built_options.emplace("-Dpooling=" + kernel_name); + if (pooling_type_ == MAX && input->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); @@ -36,6 +39,9 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); } diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index d6a185194d184a6518a1445ee5e4d79f4a7a2e5b..63c71ea7fc4eb410b68ebba3dc707b5c331809c0 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -30,6 +30,8 @@ void ResizeBilinearFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); @@ -37,6 +39,9 @@ void ResizeBilinearFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options); } diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index f4e3908953c4fed503a11f2db3365d89e775cded..55773a521c34c47635032b2b3d2dd4b8da346189 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -31,6 +31,8 @@ void SliceFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); @@ -38,6 +40,9 @@ void SliceFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); } const index_t channel_blk = RoundUpDiv4(output_channels); diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 3ec6447ab3441adc95cbfc47c1bd15bb373603ae..321d7c296f9e756ca671e45ab4a6d554d72f40d8 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -28,6 +28,9 @@ void SoftmaxFunctor::operator()(const Tensor *logits, static_cast(height * batch)}; auto runtime = OpenCLRuntime::Global(); + + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); @@ -35,6 +38,9 @@ void SoftmaxFunctor::operator()(const Tensor *logits, auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); } if (!IsVecEqual(input_shape_, logits->shape())) { diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index b2de27486ff64c211d0bc1c617958f3cc66e51d6..128164f967f3ddadd547efa3862cd79529868fee 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -38,6 +38,8 @@ void SpaceToBatchFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; @@ -47,6 +49,9 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options); } diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index b3f4889b8b18b7bdc9340556c0e21e561e305baf..c4a20a0307e34e024556a0680051a6e36774772d 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -17,6 +17,8 @@ void WinogradTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); @@ -26,6 +28,9 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options); } @@ -90,6 +95,8 @@ void WinogradInverseTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); + const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); @@ -100,6 +107,9 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + if (is_qualcomm_opencl200) { + built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); + } built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation_) { case NOOP: diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh index cf82cbf62c7eee08e1cc7df1522b7c15e1b0c53c..2e3dede092a6ddb0852ae2e75730310fd91ddeec 100755 --- a/tools/bazel-adb-run.sh +++ b/tools/bazel-adb-run.sh @@ -18,8 +18,8 @@ BAZEL_BIN_PATH=${BAZEL_BIN_PATH#//} BAZEL_BIN_PATH=bazel-bin/$BAZEL_BIN_PATH BIN_NAME=`echo $BAZEL_TARGET | cut -d: -f2` -ANDROID_ABI=armeabi-v7a ANDROID_ABI=arm64-v8a +ANDROID_ABI=armeabi-v7a STRIP="--strip always" VLOG_LEVEL=0 PROFILING="1"