From 6a764b53c48734a5f5060f23aab830cc3ba9af88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=89=91=E6=AD=A6?= Date: Wed, 1 Aug 2018 10:57:09 +0800 Subject: [PATCH] Merge branch 'fix-opencl-tuning-bug' into 'master' Fix opencl get_gloabl_size wrong when the kernel split. See merge request !693 --- mace/kernels/BUILD | 2 + mace/kernels/buffer_to_image.h | 11 +- mace/kernels/fully_connected.h | 5 - mace/kernels/image_to_buffer.h | 23 +-- mace/kernels/opencl/activation.cc | 38 +---- mace/kernels/opencl/addn.cc | 37 +---- mace/kernels/opencl/batch_norm.cc | 39 +---- mace/kernels/opencl/bias_add.cc | 38 +---- mace/kernels/opencl/buffer_to_image.cc | 139 ++++++++---------- mace/kernels/opencl/channel_shuffle.cc | 38 +---- mace/kernels/opencl/cl/activation.cl | 4 +- mace/kernels/opencl/cl/batch_norm.cl | 4 +- mace/kernels/opencl/cl/bias_add.cl | 4 +- mace/kernels/opencl/cl/buffer_to_image.cl | 28 +--- mace/kernels/opencl/cl/channel_shuffle.cl | 4 +- mace/kernels/opencl/cl/common.h | 11 +- mace/kernels/opencl/cl/concat.cl | 8 +- mace/kernels/opencl/cl/conv_2d.cl | 4 +- mace/kernels/opencl/cl/conv_2d_1x1.cl | 4 +- mace/kernels/opencl/cl/conv_2d_3x3.cl | 4 +- mace/kernels/opencl/cl/depthwise_conv2d.cl | 4 +- mace/kernels/opencl/cl/fully_connected.cl | 2 +- mace/kernels/opencl/cl/pad.cl | 5 +- mace/kernels/opencl/cl/pooling.cl | 4 +- mace/kernels/opencl/cl/resize_bilinear.cl | 5 +- mace/kernels/opencl/cl/slice.cl | 4 +- mace/kernels/opencl/cl/softmax.cl | 5 +- mace/kernels/opencl/cl/winograd_transform.cl | 16 +- mace/kernels/opencl/common.h | 35 +++++ mace/kernels/opencl/concat.cc | 70 ++------- mace/kernels/opencl/conv_2d_1x1.cc | 39 +---- mace/kernels/opencl/conv_2d_3x3.cc | 39 +---- mace/kernels/opencl/conv_2d_general.cc | 38 +---- mace/kernels/opencl/crop.cc | 34 +---- .../{deconv_2d_opencl.cc => deconv_2d.cc} | 37 +---- mace/kernels/opencl/depth_to_space.cc | 33 +---- mace/kernels/opencl/depthwise_conv.cc | 37 +---- mace/kernels/opencl/eltwise.cc | 38 +---- mace/kernels/opencl/fully_connected.cc | 75 ++-------- mace/kernels/opencl/helper.cc | 26 +--- mace/kernels/opencl/helper.h | 101 +++++++++---- mace/kernels/opencl/image_to_buffer.cc | 127 +++++++--------- mace/kernels/opencl/matmul.cc | 38 +---- .../kernels/opencl/out_of_range_check_test.cc | 54 +++---- mace/kernels/opencl/pad.cc | 33 +---- mace/kernels/opencl/pooling.cc | 37 +---- .../{reduce_mean_opencl.cc => reduce_mean.cc} | 37 +---- mace/kernels/opencl/resize_bilinear.cc | 37 +---- mace/kernels/opencl/slice.cc | 32 +--- mace/kernels/opencl/softmax.cc | 37 +---- mace/kernels/opencl/space_to_batch.cc | 33 +---- mace/kernels/opencl/winograd_transform.cc | 71 ++------- mace/ops/ops_test_util.h | 2 +- 53 files changed, 486 insertions(+), 1144 deletions(-) create mode 100644 mace/kernels/opencl/common.h rename mace/kernels/opencl/{deconv_2d_opencl.cc => deconv_2d.cc} (85%) rename mace/kernels/opencl/{reduce_mean_opencl.cc => reduce_mean.cc} (81%) diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index a8991f47..1035b54b 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -44,10 +44,12 @@ cc_library( ], exclude = [ "buffer_to_image.h", + "image_to_buffer.h", ], ) + if_opencl_enabled(glob([ "opencl/*.h", "buffer_to_image.h", + "image_to_buffer.h", ])), copts = [ "-Werror", diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index fedf8190..1def9087 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -16,19 +16,18 @@ #define MACE_KERNELS_BUFFER_TO_IMAGE_H_ #include +#include #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { struct BufferToImageFunctorBase { explicit BufferToImageFunctorBase(const int wino_blk_size) - : kernel_error_(nullptr), - wino_blk_size_(wino_blk_size) {} - std::unique_ptr kernel_error_; + : wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; @@ -57,6 +56,10 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { const BufferType type, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; + std::unique_ptr kernel_error_; + std::vector input_shape_; }; } // namespace kernels diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 7a337a9d..82b0ba1e 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -23,11 +23,6 @@ #include "mace/kernels/activation.h" #include "mace/kernels/gemm.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/kernels/opencl/helper.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 77388da7..4e6b057f 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -16,19 +16,18 @@ #define MACE_KERNELS_IMAGE_TO_BUFFER_H_ #include +#include #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { struct ImageToBufferFunctorBase { explicit ImageToBufferFunctorBase(const int wino_blk_size) - : kernel_error_(nullptr), - wino_blk_size_(wino_blk_size) {} - std::unique_ptr kernel_error_; + : wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; @@ -37,9 +36,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { explicit ImageToBufferFunctor(const int wino_blk_size) : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { + const BufferType type, + Tensor *output, + StatsFuture *future) { MACE_UNUSED(input); MACE_UNUSED(type); MACE_UNUSED(output); @@ -54,9 +53,13 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { explicit ImageToBufferFunctor(const int wino_blk_size) : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future); + const BufferType type, + Tensor *output, + StatsFuture *future); + + cl::Kernel kernel_; + std::unique_ptr kernel_error_; + std::vector input_shape_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 2e343aa9..2cd0c2a3 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/activation.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" @@ -38,23 +37,13 @@ MaceStatus ActivationFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); switch (activation_) { case RELU: tuning_key_prefix_ = "relu_opencl_kernel"; @@ -92,15 +81,8 @@ MaceStatus ActivationFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); if (activation_ == PRELU) { MACE_CHECK_NOTNULL(alpha); @@ -119,13 +101,7 @@ MaceStatus ActivationFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index e47f5103..f01baa71 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -49,24 +49,14 @@ MaceStatus AddNFunctor::operator()( MACE_NOT_IMPLEMENTED; } std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, built_options, &kernel_)); @@ -92,14 +82,8 @@ MaceStatus AddNFunctor::operator()( output_tensor->ResizeImage(output_shape, output_image_shape)); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); for (auto input : input_tensors) { kernel_.setArg(idx++, *(input->opencl_image())); } @@ -114,14 +98,7 @@ MaceStatus AddNFunctor::operator()( output_tensor->dim(2), output_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index d2dce6d3..e26065d9 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/batch_norm.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" @@ -49,23 +48,13 @@ MaceStatus BatchNormFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (folded_constant_) { built_options.emplace("-DFOLDED_CONSTANT"); } @@ -96,15 +85,8 @@ MaceStatus BatchNormFunctor::operator()( } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(offset->opencl_image())); @@ -125,14 +107,7 @@ MaceStatus BatchNormFunctor::operator()( output->dim(1), output->dim(2), output->dim(3), folded_constant_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index b7023dd5..aaa0d172 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/bias_add.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" @@ -45,39 +44,21 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, built_options, &kernel_)); - kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); @@ -104,12 +85,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index df104d66..c95ef0ad 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "mace/kernels/buffer_to_image.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" + +#include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { @@ -30,7 +30,8 @@ MaceStatus BufferToImageFunctor::operator()( CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); if (type == WINOGRAD_FILTER) { std::vector new_shape = - CalWinogradShape(buffer->shape(), type, wino_blk_size_); + {(wino_blk_size_ + 2) * (wino_blk_size_ + 2), + buffer->dim(0), buffer->dim(1)}; MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); } else { MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); @@ -76,85 +77,74 @@ MaceStatus BufferToImageFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - if (buffer->dtype() == image->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - if (!kernel_error_) { - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (buffer->dtype() == image->dtype()) { + built_options.emplace( + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); } - cl::Kernel b2f_kernel; - - MACE_RETURN_IF_ERROR(runtime->BuildKernel( - "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel)); - - uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); - } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); - MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, - "buffer offset not aligned"); - b2f_kernel.setArg(idx++, - static_cast(buffer->buffer_offset() / - GetEnumTypeSize(buffer->dtype()))); - if (type == CONV2D_FILTER) { - const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, static_cast(inner_size)); - } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else if (type == ARGUMENT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else { - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); + if (!IsVecEqual(input_shape_, buffer->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(buffer->opencl_buffer())); + MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(buffer->buffer_offset() / + GetEnumTypeSize(buffer->dtype()))); + if (type == CONV2D_FILTER) { + const index_t + inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(1))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(image->opencl_image())); + input_shape_ = buffer->shape(); } - b2f_kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; cl_int error; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -163,16 +153,11 @@ MaceStatus BufferToImageFunctor::operator()( } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 8babf338..d7434683 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -45,23 +45,13 @@ MaceStatus ChannelShuffleFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("channel_shuffle", kernel_name, built_options, &kernel_)); @@ -72,15 +62,8 @@ MaceStatus ChannelShuffleFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, groups_); kernel_.setArg(idx++, static_cast(channels_per_group)); @@ -95,14 +78,7 @@ MaceStatus ChannelShuffleFunctor::operator()( output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 25e79b3d..6436b82a 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -17,10 +17,8 @@ __kernel void activation(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 064d8ecc..2da41eed 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -21,10 +21,8 @@ __kernel void batch_norm(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; #ifdef FOLDED_CONSTANT DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index 2cd5fb5c..31d11be7 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -14,10 +14,8 @@ __kernel void bias_add(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index c4ef4988..0ab39219 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -446,10 +446,8 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int inner_size = global_size_dim0; -#else - const int inner_size = get_global_size(0); #endif + const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; const int in_chan_idx = w % in_channels; @@ -492,10 +490,8 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int inner_size = global_size_dim0; -#else - const int inner_size = get_global_size(0); #endif + const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; const int in_chan_idx = w % in_channels; @@ -536,10 +532,8 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; const int hw_size = height * width; const int inner_size = in_channels * hw_size; @@ -585,10 +579,8 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; const int hw_size = height * width; const int inner_size = in_channels * hw_size; @@ -632,10 +624,8 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; @@ -782,10 +772,8 @@ __kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; @@ -960,10 +948,8 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index c404130e..6563c7a8 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -16,10 +16,8 @@ __kernel void channel_shuffle(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int group_blks = groups / 4; const int groups_blks_width = group_blks * width; diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 09731d77..8408f1be 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -38,24 +38,15 @@ CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \ CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value); -#ifndef NON_UNIFORM_WORK_GROUP - #define GLOBAL_WORK_GROUP_SIZE_DIM2 \ __private const int global_size_dim0, \ __private const int global_size_dim1, + #define GLOBAL_WORK_GROUP_SIZE_DIM3 \ __private const int global_size_dim0, \ __private const int global_size_dim1, \ __private const int global_size_dim2, -#else - -#define GLOBAL_WORK_GROUP_SIZE_DIM2 -#define GLOBAL_WORK_GROUP_SIZE_DIM3 - -#endif - - #ifdef OUT_OF_RANGE_CHECK #define KERNEL_ERROR_PARAMS \ diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index e6e78e86..e656109c 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -37,10 +37,8 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int input0_chan_blk = (input0_chan + 3) >> 2; @@ -100,10 +98,8 @@ __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; DATA_TYPE4 data = 0; data = READ_IMAGET(input, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index d23f6e43..b645502c 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -30,10 +30,8 @@ __kernel void conv_2d(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 65d2b9c6..b9b387e1 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -24,10 +24,8 @@ __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 13e4ccb3..07603287 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -28,10 +28,8 @@ __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 9173f1b4..979c882b 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -30,10 +30,8 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const short out_w_blks = global_size_dim1; -#else - const short out_w_blks = get_global_size(1); #endif + const short out_w_blks = global_size_dim1; const short rounded_in_ch = in_ch_blks << 2; const short in_ch_blk = out_ch_blk; // multiplier = 1 diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index e5de2c64..0aea2ee5 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -80,7 +80,7 @@ __kernel void fully_connected_width(KERNEL_ERROR_PARAMS __private const float relux_max_limit) { const int inter_out_idx = get_global_id(0); const int width_blk_idx = get_global_id(1); - const int width_blk_count = get_global_size(1); + const int width_blk_count = global_size_dim1; const int batch_out_blk_idx = get_global_id(2); const int batch_idx = batch_out_blk_idx / out_blks; diff --git a/mace/kernels/opencl/cl/pad.cl b/mace/kernels/opencl/cl/pad.cl index 1ccaa29a..8e102d60 100644 --- a/mace/kernels/opencl/cl/pad.cl +++ b/mace/kernels/opencl/cl/pad.cl @@ -23,11 +23,8 @@ __kernel void pad(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif - + const int width = global_size_dim1; DATA_TYPE4 data = constant_value; if ((height_padding <= height_idx && height_idx < input_padded_height) && diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index b0faddb0..11ff3b89 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -36,10 +36,8 @@ __kernel void pooling(KERNEL_ERROR_PARAMS || out_hb_idx >= global_size_dim2) { return; } - const int out_width = global_size_dim1; -#else - const int out_width = get_global_size(1); #endif + const int out_width = global_size_dim1; const int batch_idx = mul24((out_hb_idx / out_height), in_height); const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 8736bf52..8e1fb1e6 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -19,12 +19,9 @@ __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } +#endif const int ch_blks = global_size_dim0; const int out_width = global_size_dim1; -#else - const int ch_blks = get_global_size(0); - const int out_width = get_global_size(1); -#endif const int b = hb / out_height; const int h = hb % out_height; diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index 366cddc3..f6b0c35a 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -14,10 +14,8 @@ __kernel void slice(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, (int2)(mad24(chan_blk_idx + chan_blk_offset, diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 710433a2..361ea263 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -15,12 +15,9 @@ __kernel void softmax(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } +#endif const int chan_blks = global_size_dim0 - 1; const int width = global_size_dim1; -#else - const int chan_blks = get_global_size(0) - 1; - const int width = get_global_size(1); -#endif int pos = width_idx; DATA_TYPE max_value = -FLT_MAX; diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index bfc58981..5e0a467f 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -20,10 +20,8 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } - const int chan_blk_size = global_size_dim1; -#else - const int chan_blk_size = get_global_size(1); #endif + const int chan_blk_size = global_size_dim1; const int batch_idx = out_width_idx * round_hw_r; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); @@ -141,10 +139,8 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } - const int out_channel = global_size_dim1; -#else - const int out_channel = get_global_size(1); #endif + const int out_channel = global_size_dim1; int width = width_idx; int height = height_idx; @@ -255,10 +251,8 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } - const int chan_blk_size = global_size_dim1; -#else - const int chan_blk_size = get_global_size(1); #endif + const int chan_blk_size = global_size_dim1; const int batch_idx = out_width_idx * round_hw_r; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); @@ -417,10 +411,8 @@ __kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } - const int out_channel = global_size_dim1; -#else - const int out_channel = get_global_size(1); #endif + const int out_channel = global_size_dim1; const int batch = width_idx * round_hw_r; int h = mad24(batch, -round_hw, width_idx); diff --git a/mace/kernels/opencl/common.h b/mace/kernels/opencl/common.h new file mode 100644 index 00000000..176f58ed --- /dev/null +++ b/mace/kernels/opencl/common.h @@ -0,0 +1,35 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_COMMON_H_ +#define MACE_KERNELS_OPENCL_COMMON_H_ + +namespace mace { +namespace kernels { + +enum BufferType { + CONV2D_FILTER = 0, + IN_OUT_CHANNEL = 1, + ARGUMENT = 2, + IN_OUT_HEIGHT = 3, + IN_OUT_WIDTH = 4, + WINOGRAD_FILTER = 5, + DW_CONV2D_FILTER = 6, + WEIGHT_HEIGHT = 7, + WEIGHT_WIDTH = 8, +}; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_OPENCL_COMMON_H_ diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 98ac4342..58b27faa 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -65,26 +65,16 @@ static MaceStatus Concat2(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } if (input0->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); } if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); @@ -97,15 +87,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, } if (!IsVecEqual(*prev_input_shape, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(static_cast(input0->opencl_image()))); kernel->setArg(idx++, @@ -123,14 +106,7 @@ static MaceStatus Concat2(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } @@ -149,22 +125,12 @@ static MaceStatus ConcatN(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, built_options, kernel)); *kwg_size = @@ -185,15 +151,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel, const std::vector lws = LocalWS(gws, *kwg_size); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, static_cast(chan_blk_offset)); kernel->setArg(idx++, *(output->opencl_image())); @@ -217,12 +176,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } + OUT_OF_RANGE_VALIDATION(*kernel_error); if (future != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index e5eb2134..770f0606 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -98,22 +98,12 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, MACE_CHECK(input_batch == batch); std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -147,17 +137,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -182,14 +166,7 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 9984fe10..02df4ea1 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -84,22 +84,12 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -131,17 +121,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -169,14 +153,7 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index a6e29694..fa2c9774 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -93,22 +93,12 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -140,17 +130,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -182,13 +166,7 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index ba9248d1..651b2ef8 100644 --- a/mace/kernels/opencl/crop.cc +++ b/mace/kernels/opencl/crop.cc @@ -136,23 +136,13 @@ MaceStatus CropFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); built_options.emplace("-Dcrop=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, built_options, &kernel_)); @@ -161,15 +151,8 @@ MaceStatus CropFunctor::operator()( } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, static_cast(offsets[0])); kernel_.setArg(idx++, static_cast(offsets[1])); @@ -190,14 +173,7 @@ MaceStatus CropFunctor::operator()( output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d.cc similarity index 85% rename from mace/kernels/opencl/deconv_2d_opencl.cc rename to mace/kernels/opencl/deconv_2d.cc index ac7af70d..80e6370d 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d.cc @@ -59,22 +59,12 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); built_options.emplace("-Ddeconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -108,15 +98,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -152,13 +135,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index cd379b22..4c1fd3be 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -76,6 +76,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -83,18 +85,6 @@ MaceStatus DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", obfuscated_kernel_name, built_options, @@ -106,15 +96,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); if (d2s_) { kernel_.setArg(idx++, static_cast(block_size_)); @@ -140,13 +123,7 @@ MaceStatus DepthToSpaceOpFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 517ff16d..3c97a288 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -93,6 +93,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1"); @@ -100,20 +102,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(MakeString("-DSTRIDE=", stride)); switch (activation) { @@ -154,15 +144,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, input_channels); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -193,13 +176,7 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 503d5d5d..1f9eebe3 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -74,11 +74,13 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); built_options.emplace("-Deltwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); if (input1 == nullptr) { built_options.emplace("-DINPUT_TYPE=1"); @@ -90,19 +92,6 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, if (swapped) built_options.emplace("-DSWAPPED"); } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); - - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, built_options, &kernel_)); @@ -111,15 +100,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input0->opencl_image())); if (input1 == nullptr) { kernel_.setArg(idx++, value_); @@ -144,13 +126,7 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index cbd046b4..dc8798a5 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "mace/kernels/fully_connected.h" + +#include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" namespace mace { @@ -42,11 +44,13 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, const index_t output_blocks = RoundUpDiv4(output_size); std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); built_options.emplace("-Dfully_connected_width=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -71,19 +75,6 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_QUALCOMM_ADRENO"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, built_options, kernel)); @@ -113,15 +104,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, (*gws)[2] = static_cast(batch * output_blocks); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, (*gws)[0]); - kernel->setArg(idx++, (*gws)[1]); - kernel->setArg(idx++, (*gws)[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, *gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -154,12 +138,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); } - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } + OUT_OF_RANGE_VALIDATION(*kernel_error); MACE_CL_RET_STATUS(error); if (future != nullptr) { @@ -192,26 +171,16 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); built_options.emplace("-Dfully_connected=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } switch (activation) { case NOOP: break; @@ -247,14 +216,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, }; uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, (*gws)[0]); - kernel->setArg(idx++, (*gws)[1]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_2D_GWS_ARGS_PTR(kernel, *gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -276,13 +239,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } } // namespace diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 6d882352..6ef80c80 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -180,22 +180,6 @@ std::vector FormatBufferShape( } } -std::vector CalWinogradShape(const std::vector &shape, - const BufferType type, - const int wino_blk_size) { - if (type == WINOGRAD_FILTER) { - return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]}; - } else if (type == IN_OUT_HEIGHT) { - index_t out_width = - shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) * - ((shape[2] + wino_blk_size - 1) / wino_blk_size); - return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width}; - } else { - LOG(FATAL) << "Mace not supported yet."; - return std::vector(); - } -} - std::string DtToCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: @@ -220,7 +204,7 @@ std::string DtToCLCMDDt(const DataType dt) { } } -std::string DtToUpstreamCLDt(const DataType dt) { +std::string DtToUpCompatibleCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: case DT_HALF: @@ -231,7 +215,7 @@ std::string DtToUpstreamCLDt(const DataType dt) { } } -std::string DtToUpstreamCLCMDDt(const DataType dt) { +std::string DtToUpCompatibleCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: case DT_HALF: @@ -357,7 +341,8 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, double elapse_time = timer->AccumulatedMicros(); timer->ClearTiming(); uint32_t num_blocks = std::min( - static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[2]); + static_cast(elapse_time / kMaxKernelExecTime) + 1, + gws[2]); uint32_t block_size = gws[2] / num_blocks; if (!runtime->IsNonUniformWorkgroupsSupported()) { block_size = RoundUp(block_size, params[2]); @@ -465,7 +450,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, double elapse_time = timer->AccumulatedMicros(); timer->ClearTiming(); uint32_t num_blocks = std::min( - static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[1]); + static_cast(elapse_time / kMaxKernelExecTime) + 1, + gws[1]); uint32_t block_size = gws[1] / num_blocks; if (!runtime->IsNonUniformWorkgroupsSupported()) { block_size = RoundUp(block_size, params[1]); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5db95e3d..22d9f1cc 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -15,7 +15,9 @@ #ifndef MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_ +#include #include +#include #include #include "mace/core/future.h" @@ -23,26 +25,71 @@ #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/types.h" +#include "mace/kernels/opencl/common.h" #include "mace/utils/utils.h" namespace mace { namespace kernels { -const float kMaxKernelExeTime = 1000.0; // microseconds +#define OUT_OF_RANGE_CONFIG(kernel_error) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ + (kernel_error) = std::move(std::unique_ptr( \ + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ + MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ + (kernel_error)->Map(nullptr); \ + *((kernel_error)->mutable_data()) = 0; \ + (kernel_error)->UnMap(); \ + } -const int32_t kBaseGPUMemCacheSize = 16384; +#define OUT_OF_RANGE_SET_ARG \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + kernel_.setArg(idx++, \ + *(static_cast(kernel_error_->buffer()))); \ + } + +#define OUT_OF_RANGE_SET_ARG_PTR \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + kernel->setArg(idx++, \ + *(static_cast((*kernel_error)->buffer()))); \ + } + +#define OUT_OF_RANGE_VALIDATION(kernel_error) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + (kernel_error)->Map(nullptr); \ + char *kerror_code = (kernel_error)->mutable_data(); \ + MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;\ + (kernel_error)->UnMap(); \ + } + +#define NON_UNIFORM_WG_CONFIG \ + if (runtime->IsNonUniformWorkgroupsSupported()) { \ + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \ + } + +#define SET_3D_GWS_ARGS(kernel) \ + kernel.setArg(idx++, gws[0]); \ + kernel.setArg(idx++, gws[1]); \ + kernel.setArg(idx++, gws[2]); -enum BufferType { - CONV2D_FILTER = 0, - IN_OUT_CHANNEL = 1, - ARGUMENT = 2, - IN_OUT_HEIGHT = 3, - IN_OUT_WIDTH = 4, - WINOGRAD_FILTER = 5, - DW_CONV2D_FILTER = 6, - WEIGHT_HEIGHT = 7, - WEIGHT_WIDTH = 8, -}; +#define SET_2D_GWS_ARGS(kernel) \ + kernel.setArg(idx++, gws[0]); \ + kernel.setArg(idx++, gws[1]); + +#define SET_3D_GWS_ARGS_PTR(kernel, gws) \ + kernel->setArg(idx++, (gws)[0]); \ + kernel->setArg(idx++, (gws)[1]); \ + kernel->setArg(idx++, (gws)[2]); + +#define SET_2D_GWS_ARGS_PTR(kernel, gws) \ + kernel->setArg(idx++, (gws)[0]); \ + kernel->setArg(idx++, (gws)[1]); + +// Max execution time of OpenCL kernel for tuning to prevent UI stuck. +const float kMaxKernelExecTime = 1000.0; // microseconds + +// Base GPU cache size used for computing local work group size. +const int32_t kBaseGPUMemCacheSize = 16384; void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, @@ -53,41 +100,35 @@ std::vector FormatBufferShape( const std::vector &buffer_shape, const BufferType type); -std::vector CalWinogradShape(const std::vector &shape, - const BufferType type, - const int wino_blk_size = 2); - +// CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); -std::string DtToUpstreamCLCMDDt(const DataType dt); +// CPU data type to upward compatible OpenCL command data type +// e.g. half -> float +std::string DtToUpCompatibleCLCMDDt(const DataType dt); +// CPU data type to OpenCL data type std::string DtToCLDt(const DataType dt); -std::string DtToUpstreamCLDt(const DataType dt); +// CPU data type to upward compatible OpenCL data type +// e.g. half -> float +std::string DtToUpCompatibleCLDt(const DataType dt); +// Tuning or Run OpenCL kernel with 3D work group size MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); +// Tuning or Run OpenCL kernel with 2D work group size MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); -inline void SetFuture(StatsFuture *future, const cl::Event &event) { - if (future != nullptr) { - future->wait_fn = [event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - OpenCLRuntime::Global()->GetCallStats(event, stats); - } - }; - } -} - +// Check whether limit OpenCL kernel time flag open. inline bool LimitKernelTime() { const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index c8635e1a..955b9ebe 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -13,9 +13,10 @@ // limitations under the License. #include "mace/kernels/image_to_buffer.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" + namespace mace { namespace kernels { @@ -68,80 +69,71 @@ MaceStatus ImageToBufferFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - if (buffer->dtype() == image->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - if (!kernel_error_) { - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (buffer->dtype() == image->dtype()) { + built_options.emplace( + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel_)); } - cl::Kernel b2f_kernel; - MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &b2f_kernel)); - uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); - } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); - if (type == CONV2D_FILTER) { - const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, static_cast(inner_size)); - } else if (type == ARGUMENT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else if (type == WEIGHT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else { - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); + if (!IsVecEqual(input_shape_, image->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(buffer->opencl_buffer())); + if (type == CONV2D_FILTER) { + const index_t + inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + } else if (type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(1))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(image->opencl_image())); + input_shape_ = image->shape(); } - b2f_kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; cl_int error; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -150,16 +142,11 @@ MaceStatus ImageToBufferFunctor::operator()( } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index f9825eb6..7e5e52b4 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -57,23 +57,13 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, built_options, &kernel_)); @@ -81,14 +71,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(B->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image())); @@ -98,18 +82,12 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(height_blocks)); kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); - const std::vector lws = {kwg_size_ / 64, 64, 0}; + const std::vector lws = {kwg_size_ / 64, 64, 1}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 2497a4df..d257fea2 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -40,9 +40,8 @@ bool BufferToImageOpImpl(Tensor *buffer, std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + OUT_OF_RANGE_CONFIG(kernel_error); + NON_UNIFORM_WG_CONFIG; if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); @@ -50,57 +49,46 @@ bool BufferToImageOpImpl(Tensor *buffer, DtToCLCMDDt(DataTypeToEnum::value)); } else { built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error->Allocate(1)); - kernel_error->Map(nullptr); - *(kernel_error->mutable_data()) = 0; - kernel_error->UnMap(); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace( + "-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } - cl::Kernel b2f_kernel; - + cl::Kernel kernel; cl_int error = runtime->BuildKernel("buffer_to_image", obfuscated_kernel_name, - built_options, &b2f_kernel); + built_options, + &kernel); if (error != CL_SUCCESS) { return false; } uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); + kernel.setArg(idx++, + *(static_cast(kernel_error->buffer()))); } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); + SET_2D_GWS_ARGS(kernel); + kernel.setArg(idx++, *(buffer->opencl_buffer())); MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); - b2f_kernel.setArg(idx++, + kernel.setArg(idx++, static_cast(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype()))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, *(image->opencl_image())); + kernel.setArg(idx++, static_cast(buffer->dim(1))); + kernel.setArg(idx++, static_cast(buffer->dim(2))); + kernel.setArg(idx++, static_cast(buffer->dim(3))); + kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -109,7 +97,7 @@ bool BufferToImageOpImpl(Tensor *buffer, } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } if (error != CL_SUCCESS) { diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 1a8879e8..04e9d69d 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -51,23 +51,13 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); built_options.emplace("-Dpad=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, built_options, &kernel_)); @@ -81,15 +71,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, this->constant_value_); @@ -108,13 +91,7 @@ MaceStatus PadFunctor::operator()(const Tensor *input, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 405c05a1..7111317a 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -59,6 +59,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); built_options.emplace("-Dpooling=" + kernel_name); @@ -67,24 +69,12 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); } if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name, built_options, @@ -130,15 +120,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, }; uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(input->dim(1))); kernel_.setArg(idx++, static_cast(input->dim(2))); @@ -171,13 +154,7 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean.cc similarity index 81% rename from mace/kernels/opencl/reduce_mean_opencl.cc rename to mace/kernels/opencl/reduce_mean.cc index 266a1111..075632c5 100644 --- a/mace/kernels/opencl/reduce_mean_opencl.cc +++ b/mace/kernels/opencl/reduce_mean.cc @@ -50,26 +50,15 @@ MaceStatus ReduceMeanFunctor::operator()( if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); built_options.emplace("-Dreduce_mean=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_QUALCOMM_ADRENO"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", kernel_name, built_options, @@ -96,15 +85,8 @@ MaceStatus ReduceMeanFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), nullptr); @@ -140,12 +122,7 @@ MaceStatus ReduceMeanFunctor::operator()( cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index c5b8b65b..0b297dd2 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -74,23 +74,13 @@ MaceStatus ResizeBilinearFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("resize_bilinear", kernel_name, @@ -115,15 +105,8 @@ MaceStatus ResizeBilinearFunctor::operator()( CalculateResizeScale(in_width, out_width, align_corners_); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, height_scale); @@ -142,13 +125,7 @@ MaceStatus ResizeBilinearFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 726bcae9..b778e0d7 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -44,23 +44,13 @@ MaceStatus SliceFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", kernel_name, built_options, @@ -81,15 +71,8 @@ MaceStatus SliceFunctor::operator()( CallStats call_stats{INT64_MAX, 0}; for (size_t i = 0; i < outputs_count; ++i) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(channel_blk * i)); kernel_.setArg(idx++, *(output_list[i]->opencl_image())); @@ -111,12 +94,7 @@ MaceStatus SliceFunctor::operator()( cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 07855488..f401b827 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -82,23 +82,13 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, built_options, &kernel_)); @@ -107,15 +97,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, } if (!IsVecEqual(input_shape_, logits->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(logits->opencl_image())); kernel_.setArg(idx++, static_cast(channels)); kernel_.setArg(idx++, remain_channels); @@ -130,13 +113,7 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index 3606d91d..c31b2d69 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -59,24 +59,14 @@ MaceStatus SpaceToBatchFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", obfuscated_kernel_name, built_options, @@ -87,15 +77,8 @@ MaceStatus SpaceToBatchFunctor::operator()( } if (!IsVecEqual(space_shape_, space_tensor->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); if (b2s_) { kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, *(space_tensor->opencl_image())); @@ -123,13 +106,7 @@ MaceStatus SpaceToBatchFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 6cc8e08c..dd6d16d6 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -29,6 +29,8 @@ MaceStatus WinogradTransformFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); @@ -44,21 +46,9 @@ MaceStatus WinogradTransformFunctor::operator()( return MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options, @@ -107,14 +97,8 @@ MaceStatus WinogradTransformFunctor::operator()( MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); @@ -139,13 +123,7 @@ MaceStatus WinogradTransformFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } @@ -160,6 +138,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); @@ -176,21 +156,9 @@ MaceStatus WinogradInverseTransformFunctor::operator()( } built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation_) { case NOOP: @@ -240,14 +208,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( const float round_w_r = 1.f / static_cast(round_w); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg( idx++, *(static_cast(input_tensor->opencl_image()))); @@ -275,12 +237,7 @@ MaceStatus WinogradInverseTransformFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 37e3b4ac..e8da144d 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -28,7 +28,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" #include "mace/ops/ops_register.h" #include "mace/utils/utils.h" -- GitLab