diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index a8991f472af6d90da8c9b3e499a0572873826025..1035b54bed1b69c2da05b4c3f2c52d6222bac95f 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -44,10 +44,12 @@ cc_library( ], exclude = [ "buffer_to_image.h", + "image_to_buffer.h", ], ) + if_opencl_enabled(glob([ "opencl/*.h", "buffer_to_image.h", + "image_to_buffer.h", ])), copts = [ "-Werror", diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index fedf8190c3bae60ed46d8fb6611aa0498e11c3e3..1def908705686085ac5e5a73f9e022e6f4df27e1 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -16,19 +16,18 @@ #define MACE_KERNELS_BUFFER_TO_IMAGE_H_ #include +#include #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { struct BufferToImageFunctorBase { explicit BufferToImageFunctorBase(const int wino_blk_size) - : kernel_error_(nullptr), - wino_blk_size_(wino_blk_size) {} - std::unique_ptr kernel_error_; + : wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; @@ -57,6 +56,10 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { const BufferType type, Tensor *output, StatsFuture *future); + + cl::Kernel kernel_; + std::unique_ptr kernel_error_; + std::vector input_shape_; }; } // namespace kernels diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 7a337a9d847ff658e4e48304122abe0ecd10e269..82b0ba1e4f3d408fa5ddf2e919e7d177d7b0c29a 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -23,11 +23,6 @@ #include "mace/kernels/activation.h" #include "mace/kernels/gemm.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/kernels/opencl/helper.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { namespace kernels { diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 77388da7511a5493ba5cc1f4cd93e8fb6f5c6d82..4e6b057f78520bbb05b18599482ff04a24e407c2 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -16,19 +16,18 @@ #define MACE_KERNELS_IMAGE_TO_BUFFER_H_ #include +#include #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { struct ImageToBufferFunctorBase { explicit ImageToBufferFunctorBase(const int wino_blk_size) - : kernel_error_(nullptr), - wino_blk_size_(wino_blk_size) {} - std::unique_ptr kernel_error_; + : wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; @@ -37,9 +36,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { explicit ImageToBufferFunctor(const int wino_blk_size) : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { + const BufferType type, + Tensor *output, + StatsFuture *future) { MACE_UNUSED(input); MACE_UNUSED(type); MACE_UNUSED(output); @@ -54,9 +53,13 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { explicit ImageToBufferFunctor(const int wino_blk_size) : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future); + const BufferType type, + Tensor *output, + StatsFuture *future); + + cl::Kernel kernel_; + std::unique_ptr kernel_error_; + std::vector input_shape_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 2e343aa902d87d1595aef5f92562a68deec9549d..2cd0c2a3868357946ccb73979fb0e4b4c1391a06 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/activation.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" @@ -38,23 +37,13 @@ MaceStatus ActivationFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); switch (activation_) { case RELU: tuning_key_prefix_ = "relu_opencl_kernel"; @@ -92,15 +81,8 @@ MaceStatus ActivationFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); if (activation_ == PRELU) { MACE_CHECK_NOTNULL(alpha); @@ -119,13 +101,7 @@ MaceStatus ActivationFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index e47f5103ef56b33544966b5f5fb8eee0b4ae0a52..f01baa7170dbbf3c907ac38ab6d45bd600e50a31 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -49,24 +49,14 @@ MaceStatus AddNFunctor::operator()( MACE_NOT_IMPLEMENTED; } std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, built_options, &kernel_)); @@ -92,14 +82,8 @@ MaceStatus AddNFunctor::operator()( output_tensor->ResizeImage(output_shape, output_image_shape)); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); for (auto input : input_tensors) { kernel_.setArg(idx++, *(input->opencl_image())); } @@ -114,14 +98,7 @@ MaceStatus AddNFunctor::operator()( output_tensor->dim(2), output_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index d2dce6d3708b69be7b7ac36d0aaeef1dd48bf920..e26065d9d340022455b585e55c953aab8c307e5c 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/batch_norm.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" @@ -49,23 +48,13 @@ MaceStatus BatchNormFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (folded_constant_) { built_options.emplace("-DFOLDED_CONSTANT"); } @@ -96,15 +85,8 @@ MaceStatus BatchNormFunctor::operator()( } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(offset->opencl_image())); @@ -125,14 +107,7 @@ MaceStatus BatchNormFunctor::operator()( output->dim(1), output->dim(2), output->dim(3), folded_constant_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index b7023dd58fdf42afa7e4655b6034c1f8d3c4c69b..aaa0d17203c40dbd177e5a42956b8d9d3078c9f2 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "mace/kernels/bias_add.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" @@ -45,39 +44,21 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, built_options, &kernel_)); - kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); @@ -104,12 +85,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index df104d66e299a361b601d997581a221194e1ae46..c95ef0ade2789f880cb563ee2d0103c7de4abf6f 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "mace/kernels/buffer_to_image.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" + +#include "mace/kernels/opencl/helper.h" namespace mace { namespace kernels { @@ -30,7 +30,8 @@ MaceStatus BufferToImageFunctor::operator()( CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_); if (type == WINOGRAD_FILTER) { std::vector new_shape = - CalWinogradShape(buffer->shape(), type, wino_blk_size_); + {(wino_blk_size_ + 2) * (wino_blk_size_ + 2), + buffer->dim(0), buffer->dim(1)}; MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); } else { MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); @@ -76,85 +77,74 @@ MaceStatus BufferToImageFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - if (buffer->dtype() == image->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - if (!kernel_error_) { - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (buffer->dtype() == image->dtype()) { + built_options.emplace( + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); } - cl::Kernel b2f_kernel; - - MACE_RETURN_IF_ERROR(runtime->BuildKernel( - "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel)); - - uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); - } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); - MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, - "buffer offset not aligned"); - b2f_kernel.setArg(idx++, - static_cast(buffer->buffer_offset() / - GetEnumTypeSize(buffer->dtype()))); - if (type == CONV2D_FILTER) { - const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, static_cast(inner_size)); - } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else if (type == ARGUMENT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else { - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); + if (!IsVecEqual(input_shape_, buffer->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(buffer->opencl_buffer())); + MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(buffer->buffer_offset() / + GetEnumTypeSize(buffer->dtype()))); + if (type == CONV2D_FILTER) { + const index_t + inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(1))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(image->opencl_image())); + input_shape_ = buffer->shape(); } - b2f_kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; cl_int error; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -163,16 +153,11 @@ MaceStatus BufferToImageFunctor::operator()( } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 8babf33845c584d0ddd44ea1cc8212a1c3588f35..d74346832d9ff41af251decbdd0e113a408c9f62 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -45,23 +45,13 @@ MaceStatus ChannelShuffleFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("channel_shuffle", kernel_name, built_options, &kernel_)); @@ -72,15 +62,8 @@ MaceStatus ChannelShuffleFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, groups_); kernel_.setArg(idx++, static_cast(channels_per_group)); @@ -95,14 +78,7 @@ MaceStatus ChannelShuffleFunctor::operator()( output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 25e79b3db2e36f72ec961b1a6f3206b416ce81dd..6436b82ae8fff1be4994f1cad6f11ba084bba367 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -17,10 +17,8 @@ __kernel void activation(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 064d8ecc4b88310a1817e701823a2b75c8db1036..2da41eedf8a4cad86c048d1b4cd450fd6a7ca593 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -21,10 +21,8 @@ __kernel void batch_norm(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; #ifdef FOLDED_CONSTANT DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index 2cd5fb5caa9dc3c39c82bb6ff2c0f1c16dc87a66..31d11be70eb18242b0d21d12bbb96b1eb117cee0 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -14,10 +14,8 @@ __kernel void bias_add(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index c4ef49886e81fd54ffc2195330b1d3caaa51299a..0ab39219c6068544037289b5b9a24d60069f5bd2 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -446,10 +446,8 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int inner_size = global_size_dim0; -#else - const int inner_size = get_global_size(0); #endif + const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; const int in_chan_idx = w % in_channels; @@ -492,10 +490,8 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int inner_size = global_size_dim0; -#else - const int inner_size = get_global_size(0); #endif + const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; const int in_chan_idx = w % in_channels; @@ -536,10 +532,8 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; const int hw_size = height * width; const int inner_size = in_channels * hw_size; @@ -585,10 +579,8 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; const int hw_size = height * width; const int inner_size = in_channels * hw_size; @@ -632,10 +624,8 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; @@ -782,10 +772,8 @@ __kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; @@ -960,10 +948,8 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS if (w >= global_size_dim0 || h >= global_size_dim1) { return; } - const int out_channels = global_size_dim1; -#else - const int out_channels = get_global_size(1); #endif + const int out_channels = global_size_dim1; const int out_channel_idx = h; const int in_channel_idx = w << 2; diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index c404130eb0fd04d9f900b986405c1bd616bcbc8b..6563c7a8b79db82f9829cde65bcb5e558553ec0e 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -16,10 +16,8 @@ __kernel void channel_shuffle(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int group_blks = groups / 4; const int groups_blks_width = group_blks * width; diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 09731d77fe92f948e2b287ad13eb4e5f0b304ec3..8408f1be075337caa38b30b7d9f15bb25ab24f85 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -38,24 +38,15 @@ CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \ CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value); -#ifndef NON_UNIFORM_WORK_GROUP - #define GLOBAL_WORK_GROUP_SIZE_DIM2 \ __private const int global_size_dim0, \ __private const int global_size_dim1, + #define GLOBAL_WORK_GROUP_SIZE_DIM3 \ __private const int global_size_dim0, \ __private const int global_size_dim1, \ __private const int global_size_dim2, -#else - -#define GLOBAL_WORK_GROUP_SIZE_DIM2 -#define GLOBAL_WORK_GROUP_SIZE_DIM3 - -#endif - - #ifdef OUT_OF_RANGE_CHECK #define KERNEL_ERROR_PARAMS \ diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index e6e78e8651286118f8c7f7772eebbfe6c138cfa2..e656109cef25a5b41ffaa8d166f0f90a8579af66 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -37,10 +37,8 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; const int input0_chan_blk = (input0_chan + 3) >> 2; @@ -100,10 +98,8 @@ __kernel void concat_channel_multi(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; DATA_TYPE4 data = 0; data = READ_IMAGET(input, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index d23f6e435c33325f2c019f8959c4adff74e7491c..b645502c48797d92a8c45691a466d7f67f1bb704 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -30,10 +30,8 @@ __kernel void conv_2d(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 65d2b9c68e8680ac62763df59daee3915e8b3f25..b9b387e1926304a4431c5e63e096d8fac8592430 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -24,10 +24,8 @@ __kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 13e4ccb347cc4a42c67e7f8dc7e740cf336af0d9..076032879a53b572a989f1b59d7a4ea171c11b18 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -28,10 +28,8 @@ __kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const int out_w_blks = global_size_dim1; -#else - const int out_w_blks = get_global_size(1); #endif + const int out_w_blks = global_size_dim1; #ifdef BIAS DATA_TYPE4 out0 = diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index 9173f1b4d2c47c86db4bf4610ba3ab5c5085df9f..979c882b2565f42b7579a92b4d1553e93b9b602b 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -30,10 +30,8 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS || out_hb >= global_size_dim2) { return; } - const short out_w_blks = global_size_dim1; -#else - const short out_w_blks = get_global_size(1); #endif + const short out_w_blks = global_size_dim1; const short rounded_in_ch = in_ch_blks << 2; const short in_ch_blk = out_ch_blk; // multiplier = 1 diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index e5de2c641e10fc51415004b7a46d075fdb62ddb3..0aea2ee56e99f3213b178f8ff60e6963bda2d052 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -80,7 +80,7 @@ __kernel void fully_connected_width(KERNEL_ERROR_PARAMS __private const float relux_max_limit) { const int inter_out_idx = get_global_id(0); const int width_blk_idx = get_global_id(1); - const int width_blk_count = get_global_size(1); + const int width_blk_count = global_size_dim1; const int batch_out_blk_idx = get_global_id(2); const int batch_idx = batch_out_blk_idx / out_blks; diff --git a/mace/kernels/opencl/cl/pad.cl b/mace/kernels/opencl/cl/pad.cl index 1ccaa29af28df2bbf329f6716511d736d2624964..8e102d60c76f1acd7b8f2551e8021e45bdb3256e 100644 --- a/mace/kernels/opencl/cl/pad.cl +++ b/mace/kernels/opencl/cl/pad.cl @@ -23,11 +23,8 @@ __kernel void pad(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif - + const int width = global_size_dim1; DATA_TYPE4 data = constant_value; if ((height_padding <= height_idx && height_idx < input_padded_height) && diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index b0faddb021451f834334b1ca84c318011d1cdd9c..11ff3b89569a31a73437cb8f22b0e009d2ad5769 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -36,10 +36,8 @@ __kernel void pooling(KERNEL_ERROR_PARAMS || out_hb_idx >= global_size_dim2) { return; } - const int out_width = global_size_dim1; -#else - const int out_width = get_global_size(1); #endif + const int out_width = global_size_dim1; const int batch_idx = mul24((out_hb_idx / out_height), in_height); const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 8736bf52717da115eaf9f32afe8a908b3754cee8..8e1fb1e6ecdf987a822e2b8c2656d4162cbd984e 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -19,12 +19,9 @@ __kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS || hb >= global_size_dim2) { return; } +#endif const int ch_blks = global_size_dim0; const int out_width = global_size_dim1; -#else - const int ch_blks = get_global_size(0); - const int out_width = get_global_size(1); -#endif const int b = hb / out_height; const int h = hb % out_height; diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index 366cddc32f77e8e7d7bfcbe35abf3880d560274c..f6b0c35a95249129a2d235557744ff9c522b7e84 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -14,10 +14,8 @@ __kernel void slice(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } - const int width = global_size_dim1; -#else - const int width = get_global_size(1); #endif + const int width = global_size_dim1; DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, (int2)(mad24(chan_blk_idx + chan_blk_offset, diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index 710433a219146af66e63447fbbab7470a9ea9e2f..361ea263cc13a79f561f633bc17112e1b6f0c2d7 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -15,12 +15,9 @@ __kernel void softmax(KERNEL_ERROR_PARAMS || hb_idx >= global_size_dim2) { return; } +#endif const int chan_blks = global_size_dim0 - 1; const int width = global_size_dim1; -#else - const int chan_blks = get_global_size(0) - 1; - const int width = get_global_size(1); -#endif int pos = width_idx; DATA_TYPE max_value = -FLT_MAX; diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index bfc589814c1eb8f52a1b71414ddb0dcd87a4fa4a..5e0a467fda8277532d66d787afbffa22626da443 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -20,10 +20,8 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } - const int chan_blk_size = global_size_dim1; -#else - const int chan_blk_size = get_global_size(1); #endif + const int chan_blk_size = global_size_dim1; const int batch_idx = out_width_idx * round_hw_r; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); @@ -141,10 +139,8 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } - const int out_channel = global_size_dim1; -#else - const int out_channel = get_global_size(1); #endif + const int out_channel = global_size_dim1; int width = width_idx; int height = height_idx; @@ -255,10 +251,8 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { return; } - const int chan_blk_size = global_size_dim1; -#else - const int chan_blk_size = get_global_size(1); #endif + const int chan_blk_size = global_size_dim1; const int batch_idx = out_width_idx * round_hw_r; const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); @@ -417,10 +411,8 @@ __kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { return; } - const int out_channel = global_size_dim1; -#else - const int out_channel = get_global_size(1); #endif + const int out_channel = global_size_dim1; const int batch = width_idx * round_hw_r; int h = mad24(batch, -round_hw, width_idx); diff --git a/mace/kernels/opencl/common.h b/mace/kernels/opencl/common.h new file mode 100644 index 0000000000000000000000000000000000000000..176f58eda5acbb9ab88807f92147e2db859fe3c0 --- /dev/null +++ b/mace/kernels/opencl/common.h @@ -0,0 +1,35 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_COMMON_H_ +#define MACE_KERNELS_OPENCL_COMMON_H_ + +namespace mace { +namespace kernels { + +enum BufferType { + CONV2D_FILTER = 0, + IN_OUT_CHANNEL = 1, + ARGUMENT = 2, + IN_OUT_HEIGHT = 3, + IN_OUT_WIDTH = 4, + WINOGRAD_FILTER = 5, + DW_CONV2D_FILTER = 6, + WEIGHT_HEIGHT = 7, + WEIGHT_WIDTH = 8, +}; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_OPENCL_COMMON_H_ diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 98ac4342908f8be480f667b55654448d23e17ede..58b27faa92ff1a3979249cbe9a552ff4f58323c6 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -65,26 +65,16 @@ static MaceStatus Concat2(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } if (input0->dtype() == output->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); } if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); @@ -97,15 +87,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, } if (!IsVecEqual(*prev_input_shape, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(static_cast(input0->opencl_image()))); kernel->setArg(idx++, @@ -123,14 +106,7 @@ static MaceStatus Concat2(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } @@ -149,22 +125,12 @@ static MaceStatus ConcatN(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, built_options, kernel)); *kwg_size = @@ -185,15 +151,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel, const std::vector lws = LocalWS(gws, *kwg_size); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, static_cast(chan_blk_offset)); kernel->setArg(idx++, *(output->opencl_image())); @@ -217,12 +176,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } + OUT_OF_RANGE_VALIDATION(*kernel_error); if (future != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index e5eb2134a21504283b173382aa6e112e2ba7c303..770f0606d4152c6ad7e65f92c94246487571417a 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -98,22 +98,12 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, MACE_CHECK(input_batch == batch); std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -147,17 +137,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -182,14 +166,7 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 9984fe10fcd6043762b966b7abe920987859a52a..02df4ea166abd8f80bd77e9cc6c91754a30503e8 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -84,22 +84,12 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -131,17 +121,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -169,14 +153,7 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index a6e2969443b7ca28d009f94853249a41156cae00..fa2c9774b607652c3ca307239d5baae33aeac699 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -93,22 +93,12 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -140,17 +130,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; + // Support different input size if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -182,13 +166,7 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index ba9248d14f9c2a6b4bdac530808c7c375334cee4..651b2ef87a544ca6f682aedb3e8a2c1ae3bd4bf1 100644 --- a/mace/kernels/opencl/crop.cc +++ b/mace/kernels/opencl/crop.cc @@ -136,23 +136,13 @@ MaceStatus CropFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); built_options.emplace("-Dcrop=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, built_options, &kernel_)); @@ -161,15 +151,8 @@ MaceStatus CropFunctor::operator()( } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, static_cast(offsets[0])); kernel_.setArg(idx++, static_cast(offsets[1])); @@ -190,14 +173,7 @@ MaceStatus CropFunctor::operator()( output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d.cc similarity index 85% rename from mace/kernels/opencl/deconv_2d_opencl.cc rename to mace/kernels/opencl/deconv_2d.cc index ac7af70dbe12aaf34d3b85e7e4698d6ad7a96ab6..80e6370d751af8cd6f5e26fa6b5b8f0046b54592 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d.cc @@ -59,22 +59,12 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); built_options.emplace("-Ddeconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -108,15 +98,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -152,13 +135,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index cd379b2274882bafe78737cad4f833e3fa7d14c4..4c1fd3becb1ada46dee96afe50ff56ff728ba0e9 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -76,6 +76,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -83,18 +85,6 @@ MaceStatus DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", obfuscated_kernel_name, built_options, @@ -106,15 +96,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); if (d2s_) { kernel_.setArg(idx++, static_cast(block_size_)); @@ -140,13 +123,7 @@ MaceStatus DepthToSpaceOpFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 517ff16d26e8bb894d890e4136e0eef6dca1a713..3c97a28845ae09152e6439092db8b78e4f992275 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -93,6 +93,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1"); @@ -100,20 +102,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(MakeString("-DSTRIDE=", stride)); switch (activation) { @@ -154,15 +144,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, input_channels); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, gws[0]); - kernel->setArg(idx++, gws[1]); - kernel->setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image())); if (bias != nullptr) { @@ -193,13 +176,7 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 503d5d5d43c5bef2d67a25a8bd712ff3f92a473d..1f9eebe35702c9cd713b82e50d7a8abcbc43f830 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -74,11 +74,13 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); built_options.emplace("-Deltwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); if (input1 == nullptr) { built_options.emplace("-DINPUT_TYPE=1"); @@ -90,19 +92,6 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, if (swapped) built_options.emplace("-DSWAPPED"); } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); - - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, built_options, &kernel_)); @@ -111,15 +100,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input0->opencl_image())); if (input1 == nullptr) { kernel_.setArg(idx++, value_); @@ -144,13 +126,7 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index cbd046b46d62fa0c5dde3a753aef22b4932def3a..dc8798a53c816b7a03153ae8dd2604d02dd7de67 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "mace/kernels/fully_connected.h" + +#include "mace/kernels/opencl/helper.h" #include "mace/utils/tuner.h" namespace mace { @@ -42,11 +44,13 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, const index_t output_blocks = RoundUpDiv4(output_size); std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); built_options.emplace("-Dfully_connected_width=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } @@ -71,19 +75,6 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_QUALCOMM_ADRENO"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, built_options, kernel)); @@ -113,15 +104,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, (*gws)[2] = static_cast(batch * output_blocks); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, (*gws)[0]); - kernel->setArg(idx++, (*gws)[1]); - kernel->setArg(idx++, (*gws)[2]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_3D_GWS_ARGS_PTR(kernel, *gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -154,12 +138,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); } - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } + OUT_OF_RANGE_VALIDATION(*kernel_error); MACE_CL_RET_STATUS(error); if (future != nullptr) { @@ -192,26 +171,16 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); if (kernel->get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(*kernel_error); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); built_options.emplace("-Dfully_connected=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - *kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR((*kernel_error)->Allocate(1)); - (*kernel_error)->Map(nullptr); - *((*kernel_error)->mutable_data()) = 0; - (*kernel_error)->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } switch (activation) { case NOOP: break; @@ -247,14 +216,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, }; uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel->setArg(idx++, - *(static_cast((*kernel_error)->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel->setArg(idx++, (*gws)[0]); - kernel->setArg(idx++, (*gws)[1]); - } + OUT_OF_RANGE_SET_ARG_PTR; + SET_2D_GWS_ARGS_PTR(kernel, *gws); kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image())); if (bias != nullptr) { @@ -276,13 +239,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - (*kernel_error)->Map(nullptr); - char *kerror_code = (*kernel_error)->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - (*kernel_error)->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } } // namespace diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 6d882352df9ef0790d68a814ba772d997aa18ebf..6ef80c80d1e21b9e8a3c0e93b1721d50ccc46d00 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -180,22 +180,6 @@ std::vector FormatBufferShape( } } -std::vector CalWinogradShape(const std::vector &shape, - const BufferType type, - const int wino_blk_size) { - if (type == WINOGRAD_FILTER) { - return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]}; - } else if (type == IN_OUT_HEIGHT) { - index_t out_width = - shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) * - ((shape[2] + wino_blk_size - 1) / wino_blk_size); - return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width}; - } else { - LOG(FATAL) << "Mace not supported yet."; - return std::vector(); - } -} - std::string DtToCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: @@ -220,7 +204,7 @@ std::string DtToCLCMDDt(const DataType dt) { } } -std::string DtToUpstreamCLDt(const DataType dt) { +std::string DtToUpCompatibleCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: case DT_HALF: @@ -231,7 +215,7 @@ std::string DtToUpstreamCLDt(const DataType dt) { } } -std::string DtToUpstreamCLCMDDt(const DataType dt) { +std::string DtToUpCompatibleCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: case DT_HALF: @@ -357,7 +341,8 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, double elapse_time = timer->AccumulatedMicros(); timer->ClearTiming(); uint32_t num_blocks = std::min( - static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[2]); + static_cast(elapse_time / kMaxKernelExecTime) + 1, + gws[2]); uint32_t block_size = gws[2] / num_blocks; if (!runtime->IsNonUniformWorkgroupsSupported()) { block_size = RoundUp(block_size, params[2]); @@ -465,7 +450,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, double elapse_time = timer->AccumulatedMicros(); timer->ClearTiming(); uint32_t num_blocks = std::min( - static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[1]); + static_cast(elapse_time / kMaxKernelExecTime) + 1, + gws[1]); uint32_t block_size = gws[1] / num_blocks; if (!runtime->IsNonUniformWorkgroupsSupported()) { block_size = RoundUp(block_size, params[1]); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5db95e3d8be30928e743ba8f9e45f2f57153075e..22d9f1cc548c8691b313db12f6693a86bdbf957b 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -15,7 +15,9 @@ #ifndef MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_ +#include #include +#include #include #include "mace/core/future.h" @@ -23,26 +25,71 @@ #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/types.h" +#include "mace/kernels/opencl/common.h" #include "mace/utils/utils.h" namespace mace { namespace kernels { -const float kMaxKernelExeTime = 1000.0; // microseconds +#define OUT_OF_RANGE_CONFIG(kernel_error) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ + (kernel_error) = std::move(std::unique_ptr( \ + new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ + MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ + (kernel_error)->Map(nullptr); \ + *((kernel_error)->mutable_data()) = 0; \ + (kernel_error)->UnMap(); \ + } -const int32_t kBaseGPUMemCacheSize = 16384; +#define OUT_OF_RANGE_SET_ARG \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + kernel_.setArg(idx++, \ + *(static_cast(kernel_error_->buffer()))); \ + } + +#define OUT_OF_RANGE_SET_ARG_PTR \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + kernel->setArg(idx++, \ + *(static_cast((*kernel_error)->buffer()))); \ + } + +#define OUT_OF_RANGE_VALIDATION(kernel_error) \ + if (runtime->IsOutOfRangeCheckEnabled()) { \ + (kernel_error)->Map(nullptr); \ + char *kerror_code = (kernel_error)->mutable_data(); \ + MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;\ + (kernel_error)->UnMap(); \ + } + +#define NON_UNIFORM_WG_CONFIG \ + if (runtime->IsNonUniformWorkgroupsSupported()) { \ + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \ + } + +#define SET_3D_GWS_ARGS(kernel) \ + kernel.setArg(idx++, gws[0]); \ + kernel.setArg(idx++, gws[1]); \ + kernel.setArg(idx++, gws[2]); -enum BufferType { - CONV2D_FILTER = 0, - IN_OUT_CHANNEL = 1, - ARGUMENT = 2, - IN_OUT_HEIGHT = 3, - IN_OUT_WIDTH = 4, - WINOGRAD_FILTER = 5, - DW_CONV2D_FILTER = 6, - WEIGHT_HEIGHT = 7, - WEIGHT_WIDTH = 8, -}; +#define SET_2D_GWS_ARGS(kernel) \ + kernel.setArg(idx++, gws[0]); \ + kernel.setArg(idx++, gws[1]); + +#define SET_3D_GWS_ARGS_PTR(kernel, gws) \ + kernel->setArg(idx++, (gws)[0]); \ + kernel->setArg(idx++, (gws)[1]); \ + kernel->setArg(idx++, (gws)[2]); + +#define SET_2D_GWS_ARGS_PTR(kernel, gws) \ + kernel->setArg(idx++, (gws)[0]); \ + kernel->setArg(idx++, (gws)[1]); + +// Max execution time of OpenCL kernel for tuning to prevent UI stuck. +const float kMaxKernelExecTime = 1000.0; // microseconds + +// Base GPU cache size used for computing local work group size. +const int32_t kBaseGPUMemCacheSize = 16384; void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, @@ -53,41 +100,35 @@ std::vector FormatBufferShape( const std::vector &buffer_shape, const BufferType type); -std::vector CalWinogradShape(const std::vector &shape, - const BufferType type, - const int wino_blk_size = 2); - +// CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); -std::string DtToUpstreamCLCMDDt(const DataType dt); +// CPU data type to upward compatible OpenCL command data type +// e.g. half -> float +std::string DtToUpCompatibleCLCMDDt(const DataType dt); +// CPU data type to OpenCL data type std::string DtToCLDt(const DataType dt); -std::string DtToUpstreamCLDt(const DataType dt); +// CPU data type to upward compatible OpenCL data type +// e.g. half -> float +std::string DtToUpCompatibleCLDt(const DataType dt); +// Tuning or Run OpenCL kernel with 3D work group size MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); +// Tuning or Run OpenCL kernel with 2D work group size MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); -inline void SetFuture(StatsFuture *future, const cl::Event &event) { - if (future != nullptr) { - future->wait_fn = [event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - OpenCLRuntime::Global()->GetCallStats(event, stats); - } - }; - } -} - +// Check whether limit OpenCL kernel time flag open. inline bool LimitKernelTime() { const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index c8635e1acf39ff6bd4896e22d63cde7e949d4b23..955b9ebebd3fcb1d3bc48f04de7617e5b10e43cb 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -13,9 +13,10 @@ // limitations under the License. #include "mace/kernels/image_to_buffer.h" -#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" + namespace mace { namespace kernels { @@ -68,80 +69,71 @@ MaceStatus ImageToBufferFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } - if (buffer->dtype() == image->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - if (!kernel_error_) { - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (buffer->dtype() == image->dtype()) { + built_options.emplace( + "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToCLCMDDt(DataTypeToEnum::value)); + } else { + built_options.emplace("-DDATA_TYPE=" + + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel_)); } - cl::Kernel b2f_kernel; - MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &b2f_kernel)); - uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); - } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); - if (type == CONV2D_FILTER) { - const index_t inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, static_cast(inner_size)); - } else if (type == ARGUMENT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - } else if (type == WEIGHT_HEIGHT) { - b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - } else { - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[1])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[2])); - b2f_kernel.setArg(idx++, static_cast(formatted_buffer_shape[3])); + if (!IsVecEqual(input_shape_, image->shape())) { + uint32_t idx = 0; + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); + kernel_.setArg(idx++, *(buffer->opencl_buffer())); + if (type == CONV2D_FILTER) { + const index_t + inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3); + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + } else if (type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(buffer->dim(0))); + kernel_.setArg(idx++, static_cast(buffer->dim(1))); + kernel_.setArg(idx++, static_cast(buffer->dim(2))); + kernel_.setArg(idx++, static_cast(buffer->dim(3))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(image->opencl_image())); + input_shape_ = image->shape(); } - b2f_kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; cl_int error; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -150,16 +142,11 @@ MaceStatus ImageToBufferFunctor::operator()( } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index f9825eb6da38b8848b976c5a78caae974c90cc9d..7e5e52b43614eb51eb7b181c92ef4aea0f8ff612 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -57,23 +57,13 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, built_options, &kernel_)); @@ -81,14 +71,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(B->opencl_image())); kernel_.setArg(idx++, *(C->opencl_image())); @@ -98,18 +82,12 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(height_blocks)); kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); - const std::vector lws = {kwg_size_ / 64, 64, 0}; + const std::vector lws = {kwg_size_ / 64, 64, 1}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 2497a4dffa5d61d7fff800f163c98e176145c06e..d257fea2d7fca9333c8d997e7703f53345feba2a 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -40,9 +40,8 @@ bool BufferToImageOpImpl(Tensor *buffer, std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + OUT_OF_RANGE_CONFIG(kernel_error); + NON_UNIFORM_WG_CONFIG; if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); @@ -50,57 +49,46 @@ bool BufferToImageOpImpl(Tensor *buffer, DtToCLCMDDt(DataTypeToEnum::value)); } else { built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error->Allocate(1)); - kernel_error->Map(nullptr); - *(kernel_error->mutable_data()) = 0; - kernel_error->UnMap(); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); + built_options.emplace( + "-DCMD_DATA_TYPE=" + + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); } - cl::Kernel b2f_kernel; - + cl::Kernel kernel; cl_int error = runtime->BuildKernel("buffer_to_image", obfuscated_kernel_name, - built_options, &b2f_kernel); + built_options, + &kernel); if (error != CL_SUCCESS) { return false; } uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { - b2f_kernel.setArg(idx++, - *(static_cast(kernel_error->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - b2f_kernel.setArg(idx++, gws[0]); - b2f_kernel.setArg(idx++, gws[1]); + kernel.setArg(idx++, + *(static_cast(kernel_error->buffer()))); } - b2f_kernel.setArg(idx++, *(buffer->opencl_buffer())); + SET_2D_GWS_ARGS(kernel); + kernel.setArg(idx++, *(buffer->opencl_buffer())); MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); - b2f_kernel.setArg(idx++, + kernel.setArg(idx++, static_cast(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype()))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(1))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(2))); - b2f_kernel.setArg(idx++, static_cast(buffer->dim(3))); - b2f_kernel.setArg(idx++, *(image->opencl_image())); + kernel.setArg(idx++, static_cast(buffer->dim(1))); + kernel.setArg(idx++, static_cast(buffer->dim(2))); + kernel.setArg(idx++, static_cast(buffer->dim(3))); + kernel.setArg(idx++, *(image->opencl_image())); const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(b2f_kernel)); + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); const std::vector lws = {16, kwg_size / 16}; cl::Event event; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } else { std::vector roundup_gws(lws.size()); @@ -109,7 +97,7 @@ bool BufferToImageOpImpl(Tensor *buffer, } error = runtime->command_queue().enqueueNDRangeKernel( - b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } if (error != CL_SUCCESS) { diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 1a8879e8addda96b2ddbe0aaec174ded7d906724..04e9d69d4aaf8f7a81f2deee644e80cdc4988145 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -51,23 +51,13 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); built_options.emplace("-Dpad=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, built_options, &kernel_)); @@ -81,15 +71,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, this->constant_value_); @@ -108,13 +91,7 @@ MaceStatus PadFunctor::operator()(const Tensor *input, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 405c05a1a00c52188da55069ecf6575c3088e0d2..7111317a8419d0fdf9b2518102495c66bc4d8bdb 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -59,6 +59,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); built_options.emplace("-Dpooling=" + kernel_name); @@ -67,24 +69,12 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); } if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name, built_options, @@ -130,15 +120,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, }; uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(input->dim(1))); kernel_.setArg(idx++, static_cast(input->dim(2))); @@ -171,13 +154,7 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean.cc similarity index 81% rename from mace/kernels/opencl/reduce_mean_opencl.cc rename to mace/kernels/opencl/reduce_mean.cc index 266a111111e4b3a781729fcbbd3bb0acdf8a7e30..075632c554323d591ab614c45d55717b9bcc44ad 100644 --- a/mace/kernels/opencl/reduce_mean_opencl.cc +++ b/mace/kernels/opencl/reduce_mean.cc @@ -50,26 +50,15 @@ MaceStatus ReduceMeanFunctor::operator()( if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); built_options.emplace("-Dreduce_mean=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_QUALCOMM_ADRENO"); } - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", kernel_name, built_options, @@ -96,15 +85,8 @@ MaceStatus ReduceMeanFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, (group_size * 4 * sizeof(T)), nullptr); @@ -140,12 +122,7 @@ MaceStatus ReduceMeanFunctor::operator()( cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index c5b8b65bd16bc4dad80fd3602bd6a94401a63077..0b297dd22dae97f3be1fdf881a42118acf03169c 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -74,23 +74,13 @@ MaceStatus ResizeBilinearFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("resize_bilinear", kernel_name, @@ -115,15 +105,8 @@ MaceStatus ResizeBilinearFunctor::operator()( CalculateResizeScale(in_width, out_width, align_corners_); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, height_scale); @@ -142,13 +125,7 @@ MaceStatus ResizeBilinearFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 726bcae92b8ffba562cfb5cdf0b2badd98941c39..b778e0d70aa16f3aa6141a6ed3198a6caec188cf 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -44,23 +44,13 @@ MaceStatus SliceFunctor::operator()( if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", kernel_name, built_options, @@ -81,15 +71,8 @@ MaceStatus SliceFunctor::operator()( CallStats call_stats{INT64_MAX, 0}; for (size_t i = 0; i < outputs_count; ++i) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, static_cast(channel_blk * i)); kernel_.setArg(idx++, *(output_list[i]->opencl_image())); @@ -111,12 +94,7 @@ MaceStatus SliceFunctor::operator()( cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CL_RET_STATUS(error); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); if (future != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 0785548849dfa3e9af3c7c6807bb16c07cb3eab6..f401b827096189156c184f348f0017ede7dce13f 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -82,23 +82,13 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, if (kernel_.get() == nullptr) { std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, built_options, &kernel_)); @@ -107,15 +97,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, } if (!IsVecEqual(input_shape_, logits->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(logits->opencl_image())); kernel_.setArg(idx++, static_cast(channels)); kernel_.setArg(idx++, remain_channels); @@ -130,13 +113,7 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index 3606d91d5badf6d71f21f98135ddaec7e95e338f..c31b2d691f5fccd72faa75f35ce88f034bd7900f 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -59,24 +59,14 @@ MaceStatus SpaceToBatchFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", obfuscated_kernel_name, built_options, @@ -87,15 +77,8 @@ MaceStatus SpaceToBatchFunctor::operator()( } if (!IsVecEqual(space_shape_, space_tensor->shape())) { uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - kernel_.setArg(idx++, gws[2]); - } + OUT_OF_RANGE_SET_ARG; + SET_3D_GWS_ARGS(kernel_); if (b2s_) { kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, *(space_tensor->opencl_image())); @@ -123,13 +106,7 @@ MaceStatus SpaceToBatchFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 6cc8e08cd0946356533eb6623d6aefa490c66c83..dd6d16d6351810c544263aadf5e0a7abbe24fcb3 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -29,6 +29,8 @@ MaceStatus WinogradTransformFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); @@ -44,21 +46,9 @@ MaceStatus WinogradTransformFunctor::operator()( return MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options, @@ -107,14 +97,8 @@ MaceStatus WinogradTransformFunctor::operator()( MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg(idx++, *(input_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); @@ -139,13 +123,7 @@ MaceStatus WinogradTransformFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } - + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } @@ -160,6 +138,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; + OUT_OF_RANGE_CONFIG(kernel_error_); + NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); @@ -176,21 +156,9 @@ MaceStatus WinogradInverseTransformFunctor::operator()( } built_options.emplace("-DDATA_TYPE=" + - DtToUpstreamCLDt(DataTypeToEnum::value)); + DtToUpCompatibleCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (runtime->IsOutOfRangeCheckEnabled()) { - built_options.emplace("-DOUT_OF_RANGE_CHECK"); - kernel_error_ = std::move(std::unique_ptr( - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); - MACE_RETURN_IF_ERROR(kernel_error_->Allocate(1)); - kernel_error_->Map(nullptr); - *(kernel_error_->mutable_data()) = 0; - kernel_error_->UnMap(); - } - if (runtime->IsNonUniformWorkgroupsSupported()) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - } + DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation_) { case NOOP: @@ -240,14 +208,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( const float round_w_r = 1.f / static_cast(round_w); uint32_t idx = 0; - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_.setArg(idx++, - *(static_cast(kernel_error_->buffer()))); - } - if (!runtime->IsNonUniformWorkgroupsSupported()) { - kernel_.setArg(idx++, gws[0]); - kernel_.setArg(idx++, gws[1]); - } + OUT_OF_RANGE_SET_ARG; + SET_2D_GWS_ARGS(kernel_); kernel_.setArg( idx++, *(static_cast(input_tensor->opencl_image()))); @@ -275,12 +237,7 @@ MaceStatus WinogradInverseTransformFunctor::operator()( MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future)); - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); - } + OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; } diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 37e3b4ac91bcda2c4c606e4c25c8dde9cf6914ee..e8da144df242dac95f8360cfa0897483b3bb8476 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -28,7 +28,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/kernels/opencl/helper.h" +#include "mace/kernels/opencl/common.h" #include "mace/ops/ops_register.h" #include "mace/utils/utils.h"