diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 0728d5f0905af0b367fb77df4b986e3f9256ff74..a592b6fa119239ef09c293e602caa29fd66e8d23 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -323,6 +323,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, const char *kernel_path = getenv("MACE_KERNEL_PATH"); this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; + + const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK"); + if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1 + && out_of_range_check[0] == '1') { + this->out_of_range_check_ = true; + } else { + this->out_of_range_check_ = false; + } } OpenCLRuntime::~OpenCLRuntime() { @@ -505,4 +513,8 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName( } } +const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const { + return out_of_range_check_; +} + } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 3f5261b860bf43a91867214b476edab4ff009e50..8b29f145784a7b25ff7206b4a0bcbeaf1533475d 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -70,6 +70,7 @@ class OpenCLRuntime { cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); + const bool IsOutOfRangeCheckEnabled() const; private: OpenCLRuntime(GPUPerfHint, GPUPriorityHint); @@ -94,6 +95,7 @@ class OpenCLRuntime { std::string kernel_path_; GPUType gpu_type_; std::string opencl_version_; + bool out_of_range_check_; static GPUPerfHint gpu_perf_hint_; static GPUPriorityHint gpu_priority_hint_; diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 5130ccdcb1675e46a653e30634e379f1b0898769..67b01458bdabb60dd56797a4912a18f6386944ae 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -6,6 +6,7 @@ #define MACE_KERNELS_ACTIVATION_H_ #include +#include #include #include @@ -165,6 +166,7 @@ class ActivationFunctor { T relux_max_limit_; cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::string tuning_key_prefix_; std::vector input_shape_; }; diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 61e906bf575f408f7425b6d80414f75008ddec57..7538b5b1681e05cdd507277915e9b746461b7d4a 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -9,6 +9,7 @@ #include #endif #include +#include #include #include "mace/core/future.h" @@ -85,6 +86,7 @@ struct AddNFunctor { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 0b8ae31735e25dbd6b0014878dd5d84f5db638ed..c0aab3d958170c9411bdad9d0077a4fc8e0cb8e9 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -8,6 +8,7 @@ #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #include #endif +#include #include #include "mace/core/future.h" @@ -165,6 +166,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index a3d1ff61b1c0fa0be7c7eb0506c495fc1331476c..ab2a35f1ec57f0712966d6367f6c24f24d3844c7 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -5,6 +5,7 @@ #ifndef MACE_KERNELS_BIAS_ADD_H_ #define MACE_KERNELS_BIAS_ADD_H_ +#include #include #include "mace/core/future.h" @@ -65,6 +66,7 @@ struct BiasAddFunctor { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index 2956762d5d70fb089e8e2bee34f114693fb1cc12..ff6cebb206e7079cbd2f80da610bbf24908f1e30 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -5,6 +5,8 @@ #ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_ #define MACE_KERNELS_BUFFER_TO_IMAGE_H_ +#include + #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" @@ -13,8 +15,10 @@ namespace mace { namespace kernels { struct BufferToImageFunctorBase { - explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {} + explicit BufferToImageFunctorBase(bool i2b) + : i2b_(i2b), kernel_error_(nullptr) {} bool i2b_; + std::unique_ptr kernel_error_; }; template diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 69332e0182e4623a7ffb9e1e87fd05fbd9ca5b75..258cd39af9a8c8165b3c99a4266c86d9957fd907 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -5,6 +5,7 @@ #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_ #define MACE_KERNELS_CHANNEL_SHUFFLE_H_ +#include #include #include "mace/core/future.h" @@ -57,6 +58,7 @@ struct ChannelShuffleFunctor { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; const int groups_; std::vector input_shape_; }; diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 6c803b5f2a32544cac7d79145b691b3166faeb9b..88c6cdb862e69cca62ca53645c7193a747b47ec9 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -5,6 +5,7 @@ #ifndef MACE_KERNELS_CONCAT_H_ #define MACE_KERNELS_CONCAT_H_ +#include #include #include "mace/core/future.h" @@ -86,6 +87,7 @@ struct ConcatFunctor : ConcatFunctorBase { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 6ef35d29f7a7146e8548d31ecabf00f9ce77b4ae..6833b9b66449f70f38ef56779722fbd67b06eaa7 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -9,6 +9,7 @@ #include #endif #include +#include #include #include "mace/core/future.h" @@ -468,6 +469,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h index 07e03e7ff2d77cf0b907c9d88ee5bff221a96f80..e33528113394509fd976e2e825568df96806d6cf 100644 --- a/mace/kernels/cwise.h +++ b/mace/kernels/cwise.h @@ -115,6 +115,7 @@ struct CWiseFunctor : CWiseFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 6d30673976df6399351ad50c871d2f9e578cea22..648a66d9448941aa71a820cc36b2b39cc80d7910 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -109,6 +109,7 @@ struct DepthToSpaceOpFunctor { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; const int block_size_; bool d2s_; std::vector input_shape_; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 7f4f2021f134dbcb2ffa9880a7c3e4c5a2ad7ddb..8767e8ab57b6fcb86f58d3083bcef1e1d45965fd 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -454,6 +454,7 @@ struct DepthwiseConv2dFunctor cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 423a8f9fdf59abba9f3c92acdcd5aa0ba5ca40f2..13adae8cde402b5ff3db0eefaec030ce1c8c7aa9 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -105,6 +105,7 @@ struct EltwiseFunctor : EltwiseFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 3c21fba7ba827ab78ac81467886672262bf8d391..56acf6b64ab5996e1a3af1ce062c6fe8446be8e4 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -107,6 +107,7 @@ struct FullyConnectedFunctor : FullyConnectedBase { std::vector gws_; std::vector lws_; std::vector input_shape_; + std::unique_ptr kernel_error_; }; } // namespace kernels diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 28db80c0cf8896a6fd48143a8aa4167b0a04e1cd..f95ce73fd06ffe8ad5513ff6463f2fa5bd5bb0f9 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -68,6 +68,7 @@ struct MatMulFunctor { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 368a78bc9594da81e84831136fbfcfe93a4f295f..3ff7ddc4f9b83ff7b3b2cc58f43a1bdf2bd436a6 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -33,6 +33,14 @@ void ActivationFunctor::operator()(const Tensor *input, auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -72,6 +80,10 @@ void ActivationFunctor::operator()(const Tensor *input, if (!IsVecEqual(input_shape_, input->shape())) { int idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -93,6 +105,13 @@ void ActivationFunctor::operator()(const Tensor *input, Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct ActivationFunctor; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index dc2aabeb949f9c6e54b41ebc8c2eb1324f1e4694..f580ab58af40d8650d91b0eef76bc57686a30ceb 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -45,6 +45,14 @@ void AddNFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -71,6 +79,10 @@ void AddNFunctor::operator()( output_tensor->ResizeImage(output_shape, output_image_shape); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -88,6 +100,13 @@ void AddNFunctor::operator()( ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] << "_" << output_shape[2] << "_" << output_shape[3]; TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct AddNFunctor; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 09be320fc680cb0e34306762a4672cd774f1d5ea..b5ae499c348689a724761581fc31654229aec3bd 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -36,7 +36,6 @@ void BatchNormFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; @@ -44,6 +43,14 @@ void BatchNormFunctor::operator()(const Tensor *input, built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -76,6 +83,10 @@ void BatchNormFunctor::operator()(const Tensor *input, } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -100,6 +111,13 @@ void BatchNormFunctor::operator()(const Tensor *input, Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct BatchNormFunctor; diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index 684474be64632d47bd0b4ae488c22bf687b5146b..9612850a7deee5332bb9d1ac2ef6aede79f6f4cd 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -36,6 +36,14 @@ void BiasAddFunctor::operator()(const Tensor *input, built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -46,6 +54,10 @@ void BiasAddFunctor::operator()(const Tensor *input, } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -77,6 +89,12 @@ void BiasAddFunctor::operator()(const Tensor *input, cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } MACE_CHECK_CL_SUCCESS(error); + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index f652696f64b70444e41a5e50c793e7b093fe91c3..b2d418f8c38b8bf6b543121ea0a2cbe7711ee122 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -13,6 +13,7 @@ template void BufferToImageFunctor::operator()( Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { std::vector image_shape; + if (!i2b_) { CalImage2DShape(buffer->shape(), type, &image_shape); if (type == WINOGRAD_FILTER) { @@ -80,10 +81,25 @@ void BufferToImageFunctor::operator()( built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + if (!kernel_error_) { + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } + } + auto b2f_kernel = runtime->BuildKernel("buffer_to_image", obfuscated_kernel_name, built_options); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + b2f_kernel.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { b2f_kernel.setArg(idx++, gws[0]); b2f_kernel.setArg(idx++, gws[1]); @@ -135,6 +151,12 @@ void BufferToImageFunctor::operator()( cl::NDRange(lws[0], lws[1]), nullptr, &event); } MACE_CHECK_CL_SUCCESS(error); + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 59f2c9518061a7e1e6f0e8071a18cec699814be4..424685b5b65610a3d0a0e732743f39336cf4688d 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -43,6 +43,14 @@ void ChannelShuffleFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -55,6 +63,10 @@ void ChannelShuffleFunctor::operator()( if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -76,6 +88,13 @@ void ChannelShuffleFunctor::operator()( << output->dim(2) << "_" << output->dim(3); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl index 2978f4022e67ffa13b3e318bcd75490ae66a8d1b..a3aa0be70d27105253cd112a136ad2fd12da15c8 100644 --- a/mace/kernels/opencl/cl/activation.cl +++ b/mace/kernels/opencl/cl/activation.cl @@ -1,6 +1,7 @@ #include -__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void activation(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, #ifdef USE_PRELU __read_only image2d_t alpha, @@ -29,6 +30,9 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3 #else DATA_TYPE4 out = do_activation(in, relux_max_limit); #endif + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(pos, hb), out); } - diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index 30f52247f22e95718239e60e956bf856eba65f39..7d2d43233d14486cf4c8f3b154526e02f7e7467a 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -1,6 +1,7 @@ #include -__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void addn(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #if INPUT_NUM > 2 @@ -31,6 +32,9 @@ __kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2 out = out + in3; #endif +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(w, hb), out); } diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 290b6c1a12216d0771bcfb65b6c81660e4e59833..f3ba9accecf093af396a157cb528c078f5f956a2 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -1,6 +1,7 @@ #include // Supported data types: half/float -__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void batch_norm(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, @@ -48,5 +49,8 @@ __kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3 out = do_activation(out, relux_max_limit); #endif +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index 64de2d77ff8371c6acafcea1bb4afdc00b105a8d..77cf7e7e5dcfcf5239c8d57e8f57f768e637837c 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -1,6 +1,7 @@ #include // Supported data types: half/float -__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void bias_add(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t output) { @@ -22,5 +23,9 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3 DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 out = in + bias_value; + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index e300bc51e2e429bd2c232068cba976269a2081d6..4a4b068ed7857ace604f854b3cfbb11b96a6ac05 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,6 +1,7 @@ #include -__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, oc, ic */ __private const int input_offset, __private const int filter_h, @@ -49,10 +50,14 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } -__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* h, w, oc, ic */ __private const int filter_h, __private const int filter_w, @@ -100,7 +105,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 } } -__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* h, w, ic, m */ __private const int input_offset, __private const int filter_w, @@ -154,10 +160,14 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } -__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -195,10 +205,14 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 values = vload4(0, input + offset); } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } -__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int height, __private const int width, @@ -237,7 +251,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 } } -__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int count, @@ -269,10 +284,14 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 values = vload4(0, input + offset); } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } -__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, /* nhwc */ __private const int count, __read_only image2d_t input) { @@ -305,7 +324,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 } -__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //nhwc __private const int input_offset, __private const int height, @@ -344,10 +364,14 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 values.x = *(input + offset); } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } -__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //nhwc __private const int height, __private const int width, @@ -385,7 +409,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 } -__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, /* nhwc */ __private const int input_offset, __private const int height, @@ -423,11 +448,15 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 values.x = *(input + offset); } int2 coord = (int2)(w, h); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, h, kernel_error); +#endif WRITE_IMAGET(output, coord, values); } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, __private const int in_channels, @@ -495,6 +524,11 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 tu3[1] = tt + tu3[1] / 2; int2 coord = (int2)(w, h); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, coord.x, coord.y + out_channels * 15, kernel_error); +#endif + #pragma unroll for (short i = 0; i < 4; ++i) { WRITE_IMAGET(output, coord, tu0[i]); @@ -518,7 +552,8 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2 } // only support 3x3 now -__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, __private const int width, diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl index 92ff94473d7964925f67bfc3517194dc1111a4ee..a73ab8c54c2505cb7e01ee661f394d8d1bc61728 100644 --- a/mace/kernels/opencl/cl/channel_shuffle.cl +++ b/mace/kernels/opencl/cl/channel_shuffle.cl @@ -1,7 +1,8 @@ #include // assume channes_per_group mod 4 = 0 && groups mod 4 == 0 -__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void channel_shuffle(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int groups, __private const int channels_per_group, @@ -49,6 +50,11 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3 out_chan_data3 = (DATA_TYPE4)(in_chan_data0.w, in_chan_data1.w, in_chan_data2.w, in_chan_data3.w); int out_x = mad24(mad24(group_chan_blk_idx, groups, g_blk), width, width_idx); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x + groups_blks_width * 3, hb_idx, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data0); out_x += groups_blks_width; diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index b68bca075491fba94732c9ab998751284a232a55..b3054c940010cc32c1a8e87a58dceae4fab7e098 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -17,6 +17,7 @@ #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) + #ifndef NON_UNIFORM_WORK_GROUP #define GLOBAL_WORK_GROUP_SIZE_DIM2 \ @@ -34,6 +35,18 @@ #endif + +#ifdef OUT_OF_RANGE_CHECK + +#define KERNEL_ERROR_PARAMS \ + __global char *kernel_error, + +#else + +#define KERNEL_ERROR_PARAMS + +#endif + __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; @@ -61,4 +74,14 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in, return out; } +inline void check_out_of_range_for_image2d(__write_only image2d_t image, + __private const int x, + __private const int y, + global char *kernel_error) { + int2 image_dim = get_image_dim(image); + if (x >= image_dim.x || y >= image_dim.y) { + *kernel_error = '1'; + } +} + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 0e171e0f4dac7bbd41856e1a43518aacbb6ffa12..b1d5fb52c978269ec4066390a7f6c6dfae25a04e 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left, } // Supported data type: half/float -__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void concat_channel(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input0, __read_only image2d_t input1, __private const int input0_chan, @@ -79,11 +80,17 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3 } #endif - WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); + const int pos = mad24(chan_blk_idx, width, width_idx); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(pos, hb_idx), data); } // Required: All input channels are divisible by 4 -__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { @@ -106,7 +113,12 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3 SAMPLER, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); - WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data); + const int pos = mad24(chan_blk_idx + chan_blk_offset, width, width_idx); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(pos, hb_idx), data); } //__kernel void concat_width(__read_only image2d_t input0, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index f88885b06c7b6a4ac9efb91a950e6e94b4e54076..03a2b47e56b5752f71ef2bbbf8e68682e5033445 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,6 +1,7 @@ #include -__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void conv_2d(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */ #ifdef BIAS @@ -126,19 +127,32 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3 #endif const int out_x_base = mul24(out_ch_blk, out_width); + int w = out_w_blk; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index a5454a6700eeb78573994833f9fa1bc3bb0029ff..0a748925ea01215d71fbac7e3d23a30d2614250f 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,6 +1,7 @@ #include -__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ #ifdef BIAS @@ -104,17 +105,29 @@ __kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3 const int out_x_base = mul24(out_ch_blk, width); int out_x_idx = out_w_blk; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); out_x_idx += out_w_blks; if (out_x_idx >= width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); out_x_idx += out_w_blks; if (out_x_idx >= width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); out_x_idx += out_w_blks; if (out_x_idx >= width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + out_x_idx, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 19a636bb109d6b9fb04577548c0c72e0824f0e88..a6d07b8b6a54e0542471ca4e7de23f6d0eb196d6 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,6 +1,7 @@ #include -__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */ #ifdef BIAS @@ -135,30 +136,45 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3 const int out_x_base = mul24(out_ch_blk, out_width); int w = out_w_blk; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out4); diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl index 92cdaf7ea20ef7e77467a52b494b3c72506269c1..f5d96647d2a0db5efd42b2ae108bc6a913995d92 100644 --- a/mace/kernels/opencl/cl/cwise.cl +++ b/mace/kernels/opencl/cl/cwise.cl @@ -1,6 +1,7 @@ #include -__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void cwise(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __private const float value, __write_only image2d_t output) { @@ -43,5 +44,8 @@ __kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2 out.w = fabs(in0.w); #endif +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(w, hb), out); } diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl index 8d989290b6496bd8e1f4797f711e6002fbc189d8..f6e82b134d1c4e884051c7e6df8b0e816fcc0872 100644 --- a/mace/kernels/opencl/cl/depth_to_space.cl +++ b/mace/kernels/opencl/cl/depth_to_space.cl @@ -1,71 +1,87 @@ #include -__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void depth_to_space(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, - __private const int input_height, + __private const int input_hb, __private const int input_width, __private const int input_depth_blocks, - __private const int output_height, __private const int output_width, __private const int output_depth_blocks, __write_only image2d_t output) { const int out_d = get_global_id(0); const int out_w = get_global_id(1); - const int out_h = get_global_id(2); + const int out_hb = get_global_id(2); - if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width) +#ifndef NON_UNIFORM_WORK_GROUP + if (out_d >= global_size_dim0 || out_w >= global_size_dim1 + || out_hb >= global_size_dim2) { return; + } +#endif const int out_pos = mad24(out_d, output_width, out_w); - const int in_h = out_h / block_size; - const int offset_h = out_h % block_size; + const int in_hb = out_hb / block_size; + const int offset_h = out_hb % block_size; const int in_w = out_w / block_size; const int offset_w = out_w % block_size; const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks; const int in_d = out_d + offset_d; - if (in_h >= input_height || in_w >= input_width || in_d >= input_depth_blocks) + if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) { return; + } const int in_pos = mad24(in_d, input_width, in_w); - DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_h)); - WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb)); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_pos, out_hb, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); } -__kernel void space_to_depth( +__kernel void space_to_depth(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int block_size, - __private const int input_height, __private const int input_width, __private const int input_depth_blocks, - __private const int output_height, + __private const int output_hb, __private const int output_width, __private const int output_depth_blocks, __write_only image2d_t output) { - const int d = get_global_id(0); const int w = get_global_id(1); - const int h = get_global_id(2); + const int hb = get_global_id(2); - if (h >= input_height || w >= input_width || d >= input_depth_blocks) +#ifndef NON_UNIFORM_WORK_GROUP + if (d >= global_size_dim0 || w >= global_size_dim1 + || hb >= global_size_dim2) { return; + } +#endif const int in_pos = mad24(d, input_width, w); - const int out_h = h / block_size; - const int offset_h = h % block_size; + const int out_hb = hb / block_size; + const int offset_h = hb % block_size; const int out_w = w / block_size; const int offset_w = w % block_size; const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks; const int out_d = d + offset_d; - if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width) + if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { return; + } const int out_pos = mad24(out_d, output_width, out_w); - DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, h)); - WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data); + DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb)); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_pos, out_hb, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data); } diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl index c71ec4049ab6218c4d22ee446e371ba8c3622cab..932e37da204645a621067bb302cb06d4fd591175 100644 --- a/mace/kernels/opencl/cl/depthwise_conv2d.cl +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -1,7 +1,8 @@ #include // Only multiplier = 1 is supported -__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -122,22 +123,35 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3 const short out_x_base = mul24(out_ch_blk, out_width); short w = out_w_blk; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); w += out_w_blks; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } -__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ #ifdef BIAS @@ -247,17 +261,29 @@ __kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3 const short out_x_base = mul24(out_ch_blk, out_width); short w = out_w_blk; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); w += 1; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); w += 1; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); w += 1; if (w >= out_width) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl index 0b9647f50574c7522ce54631980cb21c01038361..81ba7d8718f42387280ff83be069a86870bff7c5 100644 --- a/mace/kernels/opencl/cl/eltwise.cl +++ b/mace/kernels/opencl/cl/eltwise.cl @@ -1,6 +1,7 @@ #include -__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void eltwise(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t input1, #ifdef COEFF_SUM @@ -36,5 +37,8 @@ __kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2 out = in0 - in1; #endif +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, w, hb, kernel_error); +#endif WRITE_IMAGET(output, (int2)(w, hb), out); } diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl index 3205e4921b4da3e44bbb5aa7cfa871ef3bad2ec1..b913986a06fbbdfbfcf0b0f5a6a2ce08252efe0d 100644 --- a/mace/kernels/opencl/cl/fully_connected.cl +++ b/mace/kernels/opencl/cl/fully_connected.cl @@ -1,7 +1,8 @@ #include // output = weight * input + bias -__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void fully_connected(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS @@ -58,11 +59,16 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2 #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) result = do_activation(result, relux_max_limit); #endif + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_blk_idx, batch_idx, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result); } // output = weight * input + bias -__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void fully_connected_width(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __read_only image2d_t weight, #ifdef BIAS @@ -147,6 +153,10 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3 #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) result = do_activation(result, relux_max_limit); #endif + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_blk_idx, batch_idx, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result); } } diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl index 82ccf6bad13a718f5777957fbebb44fd8e8b14df..1fa2b46f481d629f107959bf5f820cfdfc2a2f90 100644 --- a/mace/kernels/opencl/cl/matmul.cl +++ b/mace/kernels/opencl/cl/matmul.cl @@ -1,7 +1,8 @@ #include // C = A * B -__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void matmul(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t A, __read_only image2d_t B, __write_only image2d_t C, @@ -46,11 +47,27 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2 c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3)); } + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(C, gx, gy, kernel_error); +#endif WRITE_IMAGET(C, (int2)(gx, gy), c0); + if ((gx + 1) >= N) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(C, gx + 1, gy, kernel_error); +#endif WRITE_IMAGET(C, (int2)(gx + 1, gy), c1); + if ((gx + 2) >= N) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(C, gx + 2, gy, kernel_error); +#endif WRITE_IMAGET(C, (int2)(gx + 2, gy), c2); + if ((gx + 3) >= N) return; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(C, gx + 3, gy, kernel_error); +#endif WRITE_IMAGET(C, (int2)(gx + 3, gy), c3); } diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index 25785bb2a089eb4aed28e46fc434403fc365f1bc..c3abeb01c12441c4d5a01b3ea8c3d7753f050ec5 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size, } // Supported data type: half/float -__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void pooling(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int in_height, __private const int in_width, @@ -94,5 +95,9 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3 } #endif - WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res); + const int pos = mad24(out_chan_idx, out_width, out_width_idx); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, out_hb_idx, kernel_error); +#endif + WRITE_IMAGET(output, (int2)(pos, out_hb_idx), res); } diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 2b0464c70d2042908345a1b360af8c5a4d91a15c..091fb617c120b4c3449e248391ca1221a133f23f 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -1,6 +1,7 @@ #include -__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __write_only image2d_t output, __private const float height_scale, @@ -56,6 +57,10 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3 const int out_w_offset = mul24(ch_blk, out_width); const int out_h_offset = mul24(b, out_height); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_w_offset + w, out_h_offset + h, kernel_error); +#endif WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); } diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl index 0692c62b70f4ed0933bb94627c16413d2602d2ab..0116fd91da6f6be92dc1a97ba683f79e000f10cf 100644 --- a/mace/kernels/opencl/cl/slice.cl +++ b/mace/kernels/opencl/cl/slice.cl @@ -1,6 +1,7 @@ #include -__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void slice(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int chan_blk_offset, __write_only image2d_t output) { @@ -21,6 +22,11 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3 DATA_TYPE4 data = READ_IMAGET(input, SAMPLER, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx)); + + const int pos = mad24(chan_blk_idx, width, width_idx); +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error); +#endif WRITE_IMAGET(output, - (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); + (int2)(pos, hb_idx), data); } diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl index b5b99de6c0ef92064174a14e16014ddb22093eb9..ae434eb91c3a927ed1d7103da4cd5cf1b07cfa7c 100644 --- a/mace/kernels/opencl/cl/softmax.cl +++ b/mace/kernels/opencl/cl/softmax.cl @@ -1,6 +1,7 @@ #include -__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void softmax(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t input, __private const int channels, __private const int remain_channels, @@ -84,5 +85,8 @@ __kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3 data = native_exp(data) / sum; } +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error); +#endif WRITE_IMAGET(output, (int2)(pos, hb_idx), data); } diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl index 431a599705b7a522ff4e366c5c73a173b38d9673..dcf927787c9204bf04c379639032bd4a5a37953b 100644 --- a/mace/kernels/opencl/cl/space_to_batch.cl +++ b/mace/kernels/opencl/cl/space_to_batch.cl @@ -1,6 +1,7 @@ #include -__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void space_to_batch(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t space_data, __write_only image2d_t batch_data, __private const int block_height, @@ -44,10 +45,15 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3 DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord); int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(batch_data, batch_coord.x, batch_coord.y, kernel_error); +#endif WRITE_IMAGET(batch_data, batch_coord, value); } -__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 +__kernel void batch_to_space(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 __read_only image2d_t batch_data, __write_only image2d_t space_data, __private const int block_height, @@ -87,6 +93,10 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3 int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, space_b_idx * space_height + space_h_idx); + +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(space_data, space_coord.x, space_coord.y, kernel_error); +#endif WRITE_IMAGET(space_data, space_coord, value); } } diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index 0cab37d750510f1f7bedb02ceddfb49577e4ee31..aac11e97b9db161f33283430942ce8015582324a 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -1,6 +1,7 @@ #include -__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, __write_only image2d_t output, __private const int in_height, @@ -93,6 +94,9 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 input3[2] = tv3[2] - tv3[1]; input3[3] = tv3[1] - tv3[3]; +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, out_width_idx, chan_blk_idx + chan_blk_idx * 15, kernel_error); +#endif #pragma unroll for (short i = 0; i < 4; ++i) { WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]); @@ -115,7 +119,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 } } -__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 +__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 __read_only image2d_t input, #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ @@ -208,18 +213,30 @@ __kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2 in1[1] = do_activation(in1[1], relux_max_limit); #endif +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, coord_x, coord_y, kernel_error); +#endif WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]); t = 0; if (out_width_idx + 1 < out_width) { +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, coord_x + 1, coord_y, kernel_error); +#endif WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y), in0[1]); t += 1; } if (out_height_idx + 1 < out_height) { +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, coord_x, coord_y + 1, kernel_error); +#endif WRITE_IMAGET(output, (int2)(coord_x, coord_y + 1), in1[0]); t += 1; } if (t == 2) { +#ifdef OUT_OF_RANGE_CHECK + check_out_of_range_for_image2d(output, coord_x + 1, coord_y + 1, kernel_error); +#endif WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]); } diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index bccc8623997c7b356190ecc4818fc3394eb45d89..cf587e26f69b2cb930892e164153a9c0cbe153e7 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel, std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel, } if (!IsVecEqual(*prev_input_shape, input0->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel, ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } static void ConcatN(cl::Kernel *kernel, @@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel, const DataType dt, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel, built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel, }; uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel, ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" << batch * height; TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } } @@ -172,12 +212,12 @@ void ConcatFunctor::operator()( switch (inputs_count) { case 2: Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, - &input_shape_, output, future, &kwg_size_); + &input_shape_, output, future, &kwg_size_, &kernel_error_); break; default: if (divisible_four) { ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future, - &kwg_size_); + &kwg_size_, &kernel_error_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 5a002666320dfbfbea8263d1693bae53231da952..684de9eff434832dcc946d676667453c8cd8d0f9 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size); + uint32_t *kwg_size, + std::unique_ptr *kernel_error); extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, @@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size); + uint32_t *kwg_size, + std::unique_ptr *kernel_error); extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size); + uint32_t *kwg_size, + std::unique_ptr *kernel_error); template void Conv2dFunctor::operator()(const Tensor *input, @@ -65,7 +68,7 @@ void Conv2dFunctor::operator()(const Tensor *input, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, std::vector *input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size); + uint32_t *kwg_size, std::unique_ptr *kernel_error); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; @@ -106,12 +109,12 @@ void Conv2dFunctor::operator()(const Tensor *input, conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &kwg_size_); + &kwg_size_, &kernel_error_); } else { Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &kwg_size_); + &kwg_size_, &kernel_error_); } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 5cdf8e5608386ea7547f20d239e5126d69d50efc..ca1e3c1b29fa36bd3e497ae5e1151ce9e7107f51 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index d0f587452579a7dfe6dbd64d29fa02ab9bf73297..98990d06b665525ad0eea5d77d0851d7b91fe511 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index b9b2fec56a13dc55ee97649098f0674425ddcd4e..5b2fd1ff728d15dde31da4cbbe5ffd2473420050 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel, built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel, if (!IsVecEqual(*prev_input_shape, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel, Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } } // namespace kernels diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc index dce3d14d69f7a60f4a9cc928b7b2e12a1cbc8c73..bdb41bd8a93793118d2c7c54fe6f6f905d852f0b 100644 --- a/mace/kernels/opencl/cwise_opencl.cc +++ b/mace/kernels/opencl/cwise_opencl.cc @@ -34,6 +34,14 @@ void CWiseFunctor::operator()(const Tensor *input, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DCWISE_TYPE=", type_)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -44,6 +52,10 @@ void CWiseFunctor::operator()(const Tensor *input, } if (!IsVecEqual(input_shape_, input->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -59,6 +71,13 @@ void CWiseFunctor::operator()(const Tensor *input, ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct CWiseFunctor; diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 1c0624365c96eb19f08a22f9055d75834a4d6b72..9f853b8805125560041fcfedb24715d5aba0088b 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor::operator()( const char *kernel_name = nullptr; index_t output_height, output_width, output_depth; - if (d2s_) { - output_height = input_height * block_size_; + if (d2s_) { output_height = input_height * block_size_; output_width = input_width * block_size_; output_depth = input_depth / (block_size_ * block_size_); kernel_name = "depth_to_space"; @@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor::operator()( } uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); } kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(input_height)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth_blocks)); - kernel_.setArg(idx++, static_cast(output_height)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth_blocks)); + if (d2s_) { + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_height * batch)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, static_cast(output_depth_blocks)); + } else { + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth_blocks)); + kernel_.setArg(idx++, static_cast(output_height * batch)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, static_cast(output_depth_blocks)); + } kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); @@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor::operator()( const std::vector lws = {8, kwg_size_ / 64, 8, 1}; TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct DepthToSpaceOpFunctor; diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 5e0b99ba1e5a00e25eec931f209c0a28c8ed85b2..1dfd11bc259dd5ae7b512e9415c1897c3d315e4c 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel, std::vector *prev_input_shape, Tensor *output, StatsFuture *future, - uint32_t *kwg_size) { + uint32_t *kwg_size, + std::unique_ptr *kernel_error) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel, input_channels); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); @@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } template @@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor::operator()( DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, - &kwg_size_); + &kwg_size_, &kernel_error_); } template struct DepthwiseConv2dFunctor; diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index c23534bbce5ca423314d23bd470a5cbc2289ae1e..d6f3b0f15a754a64f8c64ecb429327b37a2a33cc 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -37,6 +37,14 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -48,6 +56,10 @@ void EltwiseFunctor::operator()(const Tensor *input0, } if (!IsVecEqual(input_shape_, input0->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -68,6 +80,12 @@ void EltwiseFunctor::operator()(const Tensor *input0, ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct EltwiseFunctor; diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc index 3178b8ae1125e1dcc6b5efd5daebe4fc73df2168..985c97072e24fda26ffc887bd0e10bba77dfb85c 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected_opencl.cc @@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel, std::vector *gws, std::vector *lws, const float relux_max_limit, - StatsFuture *future) { + StatsFuture *future, + std::unique_ptr *kernel_error) { MACE_CHECK(input->dim(3) % 4 == 0) << "FC width kernel only support input with 4x channel."; MACE_CHECK_NOTNULL(gws); @@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel, std::set built_options; auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); - kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); built_options.emplace("-Dfully_connected_width=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); @@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel, if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_QUALCOMM_ADRENO"); } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel, (*gws)[2] = static_cast(batch * output_blocks); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[1]); @@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event); } + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { @@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel, std::vector *gws, std::vector *lws, const float relux_max_limit, - StatsFuture *future) { + StatsFuture *future, + std::unique_ptr *kernel_error) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); auto runtime = OpenCLRuntime::Global(); @@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel, if (bias != nullptr) { built_options.emplace("-DBIAS"); } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + *kernel_error = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + (*kernel_error)->Map(nullptr); + *((*kernel_error)->mutable_data()) = '0'; + (*kernel_error)->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel, }; uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel->setArg(idx++, + *(static_cast((*kernel_error)->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel->setArg(idx++, (*gws)[0]); kernel->setArg(idx++, (*gws)[1]); @@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel, ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + (*kernel_error)->Map(nullptr); + char *kerror_code = (*kernel_error)->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + (*kernel_error)->UnMap(); + } } template @@ -246,10 +284,12 @@ void FullyConnectedFunctor::operator()( if (weight_type_ == BufferType::WEIGHT_HEIGHT) { FCWTXKernel(&kernel_, input, weight, bias, &input_shape_, output, - activation_, &gws_, &lws_, relux_max_limit_, future); + activation_, &gws_, &lws_, relux_max_limit_, future, + &kernel_error_); } else { FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, - activation_, &gws_, &lws_, relux_max_limit_, future); + activation_, &gws_, &lws_, relux_max_limit_, future, + &kernel_error_); } } diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 7a4822096309297d8c11e80c1f7eb6ea5069b48f..9fb80102d63d79fc319581a66e91042db4de0bd2 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -40,6 +40,14 @@ void MatMulFunctor::operator()(const Tensor *A, built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -49,6 +57,10 @@ void MatMulFunctor::operator()(const Tensor *A, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -67,6 +79,13 @@ void MatMulFunctor::operator()(const Tensor *A, ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct MatMulFunctor; diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 774fd5ee2ac60875491deb00f1ce4fcbef8ba97e..a2f8927cb7f8117cdd45b0195147c4dbbfc3b34f 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -37,6 +37,14 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -82,6 +90,10 @@ void PoolingFunctor::operator()(const Tensor *input, }; uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -117,6 +129,13 @@ void PoolingFunctor::operator()(const Tensor *input, ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct PoolingFunctor; diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index ac2733e9e511c8a32dee3371c9ed404be964cb90..002463077f755489bd8395b934238bc745db5c14 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -37,6 +37,14 @@ void ResizeBilinearFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -61,6 +69,10 @@ void ResizeBilinearFunctor::operator()( CalculateResizeScale(in_width, out_width, align_corners_); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -82,6 +94,13 @@ void ResizeBilinearFunctor::operator()( ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct ResizeBilinearFunctor; diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 850f08f6df5b091750d0b6ac203bf3e72e00099c..40a404d7835cc8766f96351481ee2bbe94838c7f 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -38,6 +38,14 @@ void SliceFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -64,6 +72,10 @@ void SliceFunctor::operator()( << outputs_count; for (int i = 0; i < outputs_count; ++i) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -74,6 +86,12 @@ void SliceFunctor::operator()( kernel_.setArg(idx++, *(output_list[i]->opencl_image())); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } } diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index ea4f0b3e13d2d023a9dc98691ce47cb269f97714..c66ed1c5fce2443b3909edf339f0db0c27a052a6 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -36,6 +36,14 @@ void SoftmaxFunctor::operator()(const Tensor *logits, auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -46,6 +54,10 @@ void SoftmaxFunctor::operator()(const Tensor *logits, } if (!IsVecEqual(input_shape_, logits->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -64,6 +76,13 @@ void SoftmaxFunctor::operator()(const Tensor *logits, ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct SoftmaxFunctor; diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index b4ae998a56f34cf3f23266e549f7346360ae4113..1effce99cb4892431e5a65b45e7cca83123868ee 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -47,6 +47,14 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -59,6 +67,10 @@ void SpaceToBatchFunctor::operator()( } if (!IsVecEqual(space_shape_, space_tensor->shape())) { uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -89,6 +101,13 @@ void SpaceToBatchFunctor::operator()( << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(3); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct SpaceToBatchFunctor; diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index f4fd5525960706e31ae3a303d00c50e534bfeaec..988aef111f6e4ad3f9a7e1c2fa57a3e3270a207c 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -26,6 +26,14 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -62,6 +70,10 @@ void WinogradTransformFunctor::operator()( output_tensor->ResizeImage(output_shape, image_shape); uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -85,6 +97,13 @@ void WinogradTransformFunctor::operator()( << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(3); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template @@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + if (runtime->IsOutOfRangeCheckEnabled()) { + built_options.emplace("-DOUT_OF_RANGE_CHECK"); + kernel_error_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); + kernel_error_->Map(nullptr); + *(kernel_error_->mutable_data()) = '0'; + kernel_error_->UnMap(); + } if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } @@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor::operator()( const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2; uint32_t idx = 0; + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_.setArg(idx++, + *(static_cast(kernel_error_->buffer()))); + } if (!runtime->IsNonUniformWorkgroupsSupported()) { kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); @@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor::operator()( << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" << input_tensor->dim(3); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == '0') << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } template struct WinogradTransformFunctor; diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index b5a5001271e78ff54c2fbf4efe3b541a08467390..22910b8d2e9e460c323ff5d25a6b7f95b66aef68 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -198,6 +198,7 @@ struct PoolingFunctor : PoolingFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 4e0c5ae3cb04def05794faab46cfc5dc90727e3c..212940489b98f5d31d3e3a3bfbadb5612890a0da 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -174,6 +174,7 @@ struct ResizeBilinearFunctor cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 1bde41e539b0aa9bd8b458261913a9957da2ec0a..cc4c0d295342a4f9c5e327805e090bc2d927180a 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -62,6 +62,7 @@ struct SliceFunctor { StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index 62cbcbd532938a3ae8854fec7a2d55123bc34e7e..ce8c820db79842741a3ab308044b461f202e0860 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -67,6 +67,7 @@ struct SoftmaxFunctor { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 4e0d4c591e9b6cfe8544c1bfc4d98936c921dee1..e8e10e504804d52a1703feb5443b4639128b3564 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -57,6 +57,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector space_shape_; }; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index e3169541dd197764d284d35c7eabbb61a2ba38b6..0087cefb4c3db4890ca2d610d705e7b205b9c7b1 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -52,6 +52,7 @@ struct WinogradTransformFunctor cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; @@ -110,6 +111,7 @@ struct WinogradInverseTransformFunctor cl::Kernel kernel_; uint32_t kwg_size_; + std::unique_ptr kernel_error_; std::vector input_shape_; }; diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 675a20dd79ac9a8349ad6514204b78168724e778..68896b980e8ea88b2b2462559c6ce09e757acd00 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -99,7 +99,8 @@ def main(unused_args): args=FLAGS.args, opencl_profiling=1, vlog_level=0, - device_bin_path="/data/local/tmp/mace") + device_bin_path="/data/local/tmp/mace", + out_of_range_check=1) globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi) if __name__ == "__main__": diff --git a/tools/sh_commands.py b/tools/sh_commands.py index d7e55e39d552f5c152568648069d40692eceb958..a021f9b7ddd48afa0da34a15528a1e0a5fc5879a 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -60,7 +60,8 @@ def adb_run(serialno, host_bin_path, bin_name, args="", opencl_profiling=1, vlog_level=0, - device_bin_path="/data/local/tmp/mace"): + device_bin_path="/data/local/tmp/mace", + out_of_range_check=1): host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) device_cl_path = "%s/cl" % device_bin_path @@ -77,8 +78,8 @@ def adb_run(serialno, host_bin_path, bin_name, stdout_buff=[] process_output = make_output_processor(stdout_buff) p = sh.adb("-s", serialno, "shell", - "MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % - (opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args), + "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % + (out_of_range_check, opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args), _out=process_output, _bg=True, _err_to_out=True) p.wait() return "".join(stdout_buff) diff --git a/tools/tuning_run.sh b/tools/tuning_run.sh index c4e8dbe99e403b6d067b4a802022f341e6a49141..28741d58dd05c53b9c4f9f6a1ad2e9079e790ee2 100644 --- a/tools/tuning_run.sh +++ b/tools/tuning_run.sh @@ -70,6 +70,7 @@ else ADB_CMD_STR="LD_LIBRARY_PATH=${PHONE_DATA_DIR} \ MACE_TUNING=${tuning_flag} \ + MACE_OUT_OF_RANGE_CHECK="1" \ MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \ MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \ MACE_KERNEL_PATH=$KERNEL_DIR \