diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 43f51c69ae086eb29015f92b1f26d2bcc203573f..f24333ac461e9cbff8708f4202ad0f7545e56eeb 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -322,6 +322,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
       }
     }
   }
+
+  const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
+  if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
+      && out_of_range_check[0] == '1') {
+    this->out_of_range_check_ = true;
+  } else {
+    this->out_of_range_check_ = false;
+  }
 }
 
 OpenCLRuntime::~OpenCLRuntime() {
@@ -578,4 +586,8 @@ const std::string OpenCLRuntime::ParseDeviceVersion(
   return words[1];
 }
 
+const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
+  return out_of_range_check_;
+}
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index f6cf4815ce7f2264fe1859f157258b55538095bc..c06564908e56ba794963a8a70314760f31b5afc0 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -73,6 +73,7 @@ class OpenCLRuntime {
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   const bool IsNonUniformWorkgroupsSupported();
+  const bool IsOutOfRangeCheckEnabled() const;
   const GPUType ParseGPUType(const std::string &device_name);
   const std::string ParseDeviceVersion(const std::string &device_version);
   void SaveBuiltCLProgram();
@@ -111,6 +112,7 @@ class OpenCLRuntime {
   std::mutex program_build_mutex_;
   GPUType gpu_type_;
   std::string opencl_version_;
+  bool out_of_range_check_;
   std::string platform_info_;
   bool program_map_changed_;
   std::unique_ptr<KVStorage> storage_;
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 877fa040624b1eae2d48569b8cd23379951495d8..c8ecb5b882224198a0c34b598f3f731a9cc5b2cf 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -20,6 +20,7 @@ cc_library(
         exclude = [
             "*_test.cc",
             "arm/*_test.cc",
+            "opencl/*_test.cc",
         ],
     ),
     hdrs = glob([
@@ -42,6 +43,7 @@ cc_test(
         [
             "*_test.cc",
             "arm/*_test.cc",
+            "opencl/*_test.cc",
         ],
     ),
     copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index c60f4a3bf130ea1754ae1876354bd5fd07071e56..e62181bc3ae743e2662b3a364537de6e1d8ed88d 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -6,6 +6,7 @@
 #define MACE_KERNELS_ACTIVATION_H_
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -171,6 +172,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   T relux_max_limit_;
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::string tuning_key_prefix_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 61e906bf575f408f7425b6d80414f75008ddec57..7538b5b1681e05cdd507277915e9b746461b7d4a 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -9,6 +9,7 @@
 #include <arm_neon.h>
 #endif
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -85,6 +86,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 0b8ae31735e25dbd6b0014878dd5d84f5db638ed..c0aab3d958170c9411bdad9d0077a4fc8e0cb8e9 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -8,6 +8,7 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -165,6 +166,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index a3d1ff61b1c0fa0be7c7eb0506c495fc1331476c..ab2a35f1ec57f0712966d6367f6c24f24d3844c7 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_BIAS_ADD_H_
 #define MACE_KERNELS_BIAS_ADD_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -65,6 +66,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h
index 2956762d5d70fb089e8e2bee34f114693fb1cc12..ff6cebb206e7079cbd2f80da610bbf24908f1e30 100644
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
 #define MACE_KERNELS_BUFFER_TO_IMAGE_H_
 
+#include <memory>
+
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/opencl/helper.h"
@@ -13,8 +15,10 @@ namespace mace {
 namespace kernels {
 
 struct BufferToImageFunctorBase {
-  explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
+  explicit BufferToImageFunctorBase(bool i2b)
+    : i2b_(i2b), kernel_error_(nullptr) {}
   bool i2b_;
+  std::unique_ptr<BufferBase> kernel_error_;
 };
 
 template <DeviceType D, typename T>
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index 69332e0182e4623a7ffb9e1e87fd05fbd9ca5b75..258cd39af9a8c8165b3c99a4266c86d9957fd907 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
 #define MACE_KERNELS_CHANNEL_SHUFFLE_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -57,6 +58,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   const int groups_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 6c803b5f2a32544cac7d79145b691b3166faeb9b..88c6cdb862e69cca62ca53645c7193a747b47ec9 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_CONCAT_H_
 #define MACE_KERNELS_CONCAT_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -86,6 +87,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 6ef35d29f7a7146e8548d31ecabf00f9ce77b4ae..6833b9b66449f70f38ef56779722fbd67b06eaa7 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -9,6 +9,7 @@
 #include <arm_neon.h>
 #endif
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -468,6 +469,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h
index 07e03e7ff2d77cf0b907c9d88ee5bff221a96f80..57beb64f9a9f13d099abc66ba137a5d0f0a29e1a 100644
--- a/mace/kernels/cwise.h
+++ b/mace/kernels/cwise.h
@@ -5,6 +5,7 @@
 #define MACE_KERNELS_CWISE_H_
 
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -115,6 +116,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 6d30673976df6399351ad50c871d2f9e578cea22..ab557c91e3101234585c85ba1b01d4b87060134c 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -4,6 +4,7 @@
 
 #ifndef MACE_KERNELS_DEPTH_TO_SPACE_H_
 #define MACE_KERNELS_DEPTH_TO_SPACE_H_
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -109,6 +110,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   const int block_size_;
   bool d2s_;
   std::vector<index_t> input_shape_;
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index 7f4f2021f134dbcb2ffa9880a7c3e4c5a2ad7ddb..cc6ceee507be8d9da0f367a50c69f8ab86d9760e 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -9,6 +9,7 @@
 #include <arm_neon.h>
 #endif
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -454,6 +455,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 423a8f9fdf59abba9f3c92acdcd5aa0ba5ca40f2..faf52ff31edf589c8fb6d89bc914a171f775b8d9 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -5,6 +5,7 @@
 #define MACE_KERNELS_ELTWISE_H_
 
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -105,6 +106,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index 3c21fba7ba827ab78ac81467886672262bf8d391..7f37b1735c3f4b20afbff19f7a0c6c663c5eacd7 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_FULLY_CONNECTED_H_
 #define MACE_KERNELS_FULLY_CONNECTED_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -107,6 +108,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
   std::vector<uint32_t> gws_;
   std::vector<uint32_t> lws_;
   std::vector<index_t> input_shape_;
+  std::unique_ptr<BufferBase> kernel_error_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 28db80c0cf8896a6fd48143a8aa4167b0a04e1cd..3cba36947f2c8d0facdbb33b7927dd86713f21c3 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -9,9 +9,10 @@
 #include <arm_neon.h>
 #endif
 
+#include <algorithm>
+#include <memory>
 #include <string>
 #include <vector>
-#include <algorithm>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -68,6 +69,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 368a78bc9594da81e84831136fbfcfe93a4f295f..020aab0d2280851fc934491c09b90de5d652a78e 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -33,6 +33,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -72,6 +80,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -93,6 +105,13 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
   TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct ActivationFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index dc2aabeb949f9c6e54b41ebc8c2eb1324f1e4694..5f042a4e14a40c1428555486e3fbeb252cb22040 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -45,6 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -71,6 +79,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -88,6 +100,13 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
      << "_" << output_shape[2] << "_" << output_shape[3];
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct AddNFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 09be320fc680cb0e34306762a4672cd774f1d5ea..234e3d348efbda27e47f5093a553e8c97521fd7f 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -36,7 +36,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
-
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -44,6 +43,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -76,6 +83,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -100,6 +111,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
   TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct BatchNormFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 684474be64632d47bd0b4ae488c22bf687b5146b..a630b6a5c6c03a935c4317d0ea260c5d07d194a1 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -36,6 +36,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -46,6 +54,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -77,6 +89,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
   MACE_CHECK_CL_SUCCESS(error);
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index f652696f64b70444e41a5e50c793e7b093fe91c3..eae15eadad9530290c59db243149297fa2f3c7b8 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -13,6 +13,7 @@ template <typename T>
 void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
   std::vector<size_t> image_shape;
+
   if (!i2b_) {
     CalImage2DShape(buffer->shape(), type, &image_shape);
     if (type == WINOGRAD_FILTER) {
@@ -80,10 +81,25 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    built_options.emplace("-DOUT_OF_RANGE_CHECK");
+    if (!kernel_error_) {
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
+  }
+
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name, built_options);
 
   uint32_t idx = 0;
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    b2f_kernel.setArg(idx++,
+        *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+  }
   if (!runtime->IsNonUniformWorkgroupsSupported()) {
     b2f_kernel.setArg(idx++, gws[0]);
     b2f_kernel.setArg(idx++, gws[1]);
@@ -135,6 +151,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
   MACE_CHECK_CL_SUCCESS(error);
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 59f2c9518061a7e1e6f0e8071a18cec699814be4..925ce595a3e4f50c7b046cef2edcd79505538e0d 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -43,6 +43,14 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -55,6 +63,10 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -76,6 +88,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
      << output->dim(2) << "_"
      << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 2978f4022e67ffa13b3e318bcd75490ae66a8d1b..25e79b3db2e36f72ec961b1a6f3206b416ce81dd 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void activation(KERNEL_ERROR_PARAMS
+                         GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input,
 #ifdef USE_PRELU
                          __read_only image2d_t alpha,
@@ -29,6 +30,6 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
 #else
   DATA_TYPE4 out = do_activation(in, relux_max_limit);
 #endif
+
   WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
-
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index 30f52247f22e95718239e60e956bf856eba65f39..1e8616f3825e0eebff7e09dae414af05e1d5fb32 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void addn(KERNEL_ERROR_PARAMS
+                   GLOBAL_WORK_GROUP_SIZE_DIM2
                    __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                    __read_only image2d_t input1,
 #if INPUT_NUM > 2
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 290b6c1a12216d0771bcfb65b6c81660e4e59833..064d8ecc4b88310a1817e701823a2b75c8db1036 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void batch_norm(KERNEL_ERROR_PARAMS
+                         GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index 64de2d77ff8371c6acafcea1bb4afdc00b105a8d..2cd5fb5caa9dc3c39c82bb6ff2c0f1c16dc87a66 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void bias_add(KERNEL_ERROR_PARAMS
+                       GLOBAL_WORK_GROUP_SIZE_DIM3
                        __read_only image2d_t input,
                        __read_only image2d_t bias,
                        __write_only image2d_t output) {
@@ -22,5 +23,6 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
   DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
   DATA_TYPE4 out = in + bias_value;
+
   WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index e300bc51e2e429bd2c232068cba976269a2081d6..38af3d5b6158995d67daf76db6bb3782b43032c6 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
+                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
@@ -52,7 +53,8 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
+                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
@@ -100,7 +102,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
   }
 }
 
-__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
+                                        GLOBAL_WORK_GROUP_SIZE_DIM2
                                         __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
@@ -157,7 +160,8 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
+                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
@@ -198,7 +202,8 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
+                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
@@ -237,7 +242,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
   }
 }
 
-__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
+                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                   __global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
@@ -272,7 +278,8 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
+                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                   __global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
                                   __read_only image2d_t input) {
@@ -305,7 +312,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
 }
 
 
-__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
+                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
@@ -347,7 +355,8 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
+                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
@@ -385,7 +394,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
 }
 
 
-__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
+                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
@@ -427,7 +437,8 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
+                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                               __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
@@ -495,30 +506,46 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
   tu3[1] = tt + tu3[1] / 2;
 
   int2 coord = (int2)(w, h);
-#pragma unroll
-  for (short i = 0; i < 4; ++i) {
-    WRITE_IMAGET(output, coord, tu0[i]);
-    coord.y += out_channels;
-  }
-#pragma unroll
-  for (short i = 0; i < 4; ++i) {
-    WRITE_IMAGET(output, coord, tu1[i]);
-    coord.y += out_channels;
-  }
-#pragma unroll
-  for (short i = 0; i < 4; ++i) {
-    WRITE_IMAGET(output, coord, tu2[i]);
-    coord.y += out_channels;
-  }
-#pragma unroll
-  for (short i = 0; i < 4; ++i) {
-    WRITE_IMAGET(output, coord, tu3[i]);
-    coord.y += out_channels;
-  }
+
+  WRITE_IMAGET(output, coord, tu0[0]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu0[1]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu0[2]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu0[3]);
+  coord.y += out_channels;
+
+  WRITE_IMAGET(output, coord, tu1[0]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu1[1]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu1[2]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu1[3]);
+  coord.y += out_channels;
+
+  WRITE_IMAGET(output, coord, tu2[0]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu2[1]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu2[2]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu2[3]);
+  coord.y += out_channels;
+
+  WRITE_IMAGET(output, coord, tu3[0]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu3[1]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu3[2]);
+  coord.y += out_channels;
+  WRITE_IMAGET(output, coord, tu3[3]);
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
+                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                               __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 92ff94473d7964925f67bfc3517194dc1111a4ee..c404130eb0fd04d9f900b986405c1bd616bcbc8b 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -1,7 +1,8 @@
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
+                              GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __private const int groups,
                               __private const int channels_per_group,
@@ -49,6 +50,7 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
     out_chan_data3 = (DATA_TYPE4)(in_chan_data0.w, in_chan_data1.w, in_chan_data2.w, in_chan_data3.w);
 
     int out_x = mad24(mad24(group_chan_blk_idx, groups, g_blk), width, width_idx);
+
     WRITE_IMAGET(output, (int2)(out_x, hb_idx), out_chan_data0);
     out_x += groups_blks_width;
 
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index b68bca075491fba94732c9ab998751284a232a55..0011dc2c01297fe2f2ab2c539e837ac7ea133516 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -14,8 +14,19 @@
 #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
 
 #define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
-#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
-#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
+
+#ifdef OUT_OF_RANGE_CHECK
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
+#else
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
+#endif
+
+#define READ_IMAGET(image, coord, value) \
+  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, coord, value)
+#define WRITE_IMAGET(image, coord, value)        \
+  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)   \
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
 
 #ifndef NON_UNIFORM_WORK_GROUP
 
@@ -34,6 +45,18 @@
 
 #endif
 
+
+#ifdef OUT_OF_RANGE_CHECK
+
+#define KERNEL_ERROR_PARAMS \
+  __global char *kernel_error,
+
+#else
+
+#define KERNEL_ERROR_PARAMS
+
+#endif
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
@@ -61,4 +84,16 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
   return out;
 }
 
+inline void check_out_of_range_for_image2d(__write_only image2d_t image,
+                                           __private const int x,
+                                           __private const int y,
+                                           global char *kernel_error) {
+#ifdef OUT_OF_RANGE_CHECK
+  int2 image_dim = get_image_dim(image);
+  if (x >= image_dim.x || y >= image_dim.y) {
+    *kernel_error = 1;
+  }
+#endif
+}
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index 0e171e0f4dac7bbd41856e1a43518aacbb6ffa12..e6e78e8651286118f8c7f7772eebbfe6c138cfa2 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }
 
 // Supported data type: half/float
-__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void concat_channel(KERNEL_ERROR_PARAMS
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
@@ -79,11 +80,14 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
   }
 #endif
 
-  WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
+  const int pos = mad24(chan_blk_idx, width, width_idx);
+
+  WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
 }
 
 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
+                                   GLOBAL_WORK_GROUP_SIZE_DIM3
                                    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
                                    __write_only image2d_t output) {
@@ -106,7 +110,9 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
                      SAMPLER,
                      (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
 
-  WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data);
+  const int pos = mad24(chan_blk_idx + chan_blk_offset, width, width_idx);
+
+  WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
 }
 
 //__kernel void concat_width(__read_only image2d_t input0,
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index f88885b06c7b6a4ac9efb91a950e6e94b4e54076..5289e513301d41d2684be090597636f263b2b601 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void conv_2d(KERNEL_ERROR_PARAMS
+                      GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
@@ -126,6 +127,7 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
 #endif
 
   const int out_x_base = mul24(out_ch_blk, out_width);
+
   int w = out_w_blk;
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
 
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index a5454a6700eeb78573994833f9fa1bc3bb0029ff..65d2b9c68e8680ac62763df59daee3915e8b3f25 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
+                          GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 19a636bb109d6b9fb04577548c0c72e0824f0e88..f7c1149f8fd2bbeddc0ba9efccd9f35cd019196c 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
+                          GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
@@ -162,5 +163,4 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
   WRITE_IMAGET(output,
                (int2)(out_x_base + w, out_hb),
                out4);
-
 }
diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl
index 92cdaf7ea20ef7e77467a52b494b3c72506269c1..e93dfc7cbde706d4d3cb4ead20a6efdb3ea0c5ea 100644
--- a/mace/kernels/opencl/cl/cwise.cl
+++ b/mace/kernels/opencl/cl/cwise.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void cwise(KERNEL_ERROR_PARAMS
+                    GLOBAL_WORK_GROUP_SIZE_DIM2
                     __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                     __private const float value,
                     __write_only image2d_t output) {
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl
index 8d989290b6496bd8e1f4797f711e6002fbc189d8..3fd66f5a35fcec5b1563ecc283552a9e398fc754 100644
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -1,71 +1,81 @@
 #include <common.h>
 
-__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void depth_to_space(KERNEL_ERROR_PARAMS
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
-                             __private const int input_height,
+                             __private const int input_hb,
                              __private const int input_width,
                              __private const int input_depth_blocks,
-                             __private const int output_height,
                              __private const int output_width,
                              __private const int output_depth_blocks,
                              __write_only image2d_t output) {
   const int out_d = get_global_id(0);
   const int out_w = get_global_id(1);
-  const int out_h = get_global_id(2);
+  const int out_hb = get_global_id(2);
 
-  if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width)
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
+      || out_hb >= global_size_dim2) {
     return;
+  }
+#endif
 
   const int out_pos = mad24(out_d, output_width, out_w);
 
-  const int in_h = out_h / block_size;
-  const int offset_h = out_h % block_size;
+  const int in_hb = out_hb / block_size;
+  const int offset_h = out_hb % block_size;
   const int in_w = out_w / block_size;
   const int offset_w = out_w % block_size;
   const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks;
   const int in_d = out_d + offset_d;
 
-  if (in_h >= input_height || in_w >= input_width || in_d >= input_depth_blocks)
+  if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {
     return;
+  }
 
   const int in_pos = mad24(in_d, input_width, in_w);
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_h));
-  WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
+  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb));
+
+  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
 }
 
-__kernel void space_to_depth(
+__kernel void space_to_depth(KERNEL_ERROR_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
-                             __private const int input_height,
                              __private const int input_width,
                              __private const int input_depth_blocks,
-                             __private const int output_height,
+                             __private const int output_hb,
                              __private const int output_width,
                              __private const int output_depth_blocks,
                              __write_only image2d_t output) {
-
   const int d = get_global_id(0);
   const int w = get_global_id(1);
-  const int h = get_global_id(2);
+  const int hb = get_global_id(2);
 
-  if (h >= input_height || w >= input_width || d >= input_depth_blocks)
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (d >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
     return;
+  }
+#endif
 
   const int in_pos = mad24(d, input_width, w);
 
-  const int out_h = h / block_size;
-  const int offset_h = h % block_size;
+  const int out_hb = hb / block_size;
+  const int offset_h = hb % block_size;
   const int out_w = w / block_size;
   const int offset_w = w % block_size;
   const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
   const int out_d = d + offset_d;
 
-  if (out_d >= output_depth_blocks || out_h >= output_height || out_w >= output_width)
+  if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
     return;
+  }
 
   const int out_pos = mad24(out_d, output_width, out_w);
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, h));
-  WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
+  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
+
+  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
 }
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index c71ec4049ab6218c4d22ee446e371ba8c3622cab..9173f1b4d2c47c86db4bf4610ba3ab5c5085df9f 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -1,7 +1,8 @@
 #include <common.h>
 
 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
+                               GLOBAL_WORK_GROUP_SIZE_DIM3
                                __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -137,7 +138,8 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }
 
-__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
+                                  GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index 0b9647f50574c7522ce54631980cb21c01038361..58838a7d29aad87345706cb66ecea0d86d4c22a4 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void eltwise(KERNEL_ERROR_PARAMS
+                      GLOBAL_WORK_GROUP_SIZE_DIM2
                       __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t input1,
 #ifdef COEFF_SUM
diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl
index 3205e4921b4da3e44bbb5aa7cfa871ef3bad2ec1..e5de2c641e10fc51415004b7a46d075fdb62ddb3 100644
--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -1,7 +1,8 @@
 #include <common.h>
 
 // output = weight * input + bias
-__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void fully_connected(KERNEL_ERROR_PARAMS
+                              GLOBAL_WORK_GROUP_SIZE_DIM2
                               __read_only image2d_t input,
                               __read_only image2d_t weight,
 #ifdef BIAS
@@ -58,11 +59,13 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
 #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
   result = do_activation(result, relux_max_limit);
 #endif
+
   WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
 }
 
 // output = weight * input + bias
-__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void fully_connected_width(KERNEL_ERROR_PARAMS
+                                    GLOBAL_WORK_GROUP_SIZE_DIM3
                                     __read_only image2d_t input,
                                     __read_only image2d_t weight,
 #ifdef BIAS
@@ -147,6 +150,7 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
 #if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
     result = do_activation(result, relux_max_limit);
 #endif
+
     WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
   }
 }
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index 82ccf6bad13a718f5777957fbebb44fd8e8b14df..0509159cc2022325fd8af40fa914c3a9423636f2 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -1,7 +1,8 @@
 #include <common.h>
 
 // C = A * B
-__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void matmul(KERNEL_ERROR_PARAMS
+                     GLOBAL_WORK_GROUP_SIZE_DIM2
                      __read_only image2d_t A,
                      __read_only image2d_t B,
                      __write_only image2d_t C,
@@ -46,11 +47,15 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
 
     c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3));
   }
+
   WRITE_IMAGET(C, (int2)(gx, gy), c0);
+
   if ((gx + 1) >= N) return;
   WRITE_IMAGET(C, (int2)(gx + 1, gy), c1);
+
   if ((gx + 2) >= N) return;
   WRITE_IMAGET(C, (int2)(gx + 2, gy), c2);
+
   if ((gx + 3) >= N) return;
   WRITE_IMAGET(C, (int2)(gx + 3, gy), c3);
 }
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index 25785bb2a089eb4aed28e46fc434403fc365f1bc..c76d055ff21dc0bd8dbe689297427db6c66c9ac3 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size,
 }
 
 // Supported data type: half/float
-__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void pooling(KERNEL_ERROR_PARAMS
+                      GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int in_height,
                       __private const int in_width,
@@ -94,5 +95,6 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
   }
 #endif
 
-  WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res);
+  const int pos = mad24(out_chan_idx, out_width, out_width_idx);
+  WRITE_IMAGET(output, (int2)(pos, out_hb_idx), res);
 }
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index 2b0464c70d2042908345a1b360af8c5a4d91a15c..8736bf52717da115eaf9f32afe8a908b3754cee8 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
+                                      GLOBAL_WORK_GROUP_SIZE_DIM3
                                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
                                       __private const float height_scale,
@@ -56,6 +57,7 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
 
   const int out_w_offset = mul24(ch_blk, out_width);
   const int out_h_offset = mul24(b, out_height);
+
   WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
 }
 
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index 0692c62b70f4ed0933bb94627c16413d2602d2ab..366cddc32f77e8e7d7bfcbe35abf3880d560274c 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void slice(KERNEL_ERROR_PARAMS
+                    GLOBAL_WORK_GROUP_SIZE_DIM3
                     __read_only image2d_t input,
                     __private const int chan_blk_offset,
                     __write_only image2d_t output) {
@@ -21,6 +22,7 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
   DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
                                 (int2)(mad24(chan_blk_idx + chan_blk_offset,
                                              width, width_idx), hb_idx));
-  WRITE_IMAGET(output,
-               (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
+
+  const int pos = mad24(chan_blk_idx, width, width_idx);
+  WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
 }
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index b5b99de6c0ef92064174a14e16014ddb22093eb9..710433a219146af66e63447fbbab7470a9ea9e2f 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void softmax(KERNEL_ERROR_PARAMS
+                      GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 431a599705b7a522ff4e366c5c73a173b38d9673..eb05319896a5e034fcc9df1f189a95215c2ea77d 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void space_to_batch(KERNEL_ERROR_PARAMS
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
                              __private const int block_height,
@@ -44,10 +45,12 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
   DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord);
 
   int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
+
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
 
-__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
+__kernel void batch_to_space(KERNEL_ERROR_PARAMS
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t batch_data,
                              __write_only image2d_t space_data,
                              __private const int block_height,
@@ -87,6 +90,7 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
 
     int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
                               space_b_idx * space_height + space_h_idx);
+
     WRITE_IMAGET(space_data, space_coord, value);
   }
 }
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index 0cab37d750510f1f7bedb02ceddfb49577e4ee31..4201fd63b7797723a83e88e009dd7491e654cdb1 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -1,6 +1,7 @@
 #include <common.h>
 
-__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
+                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
                                      __private const int in_height,
@@ -115,7 +116,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
   }
 }
 
-__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
+__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
+                                             GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
                                              __read_only image2d_t bias, /* cout%4 * cout/4 */
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index bccc8623997c7b356190ecc4818fc3394eb45d89..a2898edb26851fb8e98dea3b6d589c0f544b6c43 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
                     StatsFuture *future,
-                    uint32_t *kwg_size) {
+                    uint32_t *kwg_size,
+                    std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel,
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
     built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel,
   }
   if (!IsVecEqual(*prev_input_shape, input0->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel,
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 static void ConcatN(cl::Kernel *kernel,
@@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel,
                     const DataType dt,
                     Tensor *output,
                     StatsFuture *future,
-                    uint32_t *kwg_size) {
+                    uint32_t *kwg_size,
+                    std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel,
     built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel,
     };
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel,
     ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
        << batch * height;
     TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
+
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      (*kernel_error)->Map(nullptr);
+      char *kerror_code = (*kernel_error)->mutable_data<char>();
+      MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+      (*kernel_error)->UnMap();
+    }
   }
 }
 
@@ -172,12 +212,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
   switch (inputs_count) {
     case 2:
       Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future, &kwg_size_);
+              &input_shape_, output, future, &kwg_size_, &kernel_error_);
       break;
     default:
       if (divisible_four) {
         ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
-            &kwg_size_);
+            &kwg_size_, &kernel_error_);
       } else {
         MACE_NOT_IMPLEMENTED;
       }
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 5a002666320dfbfbea8263d1693bae53231da952..684de9eff434832dcc946d676667453c8cd8d0f9 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             uint32_t *kwg_size);
+                             uint32_t *kwg_size,
+                             std::unique_ptr<BufferBase> *kernel_error);
 
 extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              const Tensor *input,
@@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             uint32_t *kwg_size);
+                             uint32_t *kwg_size,
+                             std::unique_ptr<BufferBase> *kernel_error);
 
 extern void Conv2dOpencl(cl::Kernel *kernel,
                          const Tensor *input,
@@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future,
-                         uint32_t *kwg_size);
+                         uint32_t *kwg_size,
+                         std::unique_ptr<BufferBase> *kernel_error);
 
 template <typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -65,7 +68,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       const int *dilations, const ActivationType activation,
       const float relux_max_limit, const DataType dt,
       std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
-      uint32_t *kwg_size);
+      uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5] = {
       Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -106,12 +109,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
                 DataTypeToEnum<T>::value, &input_shape_, output, future,
-                &kwg_size_);
+                &kwg_size_, &kernel_error_);
   } else {
     Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
                  DataTypeToEnum<T>::value, &input_shape_, output, future,
-                 &kwg_size_);
+                 &kwg_size_, &kernel_error_);
   }
 }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 5cdf8e5608386ea7547f20d239e5126d69d50efc..81391151e122cd8b991cf0fa4fa81dd1ff7a8315 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             uint32_t *kwg_size) {
+                             uint32_t *kwg_size,
+                             std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index d0f587452579a7dfe6dbd64d29fa02ab9bf73297..3d02e5b004d91f799f5621d9f929706348f58a47 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             uint32_t *kwg_size) {
+                             uint32_t *kwg_size,
+                             std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index b9b2fec56a13dc55ee97649098f0674425ddcd4e..38d0e49fca89fb9417c90267e3bfb906e8cbcdbb 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future,
-                         uint32_t *kwg_size) {
+                         uint32_t *kwg_size,
+                         std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc
index dce3d14d69f7a60f4a9cc928b7b2e12a1cbc8c73..07b2cfb2495c81b33d32e5697507101c49c95b10 100644
--- a/mace/kernels/opencl/cwise_opencl.cc
+++ b/mace/kernels/opencl/cwise_opencl.cc
@@ -34,6 +34,14 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DCWISE_TYPE=", type_));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -44,6 +52,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -59,6 +71,13 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct CWiseFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc
index 1c0624365c96eb19f08a22f9055d75834a4d6b72..d2ea128afcf69eee459cb9d2a92f0bee06e85163 100644
--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
   const char *kernel_name = nullptr;
 
   index_t output_height, output_width, output_depth;
-  if (d2s_) {
-    output_height = input_height * block_size_;
+  if (d2s_) { output_height = input_height * block_size_;
     output_width = input_width * block_size_;
     output_depth = input_depth / (block_size_ * block_size_);
     kernel_name = "depth_to_space";
@@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     }
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
     }
     kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
+    if (d2s_) {
+      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+      kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
+      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
+    } else {
+      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+      kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
+      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
+    }
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
@@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 5e0b99ba1e5a00e25eec931f209c0a28c8ed85b2..585e7424062293089e396393cfe17fd6cb160df3 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                      std::vector<index_t> *prev_input_shape,
                      Tensor *output,
                      StatsFuture *future,
-                     uint32_t *kwg_size) {
+                     uint32_t *kwg_size,
+                     std::unique_ptr<BufferBase> *kernel_error) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     } else {
       built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
     }
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                input_channels);
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
@@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel,
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                   batch, height, width, channels, multiplier);
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 template <typename T>
@@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                   dilations_, activation_, relux_max_limit_,
                   DataTypeToEnum<T>::value, &input_shape_, output, future,
-                  &kwg_size_);
+                  &kwg_size_, &kernel_error_);
 }
 
 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index c23534bbce5ca423314d23bd470a5cbc2289ae1e..04c418e40b4517a564c68464c00b6129deeca4bb 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -37,6 +37,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -48,6 +56,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
   }
   if (!IsVecEqual(input_shape_, input0->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -68,6 +80,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
   ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct EltwiseFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 3178b8ae1125e1dcc6b5efd5daebe4fc73df2168..7f40aea3be137c407aea9dab95f6f6f29a2c2dc1 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel,
                 std::vector<uint32_t> *gws,
                 std::vector<uint32_t> *lws,
                 const float relux_max_limit,
-                StatsFuture *future) {
+                StatsFuture *future,
+                std::unique_ptr<BufferBase> *kernel_error) {
   MACE_CHECK(input->dim(3) % 4 == 0)
     << "FC width kernel only support input with 4x channel.";
   MACE_CHECK_NOTNULL(gws);
@@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel,
 
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
-    kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
     built_options.emplace("-Dfully_connected_width=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
@@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel,
     if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_QUALCOMM_ADRENO");
     }
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel,
     (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, (*gws)[0]);
       kernel->setArg(idx++, (*gws)[1]);
@@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
   }
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
   MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
 
   if (future != nullptr) {
@@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel,
                  std::vector<uint32_t> *gws,
                  std::vector<uint32_t> *lws,
                  const float relux_max_limit,
-                 StatsFuture *future) {
+                 StatsFuture *future,
+                 std::unique_ptr<BufferBase> *kernel_error) {
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
   auto runtime = OpenCLRuntime::Global();
@@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel,
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      *kernel_error = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      (*kernel_error)->Map(nullptr);
+      *((*kernel_error)->mutable_data<char>()) = 0;
+      (*kernel_error)->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel,
     };
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel->setArg(idx++,
+          *(static_cast<cl::Buffer *>((*kernel_error)->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, (*gws)[0]);
       kernel->setArg(idx++, (*gws)[1]);
@@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel,
   ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
      << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    (*kernel_error)->Map(nullptr);
+    char *kerror_code = (*kernel_error)->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    (*kernel_error)->UnMap();
+  }
 }
 
 template <typename T>
@@ -246,10 +284,12 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
 
   if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
     FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                   activation_, &gws_, &lws_, relux_max_limit_, future);
+                   activation_, &gws_, &lws_, relux_max_limit_, future,
+                   &kernel_error_);
   } else {
     FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                  activation_, &gws_, &lws_, relux_max_limit_, future);
+                  activation_, &gws_, &lws_, relux_max_limit_, future,
+                  &kernel_error_);
   }
 }
 
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 7a4822096309297d8c11e80c1f7eb6ea5069b48f..ead70cc2288d06d8ad11470aa4640c12af2874b1 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -40,6 +40,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
     built_options.emplace("-Dmatmul=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -49,6 +57,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   uint32_t idx = 0;
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_.setArg(idx++,
+        *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+  }
   if (!runtime->IsNonUniformWorkgroupsSupported()) {
     kernel_.setArg(idx++, gws[0]);
     kernel_.setArg(idx++, gws[1]);
@@ -67,6 +79,13 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
      << C->dim(2) << "_" << C->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct MatMulFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca0a0cebd83fc320defc03d8f74b0d18f76beae0
--- /dev/null
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -0,0 +1,152 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/core/tensor.h"
+#include "mace/core/workspace.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace {
+
+const bool BufferToImageOpImpl(Tensor *buffer,
+                               Tensor *image,
+                               const std::vector<size_t> &image_shape) {
+  std::unique_ptr<BufferBase> kernel_error;
+  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
+                     static_cast<uint32_t>(image_shape[1])};
+
+  auto runtime = OpenCLRuntime::Global();
+
+  std::string kernel_name = "in_out_buffer_to_image";
+  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+  std::set<std::string> built_options;
+  std::stringstream kernel_name_ss;
+  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+  built_options.emplace(kernel_name_ss.str());
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+  }
+  if (buffer->dtype() == image->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToCLDt(DataTypeToEnum<float>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToCLCMDDt(DataTypeToEnum<float>::value));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpstreamCLDt(DataTypeToEnum<float>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpstreamCLCMDDt(DataTypeToEnum<float>::value));
+  }
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    built_options.emplace("-DOUT_OF_RANGE_CHECK");
+    kernel_error = std::move(std::unique_ptr<Buffer>(
+          new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+    kernel_error->Map(nullptr);
+    *(kernel_error->mutable_data<char>()) = 0;
+    kernel_error->UnMap();
+  }
+
+  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
+                                         obfuscated_kernel_name, built_options);
+
+  uint32_t idx = 0;
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    b2f_kernel.setArg(idx++,
+        *(static_cast<cl::Buffer *>(kernel_error->buffer())));
+  }
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
+  b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
+  MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
+             "buffer offset not aligned");
+  b2f_kernel.setArg(idx++,
+                    static_cast<uint32_t>(buffer->buffer_offset() /
+                                          GetEnumTypeSize(buffer->dtype())));
+  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
+  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
+  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+  b2f_kernel.setArg(idx++, *(image->opencl_image()));
+
+  const uint32_t kwg_size =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
+  const std::vector<uint32_t> lws = {16, kwg_size / 16};
+
+  cl::Event event;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK_CL_SUCCESS(error);
+
+  runtime->command_queue().finish();
+  bool is_out_of_range = false;
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error->Map(nullptr);
+    is_out_of_range =
+        *(kernel_error->mutable_data<char>()) == 1 ? true : false;
+    kernel_error->UnMap();
+  }
+  return is_out_of_range;
+}
+
+}  // namespace
+
+class OutOfRangeCheckTest : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    setenv("MACE_OUT_OF_RANGE_CHECK", "1", 1);
+  }
+};
+
+TEST(OutOfRangeCheckTest, RandomTest) {
+  static unsigned int seed = time(NULL);
+  for (int round = 0; round < 10; ++round) {
+    index_t batch = 11 + rand_r(&seed) % 10;
+    index_t height = 12 + rand_r(&seed) % 100;
+    index_t width = 13 + rand_r(&seed) % 100;
+    index_t channels = 14 + rand_r(&seed) % 50;
+
+    std::vector<index_t> buffer_shape = {batch, height, width, channels};
+    Workspace ws;
+    Tensor *buffer = ws.CreateTensor("Buffer",
+                                     GetDeviceAllocator(DeviceType::OPENCL),
+                                     DataTypeToEnum<float>::v());
+    buffer->Resize(buffer_shape);
+
+    std::vector<size_t> image_shape;
+    Tensor *image = ws.CreateTensor("Image",
+                                     GetDeviceAllocator(DeviceType::OPENCL),
+                                     DataTypeToEnum<float>::v());
+    CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
+    image->ResizeImage(buffer->shape(), image_shape);
+    ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
+
+    std::vector<size_t> overflow_image_shape = image_shape;
+    for (int i = 0; i < overflow_image_shape.size(); ++i) {
+      overflow_image_shape[i] += 1;
+    }
+    ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 774fd5ee2ac60875491deb00f1ce4fcbef8ba97e..0afef212e5c2845d1434b8fe8ae3db76158eebf7 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -37,6 +37,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     if (pooling_type_ == AVG) {
       built_options.emplace("-DPOOL_AVG");
     }
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -82,6 +90,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     };
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -117,6 +129,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct PoolingFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index ac2733e9e511c8a32dee3371c9ed404be964cb90..779849b6b53279330f15644db33a94fe8f418e68 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -37,6 +37,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -61,6 +69,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
         CalculateResizeScale(in_width, out_width, align_corners_);
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -82,6 +94,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 850f08f6df5b091750d0b6ac203bf3e72e00099c..1945f14d135f8bebdf34d3e6d3b3ace5c75649eb 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -38,6 +38,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE="
                            + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -64,6 +72,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      << outputs_count;
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -74,6 +86,12 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
 
     TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_error_->Map(nullptr);
+      char *kerror_code = kernel_error_->mutable_data<char>();
+      MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+      kernel_error_->UnMap();
+    }
   }
 }
 
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index ea4f0b3e13d2d023a9dc98691ce47cb269f97714..b74fe15000b24bd696861778e3f0ffb60427bb15 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -36,6 +36,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -46,6 +54,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -64,6 +76,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index b4ae998a56f34cf3f23266e549f7346360ae4113..972fc1fa131ad7982924ef53c67fa1591451c660 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -47,6 +47,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -59,6 +67,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
   }
   if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -89,6 +101,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
      << batch_tensor->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index f4fd5525960706e31ae3a303d00c50e534bfeaec..aadbc7fd418d165def6c40dad6d6a9ab422f921c 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -26,6 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -62,6 +70,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, image_shape);
 
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -85,6 +97,13 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
      << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template <typename T>
@@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      built_options.emplace("-DOUT_OF_RANGE_CHECK");
+      kernel_error_ = std::move(std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+      kernel_error_->Map(nullptr);
+      *(kernel_error_->mutable_data<char>()) = 0;
+      kernel_error_->UnMap();
+    }
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
@@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
+    if (runtime->IsOutOfRangeCheckEnabled()) {
+      kernel_.setArg(idx++,
+          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+    }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
@@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
      << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
+
+  if (runtime->IsOutOfRangeCheckEnabled()) {
+    kernel_error_->Map(nullptr);
+    char *kerror_code = kernel_error_->mutable_data<char>();
+    MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
+    kernel_error_->UnMap();
+  }
 }
 
 template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index b5a5001271e78ff54c2fbf4efe3b541a08467390..f20c059d116330d38bb419c6e5e4910a47a7ce9b 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -198,6 +199,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 4e0c5ae3cb04def05794faab46cfc5dc90727e3c..cfc3105ba74eed1bdf8b9a13eeaea284e53e518b 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -5,6 +5,7 @@
 #define MACE_KERNELS_RESIZE_BILINEAR_H_
 
 #include <algorithm>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -174,6 +175,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 58c21e7a98891b3a06f7782d5bbad2bdcca12b38..36cda42f345ff9bb1a4d118cb6627608ef698309 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_SLICE_H_
 #define MACE_KERNELS_SLICE_H_
 
+#include <memory>
 #include <functional>
 #include <vector>
 
@@ -79,6 +80,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index 62cbcbd532938a3ae8854fec7a2d55123bc34e7e..48fe37f20b59961be88de7b26d878023329f606b 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <functional>
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -67,6 +68,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 4e0d4c591e9b6cfe8544c1bfc4d98936c921dee1..672ccfca558c439e9b8fa16461638c7bf937afee 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
 #define MACE_KERNELS_SPACE_TO_BATCH_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -57,6 +58,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> space_shape_;
 };
 
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index e3169541dd197764d284d35c7eabbb61a2ba38b6..b6b45ea162c2abb45fcb98353821b9070542ca62 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -52,6 +53,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
@@ -110,6 +112,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
+  std::unique_ptr<BufferBase> kernel_error_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 78f7a24a5591cc0164cedffdf370d9c9ea9fb0a5..c8c9ddb46f9ae640824a0b6d4faaea3a700c6993 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -108,7 +108,8 @@ def main(unused_args):
                                       args=FLAGS.args,
                                       opencl_profiling=1,
                                       vlog_level=0,
-                                      device_bin_path="/data/local/tmp/mace")
+                                      device_bin_path="/data/local/tmp/mace",
+                                      out_of_range_check=1)
         device_properties = sh_commands.adb_getprop_by_serialno(serialno)
         globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi)
 
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index c0742c5d32274e1f56c82a43efd2c2961e704b81..666ce7b0bff61461cf547ec416114c77e808683d 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -66,7 +66,8 @@ def adb_run(serialno, host_bin_path, bin_name,
             args="",
             opencl_profiling=1,
             vlog_level=0,
-            device_bin_path="/data/local/tmp/mace"):
+            device_bin_path="/data/local/tmp/mace",
+            out_of_range_check=1):
   host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
   device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
   props = adb_getprop_by_serialno(serialno)
@@ -81,8 +82,8 @@ def adb_run(serialno, host_bin_path, bin_name,
   stdout_buff=[]
   process_output = make_output_processor(stdout_buff)
   p = sh.adb("-s", serialno, "shell",
-             "MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
-             (opencl_profiling, vlog_level, device_bin_full_path, args),
+             "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
+             (out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args),
              _out=process_output, _bg=True, _err_to_out=True)
   p.wait()
   return "".join(stdout_buff)