From 88120708f7bf6da6d38803f910c27ec559fcdfaa Mon Sep 17 00:00:00 2001 From: liuqi Date: Sat, 28 Jul 2018 18:15:18 +0800 Subject: [PATCH] Add GPU availability check and return status to user if a GPU call fails. --- mace/core/file_storage.cc | 4 +- mace/core/runtime/opencl/opencl_allocator.cc | 15 +- mace/core/runtime/opencl/opencl_runtime.cc | 133 ++++- mace/core/runtime/opencl/opencl_runtime.h | 27 +- mace/core/runtime/opencl/opencl_wrapper.cc | 480 ++++++++++++------ mace/core/workspace.cc | 42 +- mace/examples/cli/example.cc | 5 +- mace/kernels/opencl/activation.cc | 6 +- mace/kernels/opencl/addn.cc | 6 +- mace/kernels/opencl/batch_norm.cc | 6 +- mace/kernels/opencl/bias_add.cc | 5 +- mace/kernels/opencl/buffer_to_image.cc | 8 +- mace/kernels/opencl/channel_shuffle.cc | 8 +- mace/kernels/opencl/concat.cc | 29 +- mace/kernels/opencl/conv_2d_1x1.cc | 54 +- mace/kernels/opencl/conv_2d_3x3.cc | 47 +- mace/kernels/opencl/conv_2d_general.cc | 55 +- mace/kernels/opencl/crop.cc | 24 +- mace/kernels/opencl/deconv_2d_opencl.cc | 6 +- mace/kernels/opencl/depth_to_space.cc | 9 +- mace/kernels/opencl/depthwise_conv.cc | 50 +- mace/kernels/opencl/eltwise.cc | 6 +- mace/kernels/opencl/fully_connected.cc | 14 +- mace/kernels/opencl/helper.cc | 65 ++- mace/kernels/opencl/helper.h | 22 +- mace/kernels/opencl/image_to_buffer.cc | 10 +- mace/kernels/opencl/matmul.cc | 6 +- .../kernels/opencl/out_of_range_check_test.cc | 15 +- mace/kernels/opencl/pad.cc | 6 +- mace/kernels/opencl/pooling.cc | 35 +- mace/kernels/opencl/reduce_mean_opencl.cc | 12 +- mace/kernels/opencl/resize_bilinear.cc | 47 +- mace/kernels/opencl/slice.cc | 7 +- mace/kernels/opencl/softmax.cc | 30 +- mace/kernels/opencl/space_to_batch.cc | 9 +- mace/kernels/opencl/winograd_transform.cc | 18 +- mace/libmace/mace.cc | 44 ++ mace/proto/mace.proto | 11 +- mace/public/mace_runtime.h | 127 +++-- mace/python/tools/convert_util.py | 65 +++ mace/python/tools/converter.py | 7 + .../tools/converter_tool/base_converter.py | 1 + .../tools/converter_tool/transformer.py | 32 +- mace/python/tools/memory_optimizer.py | 108 ++-- mace/python/tools/model.jinja2 | 1 + mace/test/mace_api_mt_test.cc | 2 + mace/test/mace_api_test.cc | 2 + 47 files changed, 1161 insertions(+), 560 deletions(-) diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc index 99af2c19..99731a81 100644 --- a/mace/core/file_storage.cc +++ b/mace/core/file_storage.cc @@ -37,8 +37,8 @@ int FileStorage::Load() { struct stat st; if (stat(file_path_.c_str(), &st) == -1) { if (errno == ENOENT) { - LOG(INFO) << "File " << file_path_ - << " does not exist"; + VLOG(1) << "File " << file_path_ + << " does not exist"; return 0; } else { LOG(WARNING) << "Stat file " << file_path_ diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index ad5c8eac..7dda80e6 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { void *mapped_ptr = queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset, nbytes, nullptr, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error); + mapped_ptr = nullptr; + } return mapped_ptr; } @@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, mapped_image_pitch->data(),
mapped_image_pitch->data() + 1, nullptr, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); - + if (error != CL_SUCCESS) { + LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error); + mapped_ptr = nullptr; + } return mapped_ptr; } @@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { auto queue = OpenCLRuntime::Global()->command_queue(); cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, nullptr); - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error); + } } bool OpenCLAllocator::OnHost() const { return false; } diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index c5809ffd..f901973f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath( OpenCLRuntime::OpenCLRuntime(): precompiled_binary_storage_(nullptr), cache_storage_(nullptr), - is_profiling_enabled_(false) { + is_opencl_avaliable_(false), + is_profiling_enabled_(false), + opencl_version_(CL_VER_UNKNOWN), + gpu_type_(UNKNOWN) { std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { - LOG(FATAL) << "No OpenCL platforms found"; + LOG(ERROR) << "No OpenCL platforms found"; + return; } cl::Platform default_platform = all_platforms[0]; std::stringstream ss; @@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime(): std::vector all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); if (all_devices.size() == 0) { - LOG(FATAL) << "No OpenCL devices found"; + LOG(ERROR) << "No OpenCL devices found"; + return; } bool gpu_detected = false; @@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime(): const std::string device_version = device.getInfo(); opencl_version_ = ParseDeviceVersion(device_version); + if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) { + return; + } VLOG(1) << "Using device: " << device_name; break; } } if (!gpu_detected) { - LOG(FATAL) << "No GPU device found"; + LOG(ERROR) << "No GPU device found"; + return; } cl_command_queue_properties properties = 0; @@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime(): new cl::Context({*device_}, nullptr, nullptr, nullptr, &err)); } } - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return; + } command_queue_ = std::make_shared(*context_, *device_, properties, &err); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return; + } extern std::shared_ptr kStorageFactory; std::string cached_binary_platform_info; @@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime(): } if (cached_binary_platform_info != platform_info_) { - if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) { - LOG(WARNING) << "There is no precompiled OpenCL binary in" - " all OpenCL binary paths"; - } else { + if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) { precompiled_binary_storage_.reset( new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath)); if (precompiled_binary_storage_->Load() != 0) { @@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime(): } else { this->out_of_range_check_ = false; } + + is_opencl_avaliable_ = true; } OpenCLRuntime::~OpenCLRuntime() { @@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() { device_.reset(); } +bool OpenCLRuntime::is_opencl_avaliable() { + static const uint64_t kMinWorkGroupSize = 64; 
+ return is_opencl_avaliable_ + && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize; +} + cl::Context &OpenCLRuntime::context() { return *context_; } cl::Device &OpenCLRuntime::device() { return *device_; } @@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( return true; } -void OpenCLRuntime::BuildProgramFromSource( +bool OpenCLRuntime::BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, @@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource( LOG(WARNING) << "Build program " << program_name << " from source failed: " << MakeString(ret); - return; + return false; } // Keep built program binary @@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource( cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * device_list_size, program_binary_sizes.get(), nullptr); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } std::unique_ptr[]> program_binaries( new std::unique_ptr[device_list_size]); for (cl_uint i = 0; i < device_list_size; ++i) { @@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource( err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES, sizeof(unsigned char *) * device_list_size, program_binaries.get(), nullptr); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } std::vector content( reinterpret_cast(program_binaries[0].get()), reinterpret_cast(program_binaries[0].get()) + @@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource( VLOG(3) << "Program from source: " << built_program_key; } + return true; } -void OpenCLRuntime::BuildProgram(const std::string &program_name, +bool OpenCLRuntime::BuildProgram(const std::string &program_name, const std::string &built_program_key, const std::string &build_options, cl::Program *program) { @@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ret = BuildProgramFromPrecompiledBinary(built_program_key, build_options_str, program); if (!ret) { - BuildProgramFromSource(program_name, built_program_key, - build_options_str, program); + ret = BuildProgramFromSource(program_name, built_program_key, + build_options_str, program); } } + return ret; } -cl::Kernel OpenCLRuntime::BuildKernel( +MaceStatus OpenCLRuntime::BuildKernel( const std::string &program_name, const std::string &kernel_name, - const std::set &build_options) { + const std::set &build_options, + cl::Kernel *kernel) { std::string build_options_str; for (auto &option : build_options) { build_options_str += " " + option; @@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel( if (built_program_it != built_program_map_.end()) { program = built_program_it->second; } else { - this->BuildProgram(program_name, built_program_key, build_options_str, - &program); + bool ret = this->BuildProgram(program_name, built_program_key, + build_options_str, &program); + if (!ret) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } built_program_map_.emplace(built_program_key, program); } - return cl::Kernel(program, kernel_name.c_str()); + cl_int err; + *kernel = cl::Kernel(program, kernel_name.c_str(), &err); + MACE_CL_RET_STATUS(err); + return MaceStatus::MACE_SUCCESS; } void OpenCLRuntime::SaveBuiltCLProgram() { @@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) { uint64_t 
OpenCLRuntime::GetDeviceMaxWorkGroupSize() { uint64_t size = 0; - device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); + cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() { uint64_t size = 0; - device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } +bool OpenCLRuntime::IsImageSupport() { + cl_bool res; + cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } + return res == CL_TRUE; +} +std::vector OpenCLRuntime::GetMaxImage2DSize() { + size_t max_height, max_width; + cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return {}; + } + err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return {}; + } + return {max_height, max_width}; +} + uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { uint64_t size = 0; - kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size); + cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, + &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { uint64_t size = 0; - kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size); + cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, + &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } @@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion( } else if (words[1] == "1.0") { return OpenCLVersion::CL_VER_1_0; } else { - LOG(FATAL) << "Do not support OpenCL version: " << words[1]; - return OpenCLVersion::CL_VER_1_0; + LOG(ERROR) << "Do not support OpenCL version: " << words[1]; + return OpenCLVersion::CL_VER_UNKNOWN; } } diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index f87b4580..537707fa 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -42,13 +42,23 @@ enum OpenCLVersion { CL_VER_1_1, CL_VER_1_2, CL_VER_2_0, + CL_VER_UNKNOWN, }; const std::string OpenCLErrorToString(cl_int error); -#define MACE_CHECK_CL_SUCCESS(error) \ - MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error) +#define MACE_CL_RET_ERROR(error) \ + if (error != CL_SUCCESS) { \ + LOG(ERROR) << "error: " << OpenCLErrorToString(error); \ + return error; \ + } + +#define MACE_CL_RET_STATUS(error) \ + if (error != CL_SUCCESS) { \ + LOG(ERROR) << "error: " << OpenCLErrorToString(error); \ + return MaceStatus::MACE_OUT_OF_RESOURCES; \ + } class OpenCLProfilingTimer : public Timer { public: @@ -81,19 +91,23 @@ class OpenCLRuntime { const std::string platform_info() const; uint64_t device_global_mem_cache_size() const; uint32_t device_compute_units() const; + bool is_opencl_avaliable(); void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t 
GetDeviceMaxWorkGroupSize(); uint64_t GetDeviceMaxMemAllocSize(); + bool IsImageSupport(); + std::vector GetMaxImage2DSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); bool IsNonUniformWorkgroupsSupported() const; bool IsOutOfRangeCheckEnabled() const; bool is_profiling_enabled() const; - cl::Kernel BuildKernel(const std::string &program_name, + MaceStatus BuildKernel(const std::string &program_name, const std::string &kernel_name, - const std::set &build_options); + const std::set &build_options, + cl::Kernel *kernel); void SaveBuiltCLProgram(); @@ -103,7 +117,7 @@ class OpenCLRuntime { OpenCLRuntime(const OpenCLRuntime &) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; - void BuildProgram(const std::string &program_file_name, + bool BuildProgram(const std::string &program_file_name, const std::string &binary_file_name, const std::string &build_options, cl::Program *program); @@ -115,7 +129,7 @@ class OpenCLRuntime { const std::string &built_program_key, const std::string &build_options_str, cl::Program *program); - void BuildProgramFromSource( + bool BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, @@ -125,6 +139,7 @@ class OpenCLRuntime { private: std::unique_ptr precompiled_binary_storage_; std::unique_ptr cache_storage_; + bool is_opencl_avaliable_; bool is_profiling_enabled_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc index 26a9fb7e..a7fd447e 100644 --- a/mace/core/runtime/opencl/opencl_wrapper.cc +++ b/mace/core/runtime/opencl/opencl_wrapper.cc @@ -257,6 +257,10 @@ OpenCLLibrary *OpenCLLibrary::Get() { OpenCLLibrary::OpenCLLibrary() { this->Load(); + // Do not call dlclose, which may unload all OpenCL symbols. + // If we close the OpenCL library, the static OpenCLRuntime destructor may fail. + // Without dlclose, the library will be closed when the program exits. + // Besides, the library will not be loaded repeatedly even if dlopen is called many times. } bool OpenCLLibrary::Load() { @@ -293,13 +297,12 @@ bool OpenCLLibrary::Load() { } if (handle_ == nullptr) { - LOG(FATAL) << "Failed to load OpenCL library, " + LOG(ERROR) << "Failed to load OpenCL library, " "please make sure there exists OpenCL library on your device, " "and your APP have right to access the library."; return false; } - // Do not dlclose, leave it to system.
return true; } @@ -384,9 +387,12 @@ CL_API_ENTRY cl_int clGetPlatformIDs(cl_uint num_entries, cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetPlatformIDs; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetPlatformIDs"); - return func(num_entries, platforms, num_platforms); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetPlatformIDs"); + return func(num_entries, platforms, num_platforms); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetPlatformInfo(cl_platform_id platform, @@ -396,10 +402,13 @@ CL_API_ENTRY cl_int clGetPlatformInfo(cl_platform_id platform, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetPlatformInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetPlatformInfo"); - return func(platform, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetPlatformInfo"); + return func(platform, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Device APIs @@ -410,9 +419,12 @@ CL_API_ENTRY cl_int clGetDeviceIDs(cl_platform_id platform, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetDeviceIDs; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetDeviceIDs"); - return func(platform, device_type, num_entries, devices, num_devices); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetDeviceIDs"); + return func(platform, device_type, num_entries, devices, num_devices); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetDeviceInfo(cl_device_id device, @@ -422,26 +434,35 @@ CL_API_ENTRY cl_int clGetDeviceInfo(cl_device_id device, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetDeviceInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetDeviceInfo"); - return func(device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetDeviceInfo"); + return func(device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainDevice; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainDevice"); - return func(device); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainDevice"); + return func(device); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseDevice; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseDevice"); - return func(device); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseDevice"); + return func(device); + } else { + return CL_INVALID_PLATFORM; + } } // Context APIs @@ -453,10 +474,14 @@ CL_API_ENTRY cl_context clCreateContext( void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateContext"); - return func(properties, num_devices, devices, pfn_notify, user_data, - errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, 
"clCreateContext"); + return func(properties, num_devices, devices, pfn_notify, user_data, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_context clCreateContextFromType( @@ -466,25 +491,35 @@ CL_API_ENTRY cl_context clCreateContextFromType( void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateContextFromType; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateContextFromType"); - return func(properties, device_type, pfn_notify, user_data, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateContextFromType"); + return func(properties, device_type, pfn_notify, user_data, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainContext"); - return func(context); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainContext"); + return func(context); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseContext"); - return func(context); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseContext"); + return func(context); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetContextInfo(cl_context context, @@ -494,10 +529,13 @@ CL_API_ENTRY cl_int clGetContextInfo(cl_context context, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetContextInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetContextInfo"); - return func(context, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetContextInfo"); + return func(context, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Program Object APIs @@ -508,9 +546,13 @@ CL_API_ENTRY cl_program clCreateProgramWithSource(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateProgramWithSource; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateProgramWithSource"); - return func(context, count, strings, lengths, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateProgramWithSource"); + return func(context, count, strings, lengths, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_program @@ -522,10 +564,14 @@ clCreateProgramWithBinary(cl_context context, cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateProgramWithBinary; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateProgramWithBinary"); - return func(context, num_devices, device_list, lengths, binaries, - binary_status, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateProgramWithBinary"); + return func(context, num_devices, device_list, lengths, binaries, + binary_status, errcode_ret); + } else { + if 
(errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clGetProgramInfo(cl_program program, @@ -535,10 +581,13 @@ CL_API_ENTRY cl_int clGetProgramInfo(cl_program program, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetProgramInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetProgramInfo"); - return func(program, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetProgramInfo"); + return func(program, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetProgramBuildInfo(cl_program program, @@ -549,26 +598,35 @@ CL_API_ENTRY cl_int clGetProgramBuildInfo(cl_program program, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetProgramBuildInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetProgramBuildInfo"); - return func(program, device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetProgramBuildInfo"); + return func(program, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainProgram"); - return func(program); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainProgram"); + return func(program); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseProgram"); - return func(program); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseProgram"); + return func(program); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clBuildProgram( @@ -579,10 +637,13 @@ CL_API_ENTRY cl_int clBuildProgram( void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clBuildProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clBuildProgram"); - return func(program, num_devices, device_list, options, pfn_notify, - user_data); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clBuildProgram"); + return func(program, num_devices, device_list, options, pfn_notify, + user_data); + } else { + return CL_INVALID_PLATFORM; + } } // Kernel Object APIs @@ -591,25 +652,35 @@ CL_API_ENTRY cl_kernel clCreateKernel(cl_program program, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateKernel"); - return func(program, kernel_name, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateKernel"); + return func(program, kernel_name, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainKernel; - MACE_CHECK_NOTNULL(func); - 
MACE_LATENCY_LOGGER(3, "clRetainKernel"); - return func(kernel); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainKernel"); + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseKernel"); - return func(kernel); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseKernel"); + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, @@ -618,9 +689,12 @@ CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, const void *arg_value) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clSetKernelArg; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clSetKernelArg"); - return func(kernel, arg_index, arg_size, arg_value); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clSetKernelArg"); + return func(kernel, arg_index, arg_size, arg_value); + } else { + return CL_INVALID_PLATFORM; + } } // Memory Object APIs @@ -631,9 +705,13 @@ CL_API_ENTRY cl_mem clCreateBuffer(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateBuffer"); - return func(context, flags, size, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateBuffer"); + return func(context, flags, size, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_mem clCreateImage(cl_context context, @@ -644,25 +722,40 @@ CL_API_ENTRY cl_mem clCreateImage(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateImage; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateImage"); - return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateImage"); + return func(context, + flags, + image_format, + image_desc, + host_ptr, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainMemObject"); - return func(memobj); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainMemObject"); + return func(memobj); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseMemObject"); - return func(memobj); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseMemObject"); + return func(memobj); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, @@ -672,10 +765,13 @@ CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetImageInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetImageInfo"); - return func(image, param_name, param_value_size, param_value, - param_value_size_ret); + if 
(func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetImageInfo"); + return func(image, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Command Queue APIs @@ -686,25 +782,35 @@ CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties( cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateCommandQueueWithProperties; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateCommandQueueWithProperties"); - return func(context, device, properties, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateCommandQueueWithProperties"); + return func(context, device, properties, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainCommandQueue"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainCommandQueue"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseCommandQueue"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseCommandQueue"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } // Enqueued Commands APIs @@ -719,10 +825,13 @@ CL_API_ENTRY cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueReadBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueReadBuffer"); - return func(command_queue, buffer, blocking_read, offset, size, ptr, - num_events_in_wait_list, event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueReadBuffer"); + return func(command_queue, buffer, blocking_read, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, @@ -736,10 +845,13 @@ CL_API_ENTRY cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueWriteBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueWriteBuffer"); - return func(command_queue, buffer, blocking_write, offset, size, ptr, - num_events_in_wait_list, event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueWriteBuffer"); + return func(command_queue, buffer, blocking_write, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY void *clEnqueueMapBuffer(cl_command_queue command_queue, @@ -754,10 +866,14 @@ CL_API_ENTRY void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueMapBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueMapBuffer"); - return func(command_queue, buffer, blocking_map, map_flags, offset, 
size, - num_events_in_wait_list, event_wait_list, event, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueMapBuffer"); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, + num_events_in_wait_list, event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY void *clEnqueueMapImage(cl_command_queue command_queue, @@ -774,11 +890,15 @@ CL_API_ENTRY void *clEnqueueMapImage(cl_command_queue command_queue, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueMapImage; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueMapImage"); - return func(command_queue, image, blocking_map, map_flags, origin, region, - image_row_pitch, image_slice_pitch, num_events_in_wait_list, - event_wait_list, event, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueMapImage"); + return func(command_queue, image, blocking_map, map_flags, origin, region, + image_row_pitch, image_slice_pitch, num_events_in_wait_list, + event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, @@ -789,10 +909,13 @@ CL_API_ENTRY cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueUnmapMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueUnmapMemObject"); - return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, - event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueUnmapMemObject"); + return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetKernelWorkGroupInfo( @@ -803,10 +926,13 @@ CL_API_ENTRY cl_int clGetKernelWorkGroupInfo( void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetKernelWorkGroupInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetKernelWorkGroupInfo"); - return func(kernel, device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetKernelWorkGroupInfo"); + return func(kernel, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, @@ -820,34 +946,46 @@ CL_API_ENTRY cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueNDRangeKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueNDRangeKernel"); - return func(command_queue, kernel, work_dim, global_work_offset, - global_work_size, local_work_size, num_events_in_wait_list, - event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueNDRangeKernel"); + return func(command_queue, kernel, work_dim, global_work_offset, + global_work_size, local_work_size, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } // Event Object APIs CL_API_ENTRY cl_int clWaitForEvents( cl_uint num_events, const cl_event 
*event_list) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clWaitForEvents; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clWaitForEvents"); - return func(num_events, event_list); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clWaitForEvents"); + return func(num_events, event_list); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainEvent; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainEvent"); - return func(event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainEvent"); + return func(event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseEvent; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseEvent"); - return func(event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseEvent"); + return func(event); + } else { + return CL_INVALID_PLATFORM; + } } // Event API @@ -858,10 +996,13 @@ CL_API_ENTRY cl_int clGetEventInfo(cl_event event, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetEventInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetEventInfo"); - return func(event, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetEventInfo"); + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Profiling APIs @@ -872,27 +1013,36 @@ CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetEventProfilingInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetEventProfilingInfo"); - return func(event, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetEventProfilingInfo"); + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Flush and Finish APIs CL_API_ENTRY cl_int clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clFlush; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clFlush"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clFlush"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clFinish; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clFinish"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clFinish"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } // Deprecated OpenCL 1.1 APIs @@ -906,10 +1056,14 @@ CL_API_ENTRY /* CL_EXT_PREFIX__VERSION_1_1_DEPRECATED */ cl_mem clCreateImage2D( void *host_ptr, cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED */ { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateImage2D; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateImage2D"); - return func(context, flags, image_format, image_width, image_height, - 
image_row_pitch, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateImage2D"); + return func(context, flags, image_format, image_width, image_height, + image_row_pitch, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } // Deprecated OpenCL 2.0 APIs @@ -920,7 +1074,11 @@ clCreateCommandQueue(cl_context context, cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED */ { // NOLINT auto func = mace::runtime::OpenCLLibrary::Get()->clCreateCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateCommandQueue"); - return func(context, device, properties, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateCommandQueue"); + return func(context, device, properties, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index cb6be4ec..1e8826cd 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -204,26 +204,28 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, // TODO(liyin): memory block should not have concept of type, but to be // consistent with gpu, all memory block use float/half as unit for (auto &mem_block : net_def.mem_arena().mem_block()) { - if (device_type == DeviceType::GPU) { - // TODO(liuqi): refactor based on PB - if (mem_block.mem_id() >= 20000) { - std::unique_ptr image_buf( - new Image()); - MACE_RETURN_IF_ERROR(image_buf->Allocate( - {mem_block.x(), mem_block.y()}, dtype)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(image_buf)); - } - } else { - if (mem_block.mem_id() < 20000) { - std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(device_type))); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) - + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } + if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetDeviceAllocator(DeviceType::CPU))); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype) + + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { + std::unique_ptr image_buf( + new Image()); + MACE_RETURN_IF_ERROR(image_buf->Allocate( + {mem_block.x(), mem_block.y()}, dtype)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(image_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetDeviceAllocator(DeviceType::GPU))); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype))); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); } } VLOG(3) << "Preallocate buffer to tensors"; diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 2270f295..ec93674e 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -219,7 +219,10 @@ bool RunModel(const std::vector &input_names, #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { - std::cerr << "Create engine error, please check the arguments" << std::endl; + std::cerr << "Create engine error, please check the arguments first, " + << "if correct, the device may not run the model, " + << "please fall back to 
other strategy." + << std::endl; exit(1); } diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index edb67873..2e343aa9 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -79,7 +79,8 @@ MaceStatus ActivationFunctor::operator()( default: LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -115,7 +116,8 @@ MaceStatus ActivationFunctor::operator()( std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, + lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index d7270a6e..e47f5103 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -68,7 +68,8 @@ MaceStatus AddNFunctor::operator()( built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -111,7 +112,8 @@ MaceStatus AddNFunctor::operator()( std::string tuning_key = Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index 2769e08b..d2dce6d3 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor::operator()( LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor::operator()( std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index d1e58bd3..b7023dd5 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); 
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index b29f7e81..df104d66 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor::operator()( } } - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel)); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor::operator()( b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 63e47267..8babf338 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = - runtime->BuildKernel("channel_shuffle", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("channel_shuffle", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor::operator()( std::string tuning_key = Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 65540e48..98ac4342 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -24,12 +24,18 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::max(std::min(base, kwg_size / lws_size), 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(base, kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = + std::max(std::min(base, kwg_size / lws_size), 1); + } return lws; } @@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, 
if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); } - *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, std::string tuning_key = Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); @@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } @@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); char *kerror_code = (*kernel_error)->mutable_data(); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 48ea04d3..e5eb2134 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -27,30 +27,36 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t lws_limit = 128; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - const uint32_t base = - std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); - } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) { - lws[0] = std::min(gws[0], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / 8; - if (lws[0] < base) { - lws[0] = std::max(gws[0] / 4, base); + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] < base) { + lws[0] = std::max(gws[0] / 4, base); + } } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size / compute_units) * 8, + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min( - (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]); - if (lws[2] == 0) { - 
lws[2] = std::min(gws[2], base); - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 0fa295d0..9984fe10 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -26,25 +26,30 @@ namespace { const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = std::max( - OpenCLRuntime::Global()->device_compute_units() / 2, 1); - const uint32_t base = - std::max( - std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = - std::min(std::min(gws[0], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min( - RoundUp( - cache_size / kernel_cache_size / lws_size / compute_units, base), - gws[2]); - if (lws[2] == 0) { - lws[2] = std::min(gws[2], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = std::max( + OpenCLRuntime::Global()->device_compute_units() / 2, 1); + const uint32_t base = + std::max( + std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = + std::min(std::min(gws[0], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + RoundUp( + cache_size / kernel_cache_size / lws_size / compute_units, base), + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/conv_2d_general.cc 
b/mace/kernels/opencl/conv_2d_general.cc index c3208eca..a6e29694 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -30,30 +30,35 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kernel_size, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - const uint32_t base = - std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = gws[0] / 4; - if (lws[0] == 0) { - lws[0] = gws[0]; - } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min((cache_size / kernel_cache_size / kernel_size / - lws_size / compute_units) * - 8, - gws[2]); - if (lws[2] == 0) { - if (gws[2] < lws_limit) { - lws[2] = gws[2]; - } else { - lws[2] = base; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min((cache_size / kernel_cache_size / kernel_size / + lws_size / compute_units) * + 8, + gws[2]); + if (lws[2] == 0) { + if (gws[2] < lws_limit) { + lws[2] = gws[2]; + } else { + lws[2] = base; + } } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, output->dim(2), output->dim(3), filter->dim(2), filter->dim(3)); std::vector lws = LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index 297b8dd3..ba9248d1 100644 --- a/mace/kernels/opencl/crop.cc +++ b/mace/kernels/opencl/crop.cc @@ -24,12 +24,18 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::max(std::min(base, kwg_size / lws_size), 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(base, kwg_size / 
lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = + std::max(std::min(base, kwg_size / lws_size), 1); + } return lws; } @@ -147,7 +153,8 @@ MaceStatus CropFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("crop", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -181,7 +188,8 @@ MaceStatus CropFunctor::operator()( std::string tuning_key = Concat("crop_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index e40ac3b3..ac7af70d 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, std::string tuning_key = Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index bf339f40..cd379b22 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( } const std::vector lws = Default3DLocalWS(gws, kwg_size_); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index ca44be2f..517ff16d 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -26,27 +26,33 @@ namespace { const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], 
base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); - if (lws[0] < base) { - lws[0] = std::min(std::max(gws[0] / 4, base), - kwg_size / lws[1]); + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); + if (lws[0] < base) { + lws[0] = std::min(std::max(gws[0] / 4, base), + kwg_size / lws[1]); + } } + lws[0] = + std::max(std::min(lws[0], kwg_size / lws[1]), 1); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min((cache_size / kernel_cache_size / lws_size) * 4, + gws[2]); + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::max(std::min(lws[0], kwg_size / lws[1]), 1); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min((cache_size / kernel_cache_size / lws_size) * 4, - gws[2]); - if (lws[2] == 0) { - lws[2] = gws[2]; - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = - runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("depthwise_conv2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, const std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index c0a74c42..503d5d5d 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, std::string tuning_key = Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index 7888287a..cbd046b4 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - *kernel = - runtime->BuildKernel("fully_connected", kernel_name, built_options); + 
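The mechanical change repeated across these kernels is that BuildKernel now reports failure through a status value (filling the kernel via an output parameter) and every call site wraps it in MACE_RETURN_IF_ERROR, so a GPU failure propagates to the caller instead of aborting the process. Below is a minimal sketch of that early-return idiom with stand-in names; the real macro and status enum live in MACE's public headers and may differ in detail.

#include <iostream>

enum class Status { kSuccess = 0, kOutOfResources = 1 };

// Early-return helper in the spirit of MACE_RETURN_IF_ERROR (stand-in name).
#define RETURN_IF_ERROR_SKETCH(stmt)          \
  do {                                        \
    const Status s_ = (stmt);                 \
    if (s_ != Status::kSuccess) return s_;    \
  } while (0)

// Hypothetical stand-in for runtime->BuildKernel(..., &kernel).
Status BuildKernelStub(bool driver_ok) {
  return driver_ok ? Status::kSuccess : Status::kOutOfResources;
}

Status InitFullyConnectedKernel(bool driver_ok) {
  RETURN_IF_ERROR_SKETCH(BuildKernelStub(driver_ok));  // propagate build failure
  // ... set kernel arguments, query max work-group size, enqueue ...
  return Status::kSuccess;
}

int main() {
  std::cout << static_cast<int>(InitFullyConnectedKernel(false)) << "\n";  // prints 1
  return 0;
}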
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, + built_options, kernel)); if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); @@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; (*kernel_error)->UnMap(); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { @@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, default: LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = - runtime->BuildKernel("fully_connected", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("fully_connected", kernel_name, + built_options, kernel)); uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::string tuning_key = Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, + gws->data(), *lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 009bf276..6d882352 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -245,23 +245,27 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { std::vector Default3DLocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = - OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[2] = - std::min(std::min(gws[2], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = std::max(std::min(base, kwg_size / lws_size), - 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[2] = + std::min(std::min(gws[2], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = std::max(std::min(base, kwg_size / lws_size), + 1); + } return lws; } -void TuningOrRun3DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future) { +MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); auto params_generator = [&]() -> std::vector> { @@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, std::vector internal_gws(gws, gws + 3); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 3; ++i) { + MACE_CHECK(params[i] != 0); internal_gws[i] = RoundUp(gws[i], params[i]); } } @@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); } } else { timer->ClearTiming(); @@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const 
cl::Kernel &kernel, kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); } } @@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, return error; }; OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( + cl_int err = Tuner::Get()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); + MACE_CL_RET_STATUS(err); if (future != nullptr) { future->wait_fn = [event](CallStats *stats) { @@ -388,13 +394,14 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, } }; } + return MaceStatus::MACE_SUCCESS; } -void TuningOrRun2DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future) { +MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); auto params_generator = [&]() -> std::vector> { @@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, std::vector internal_gws(gws, gws + 2); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 2; ++i) { + MACE_CHECK(params[i] != 0); internal_gws[i] = RoundUp(gws[i], params[i]); } } @@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); } } else { timer->ClearTiming(); error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); } } @@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, return error; }; OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( + cl_int err = Tuner::Get()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); + MACE_CL_RET_STATUS(err); + if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); @@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } }; } + return MaceStatus::MACE_SUCCESS; } } // namespace kernels diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index d1e26002..5db95e3d 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -65,17 +65,17 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt); -void TuningOrRun3DKernel(const cl::Kernel &kernel, 
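In helper.cc the tuning wrappers change from void to status-returning functions, and the error handling is split into two levels: inside the enqueue closures the raw cl_int is handed back to the tuner (MACE_CL_RET_ERROR), and only the outer call translates the final code into a status for the operator (MACE_CL_RET_STATUS). The sketch below shows that two-level translation with the OpenCL types mocked as plain ints; it is an illustration of the control flow, not the real helper.

#include <functional>
#include <iostream>

using cl_err = int;               // stand-in for cl_int
constexpr cl_err kCLSuccess = 0;  // stand-in for CL_SUCCESS

enum class Status { kSuccess, kOutOfResources };

// Inner level: return the raw OpenCL error code to the tuning loop.
cl_err RunCandidate(const std::function<cl_err()> &enqueue) {
  const cl_err error = enqueue();
  if (error != kCLSuccess) return error;  // analogue of MACE_CL_RET_ERROR
  return kCLSuccess;
}

// Outer level: convert the final error code into a status for the caller.
Status TuneOrRunSketch(const std::function<cl_err()> &enqueue) {
  const cl_err err = RunCandidate(enqueue);
  if (err != kCLSuccess) {                // analogue of MACE_CL_RET_STATUS
    std::cerr << "OpenCL error code " << err << "\n";
    return Status::kOutOfResources;
  }
  return Status::kSuccess;
}

int main() {
  return TuneOrRunSketch([] { return kCLSuccess; }) == Status::kSuccess ? 0 : 1;
}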
- const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future); - -void TuningOrRun2DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future); +MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future); + +MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future); inline void SetFuture(StatsFuture *future, const cl::Event &event) { if (future != nullptr) { diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 18f92b67..c8635e1a 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor::operator()( kernel_error_->UnMap(); } } - - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &b2f_kernel)); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor::operator()( b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 444dcd29..f9825eb6 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -74,7 +74,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -99,7 +100,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 71907c24..2497a4df 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer, kernel_error->UnMap(); } - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + + cl_int error = runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, &b2f_kernel); + if (error != CL_SUCCESS) { + return false; + } uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer, const std::vector lws = {16, kwg_size / 16}; cl::Event event; - cl_int error; if 
(runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), @@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer, b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + return false; + } runtime->command_queue().finish(); bool is_out_of_range = false; diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 963f25e7..1a8879e8 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -68,7 +68,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("pad", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -104,7 +105,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 09bc8d90..405c05a1 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -25,18 +25,23 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[2] = - std::min(std::min(gws[2], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = gws[0] / 4; - if (lws[0] == 0) { - lws[0] = gws[0]; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[2] = + std::min(std::min(gws[2], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), + 1); } - lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), - 1); return lws; } @@ -80,7 +85,10 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -160,7 +168,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, std::string tuning_key = Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws.data(), lws, future)); if 
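Alongside the status plumbing, the out-of-range check keeps its existing shape: the kernel writes a one-byte kernel_error flag when it detects an access past an image boundary, and the host maps that buffer after the run and requires it to be zero. The idea in isolation, with the mapped buffer mocked as a plain byte vector:

#include <iostream>
#include <vector>

// Non-zero in the first byte means the GPU kernel reported an out-of-range write.
bool KernelWroteOutOfRange(const std::vector<char> &mapped_error_buffer) {
  return !mapped_error_buffer.empty() && mapped_error_buffer[0] != 0;
}

int main() {
  std::vector<char> ok_flag = {0};
  std::vector<char> bad_flag = {1};
  std::cout << KernelWroteOutOfRange(ok_flag) << "\n";   // 0
  std::cout << KernelWroteOutOfRange(bad_flag) << "\n";  // 1
  return 0;
}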
(runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean_opencl.cc index cd27ad1c..266a1111 100644 --- a/mace/kernels/opencl/reduce_mean_opencl.cc +++ b/mace/kernels/opencl/reduce_mean_opencl.cc @@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor::operator()( *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); } - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { @@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor::operator()( cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; kernel_error_->UnMap(); } - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index ee823116..c5b8b65b 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -25,25 +25,30 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / 8; - if (lws[0] == 0) { - lws[0] = gws[0]; + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = gws[2] / 8; - if (lws[2] == 0) { - lws[2] = gws[2]; - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = - runtime->BuildKernel("resize_bilinear", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bilinear", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor::operator()( std::string tuning_key = 
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 160ad003..726bcae9 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -61,7 +61,10 @@ MaceStatus SliceFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -107,7 +110,7 @@ MaceStatus SliceFunctor::operator()( cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 38d0b8bb..07855488 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -25,19 +25,23 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); std::vector lws(4, 0); - lws[1] = std::min(gws[1], kwg_size); - if (gws[0] < base) { - lws[0] = gws[0]; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / base; + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (gws[0] < base) { + lws[0] = gws[0]; + } else { + lws[0] = gws[0] / base; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + lws[2] = std::max(std::min( + gws[2], kwg_size / (lws[0] * lws[1])), 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - lws[2] = std::max(std::min(gws[2], - kwg_size / (lws[0] * lws[1])), - 1); return lws; } @@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, std::vector lws = LocalWS(gws, kwg_size_); std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index b558ba7d..3606d91d 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -77,8 +77,10 @@ MaceStatus 
SpaceToBatchFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor::operator()( std::string tuning_key = Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 790ab181..6cc8e08c 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor::operator()( output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); @@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor::operator()( LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 03731078..470d082f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data, MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", strerror(errno)); } + +#ifdef MACE_ENABLE_OPENCL +MaceStatus CheckGPUAvalibility(const NetDef *net_def) { + // Check OpenCL avaliable + auto runtime = OpenCLRuntime::Global(); + if (!runtime->is_opencl_avaliable()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + // Check whether model max OpenCL image sizes exceed OpenCL limitation. 
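The check that follows this comment compares the model's recorded maximum OpenCL image shape (the new opencl_max_image_size argument written by the converter) against the 2D image limits the runtime reports for the device. A standalone sketch of that comparison is shown below; the device limits here are hypothetical values, while the real ones come from the OpenCL device query.

#include <cstdint>
#include <iostream>
#include <vector>

// Returns true if the model's largest requested 2D image fits on the device.
bool ImageSizeFits(const std::vector<int64_t> &net_max_image_size,       // {width, height} from the model
                   const std::vector<uint64_t> &device_max_image_size) {  // {width, height} from the device
  if (net_max_image_size.size() < 2 || device_max_image_size.size() < 2) {
    return false;
  }
  return static_cast<uint64_t>(net_max_image_size[0]) <= device_max_image_size[0] &&
         static_cast<uint64_t>(net_max_image_size[1]) <= device_max_image_size[1];
}

int main() {
  const std::vector<uint64_t> device_limit = {16384, 16384};  // hypothetical limits
  std::cout << ImageSizeFits({8192, 4096}, device_limit) << "\n";   // 1
  std::cout << ImageSizeFits({20000, 4096}, device_limit) << "\n";  // 0
  return 0;
}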
+ if (net_def == nullptr) { + return MaceStatus::MACE_INVALID_ARGS; + } + + if (!runtime->IsImageSupport()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + auto opencl_max_image_size = runtime->GetMaxImage2DSize(); + if (opencl_max_image_size.empty()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + const std::vector net_max_image_size = + ProtoArgHelper::GetRepeatedArgs( + *net_def, "opencl_max_image_size", {0, 0}); + + if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] + || static_cast(net_max_image_size[1]) + > opencl_max_image_size[1]) { + LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) + << " vs " << MakeString(net_max_image_size); + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + return MaceStatus::MACE_SUCCESS; +} +#endif + } // namespace // Mace Tensor @@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init( const std::vector &output_nodes, const unsigned char *model_data) { LOG(INFO) << "Initializing MaceEngine"; + // Check avalibility +#ifdef MACE_ENABLE_OPENCL + if (device_type_ == DeviceType::GPU) { + MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); + } +#endif // Get input and output information. for (auto &input_info : net_def->input_info()) { input_info_map_[input_info.name()] = input_info; diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 08ad648e..08d862ea 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -20,6 +20,12 @@ enum DataType { DT_INT32 = 4; } +enum MemoryType { + CPU_BUFFER = 0; + GPU_BUFFER = 1; + GPU_IMAGE = 2; +} + message ConstTensor { repeated int64 dims = 1; optional DataType data_type = 2 [default = DT_FLOAT]; @@ -73,8 +79,9 @@ message OperatorDef { // for memory optimization message MemoryBlock { optional int32 mem_id = 1; - optional uint32 x = 2; - optional uint32 y = 3; + optional MemoryType mem_type = 2; + optional uint32 x = 3; + optional uint32 y = 4; } message MemoryArena { repeated MemoryBlock mem_block = 1; diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 6fac6ec5..13ee9a52 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory std::unique_ptr impl_; }; -// Set Key-Value store factory. (Call Once) -// Now KVStorage is used to store the built OpenCL binaries to file, -// which could speed up the GPU initialization and first run. -// If do not call this API, the initialization maybe slow for GPU. +/// \brief Set internal storage factory to store internal data. (Call once) +/// +/// Now the path is used to store the built OpenCL binaries to file, +/// which could speed up the GPU initialization and first run. +/// If do not call this API, the initialization maybe slow for GPU. +/// +/// \param path Make sure your program have Read/Write permission of the path +/// \return __attribute__((visibility("default"))) void SetKVStorageFactory(std::shared_ptr storage_factory); -// Just call once. (Not thread-safe) -// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) -// if you use gpu of specific soc. -// Using OpenCL binary will speed up the initialization. -// OpenCL binary is corresponding to the OpenCL Driver version, -// you should update the binary when OpenCL Driver changed. +/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length) +/// +/// Just call once. 
(Not thread-safe) +/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length) +/// OpenCL binary is corresponding to the OpenCL Driver version, +/// you should update the binary when OpenCL Driver changed. +/// +/// \param paths MACE will use first file found in all paths +/// \return __attribute__((visibility("default"))) void SetOpenCLBinaryPaths(const std::vector &paths); -// Just call once. (Not thread-safe) -// Set the path of Generated OpenCL parameter file -// if you use gpu for specific soc. -// The parameters is the local work group size tuned for specific SOC, which -// may be faster than the general parameters. +/// \brief Set the path of Generated OpenCL parameter file +/// +/// Just call once. (Not thread-safe) +/// If you use gpu for specific soc, The parameters is the local work group +/// size tuned for specific SOC, which may be faster than the +/// general parameters. +/// +/// \param path Make sure your program have Read/Write permission of the path +/// \return __attribute__((visibility("default"))) void SetOpenCLParameterPath(const std::string &path); -// Set GPU hints, currently only supports Adreno GPU. -// -// Caution: this function may hurt performance if improper parameters provided. +/// \brief Set GPU hints, currently only supports Adreno GPU. +/// +/// Caution: this function may hurt performance +/// if improper parameters provided. +/// +/// \param perf_hint performance hint +/// \param priority_hint priority hint +/// \return __attribute__((visibility("default"))) void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); -// Set OpenMP threads number and affinity policy. -// -// Caution: this function may hurt performance if improper parameters provided. -// -// num_threads_hint is only a hint. When num_threads_hint is zero or negative, -// the function will set the threads number equaling to the number of -// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all -// (AFFINITY_NONE) cores according to the policy. The threads number will -// also be truncated to the corresponding cores number when num_threads_hint -// is larger than it. -// -// The OpenMP threads will be bind to (via sched_setaffinity) big cores -// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY). -// -// If successful, it returns MACE_SUCCESS and error if it can't reliabley -// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's -// suggested to use AFFINITY_NONE to use all cores. +/// \brief Set OpenMP threads number and affinity policy. +/// +/// Caution: this function may hurt performance if improper parameters provided. +/// When num_threads_hint is zero or negative, +/// the function will set the threads number equaling to the number of +/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all +/// (AFFINITY_NONE) cores according to the policy. The threads number will +/// also be truncated to the corresponding cores number when num_threads_hint +/// is larger than it. +/// The OpenMP threads will be bind to (via sched_setaffinity) big cores +/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY). +/// +/// \param num_threads_hint it is only a hint. +/// \param policy one of CPUAffinityPolicy +/// \param status MACE_SUCCESS for successful, or it can't reliabley +/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's +/// suggested to use AFFINITY_NONE to use all cores. 
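The documented behaviour of SetOpenMPThreadPolicy is that a non-positive num_threads_hint selects all cores of the chosen class and an oversized hint is truncated to that core count. The sketch below resolves the hint exactly as documented, with hypothetical core counts; it is not MACE's actual implementation.

#include <algorithm>
#include <iostream>

enum CPUAffinityPolicy { AFFINITY_NONE, AFFINITY_BIG_ONLY, AFFINITY_LITTLE_ONLY };

// Mirrors the documented semantics: non-positive hint means "all cores of the
// selected class", and a too-large hint is truncated to that core count.
int EffectiveThreadCount(int num_threads_hint, CPUAffinityPolicy policy,
                         int big_cores, int little_cores) {
  int available = 0;
  switch (policy) {
    case AFFINITY_BIG_ONLY:    available = big_cores; break;
    case AFFINITY_LITTLE_ONLY: available = little_cores; break;
    case AFFINITY_NONE:        available = big_cores + little_cores; break;
  }
  if (num_threads_hint <= 0) return available;
  return std::min(num_threads_hint, available);
}

int main() {
  // Hypothetical 4 big + 4 little configuration.
  std::cout << EffectiveThreadCount(0, AFFINITY_BIG_ONLY, 4, 4) << "\n";  // 4
  std::cout << EffectiveThreadCount(16, AFFINITY_NONE, 4, 4) << "\n";     // 8
  return 0;
}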
+/// \return __attribute__((visibility("default"))) MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy); -// Set OpenMP threads number and processor affinity. -// -// Caution: this function may hurt performance if improper parameters provided. -// -// This function may not work well on some chips (e.g. MTK). Setting thread -// affinity to offline cores may run very slow or unexpectedly. In such cases, -// please use SetOpenMPThreadPolicy with default policy instead. +/// \brief Set OpenMP threads number and processor affinity. +/// +/// Caution: this function may hurt performance +/// if improper parameters provided. +/// This function may not work well on some chips (e.g. MTK). Setting thread +/// affinity to offline cores may run very slow or unexpectedly. +/// In such cases, please use SetOpenMPThreadPolicy with default policy +/// instead. +/// +/// \param num_threads +/// \param cpu_ids +/// \param status +/// \return __attribute__((visibility("default"))) MaceStatus SetOpenMPThreadAffinity(int num_threads, const std::vector &cpu_ids); -// Get ARM big.LITTLE configuration. -// -// This function will detect the max frequencies of all CPU cores, and assume -// the cores with largest max frequencies as big cores, and all the remaining -// cores as little. If all cpu core's max frequencies equals, big_core_ids and -// little_core_ids will both be filled with all cpu core ids. -// -// If successful, it returns MACE_SUCCESS and error if it can't reliabley -// detect the frequency of big-LITTLE cores (e.g. MTK). +/// \brief Get ARM big.LITTLE configuration. +/// +/// This function will detect the max frequencies of all CPU cores, and assume +/// the cores with largest max frequencies as big cores, and all the remaining +/// cores as little. If all cpu core's max frequencies equals, big_core_ids and +/// little_core_ids will both be filled with all cpu core ids. +/// +/// \param [out] big_core_ids +/// \param [out] little_core_ids +/// \return If successful, it returns MACE_SUCCESS and error if it can't +/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK). __attribute__((visibility("default"))) MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, std::vector *little_core_ids); diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py index 18791987..31590913 100644 --- a/mace/python/tools/convert_util.py +++ b/mace/python/tools/convert_util.py @@ -12,7 +12,72 @@ # See the License for the specific language governing permissions and # limitations under the License. 
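GetBigLittleCoreIDs is documented to classify cores by maximum frequency: the fastest cluster counts as big, the rest as little, and when every core reports the same frequency both lists receive all core ids. The sketch below mirrors those rules over a plain frequency list; it illustrates the documented behaviour, not the library's detection code.

#include <algorithm>
#include <iostream>
#include <vector>

// Classify core ids into big/little clusters by their maximum frequency.
void ClassifyBigLittle(const std::vector<int> &max_freq_khz,
                       std::vector<int> *big_ids, std::vector<int> *little_ids) {
  big_ids->clear();
  little_ids->clear();
  if (max_freq_khz.empty()) return;
  const int top = *std::max_element(max_freq_khz.begin(), max_freq_khz.end());
  const int bottom = *std::min_element(max_freq_khz.begin(), max_freq_khz.end());
  for (int i = 0; i < static_cast<int>(max_freq_khz.size()); ++i) {
    if (top == bottom) {
      // Homogeneous CPU: both lists get every core id, as documented.
      big_ids->push_back(i);
      little_ids->push_back(i);
    } else if (max_freq_khz[i] == top) {
      big_ids->push_back(i);
    } else {
      little_ids->push_back(i);
    }
  }
}

int main() {
  std::vector<int> big, little;
  ClassifyBigLittle({1800000, 1800000, 2400000, 2400000}, &big, &little);
  std::cout << big.size() << " big, " << little.size() << " little\n";  // 2 big, 2 little
  return 0;
}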
+import enum + def mace_check(condition, msg): if not condition: raise Exception(msg) + + +def roundup_div4(value): + return int((value + 3) / 4) + + +class OpenCLBufferType(enum.Enum): + CONV2D_FILTER = 0 + IN_OUT_CHANNEL = 1 + ARGUMENT = 2 + IN_OUT_HEIGHT = 3 + IN_OUT_WIDTH = 4 + WINOGRAD_FILTER = 5 + DW_CONV2D_FILTER = 6 + WEIGHT_HEIGHT = 7 + WEIGHT_WIDTH = 8 + + +def calculate_image_shape(buffer_type, shape, winograd_blk_size=0): + # keep the same with mace/kernel/opencl/helper.cc + image_shape = [0, 0] + if buffer_type == OpenCLBufferType.CONV2D_FILTER: + mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D") + image_shape[0] = shape[1] + image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0]) + elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL: + mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D") + image_shape[0] = roundup_div4(shape[3]) * shape[2] + image_shape[1] = shape[0] * shape[1] + elif buffer_type == OpenCLBufferType.ARGUMENT: + mace_check(len(shape) == 1, + "Argument buffer should be 1D not " + str(shape)) + image_shape[0] = roundup_div4(shape[0]) + image_shape[1] = 1 + elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT: + mace_check(len(shape) == 4, "Input/output buffer should be 4D") + image_shape[0] = shape[2] * shape[3] + image_shape[1] = shape[0] * roundup_div4(shape[1]) + elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH: + mace_check(len(shape) == 4, "Input/output buffer should be 4D") + image_shape[0] = roundup_div4(shape[2]) * shape[3] + image_shape[1] = shape[0] * shape[1] + elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER: + mace_check(len(shape) == 4, "Winograd filter buffer should be 4D") + image_shape[0] = roundup_div4(shape[1]) + image_shape[1] = (shape[0] * (winograd_blk_size + 2) + * (winograd_blk_size + 2)) + elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER: + mace_check(len(shape) == 4, "Winograd filter buffer should be 4D") + image_shape[0] = shape[0] * shape[2] * shape[3] + image_shape[1] = roundup_div4(shape[1]) + elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT: + mace_check(len(shape) == 4, "Weight buffer should be 4D") + image_shape[0] = shape[1] * shape[2] * shape[3] + image_shape[1] = roundup_div4(shape[0]) + elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH: + mace_check(len(shape) == 4, "Weight buffer should be 4D") + image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3] + image_shape[1] = shape[0] + else: + mace_check(False, "OpenCL Image do not support type " + + str(buffer_type)) + return image_shape diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 2a069c1e..3fd856c7 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -171,6 +171,13 @@ def main(unused_args): output_graph_def.op.extend(cpu_graph_def.op) output_graph_def.mem_arena.mem_block.extend( cpu_graph_def.mem_arena.mem_block) + output_graph_arg_names = set() + for arg in output_graph_def.arg: + output_graph_arg_names.add(arg.name) + + for arg in cpu_graph_def.arg: + if arg.name not in output_graph_arg_names: + output_graph_def.arg.extend(arg) print "Merge done" else: option.device = device_type_map[FLAGS.runtime] diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 46b6247e..267fafeb 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -163,6 +163,7 @@ class MaceKeyword(object): mace_op_data_type_str = 'T' mace_offset_str = 
'offset' mace_from_caffe_str = 'from_caffe' + mace_opencl_max_image_size = "opencl_max_image_size" class TransformerRule(Enum): diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 850519f5..da8ecff1 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import TransformerRule +from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import mace_check - -OPENCL_IMAGE_MAX_SIZE = 16384 +from mace.python.tools.convert_util import OpenCLBufferType -class OpenCLBufferType(enum.Enum): - CONV2D_FILTER = 0 - IN_OUT_CHANNEL = 1 - ARGUMENT = 2 - IN_OUT_HEIGHT = 3 - IN_OUT_WIDTH = 4 - WINOGRAD_FILTER = 5 - DW_CONV2D_FILTER = 6 - WEIGHT_HEIGHT = 7 - WEIGHT_WIDTH = 8 +OPENCL_IMAGE_MAX_SIZE = 16384 class Transformer(base_converter.ConverterInterface): @@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface): self._producer = {} self._target_data_format = DataFormat.NHWC self._input_output_added = False + self._opencl_max_image_size = [0, 0] if self._option.device == DeviceType.CPU.value: self._target_data_format = DataFormat.NCHW @@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface): arg.name = MaceKeyword.mace_mode arg.i = 0 + tensor_shape = list(self._consts[input_name].dims) if input_type == OpenCLBufferType.WINOGRAD_FILTER: blk_sqr = op.output_shape[0].dims[0] wino_blk = int(np.sqrt(blk_sqr)) - 2 wino_arg = op_def.arg.add() wino_arg.name = MaceKeyword.mace_wino_block_size wino_arg.i = wino_blk + img_shape = calculate_image_shape(input_type, tensor_shape, + wino_blk) + else: + img_shape = calculate_image_shape(input_type, tensor_shape) op.input[input_idx] = output_name + # update OpenCL max image size + self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0], + img_shape[0]) + self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], + img_shape[1]) + def transform_buffer_image(self): if self._option.device != DeviceType.GPU.value: return False @@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface): MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + # Add OpenCL max image size + arg = net.arg.add() + arg.name = MaceKeyword.mace_opencl_max_image_size + arg.ints.extend(self._opencl_max_image_size) + for input_node in self._option.input_nodes.values(): new_input_name = MaceKeyword.mace_input_node_name \ + '_' + input_node.name diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index 5b1d3d34..0c18a66b 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -16,6 +16,24 @@ import sys import operator from mace.proto import mace_pb2 +from mace.python.tools.converter_tool import base_converter as cvt +from mace.python.tools.convert_util import calculate_image_shape +from mace.python.tools.convert_util import OpenCLBufferType + + +class MemoryBlock(object): + def __init__(self, mem_type, block): + self._mem_type = mem_type + self._block = block + + @property + def mem_type(self): + return self._mem_type + + @property + def 
block(self): + return self._block + class MemoryOptimizer(object): def __init__(self, net_def): @@ -24,7 +42,6 @@ class MemoryOptimizer(object): self.op_mem = {} # op_name->mem_id self.mem_block = {} # mem_id->[size] or mem_id->[x, y] self.total_mem_count = 0 - self.total_cpu_mem_count = 0 self.input_ref_counter = {} self.mem_ref_counter = {} @@ -52,23 +69,27 @@ class MemoryOptimizer(object): return True def get_op_mem_block(self, op_type, output_shape): - return [reduce(operator.mul, output_shape, 1)] + return MemoryBlock(mace_pb2.CPU_BUFFER, + [reduce(operator.mul, output_shape, 1)]) def mem_size(self, memory_block): - return memory_block[0] + return memory_block.block[0] def sub_mem_block(self, mem_block1, mem_block2): return self.mem_size(mem_block1) - self.mem_size(mem_block2) def resize_mem_block(self, old_mem_block, op_mem_block): - return [max(old_mem_block[0], op_mem_block[0])] + return MemoryBlock( + old_mem_block.mem_type, + [max(old_mem_block.block[0], op_mem_block.block[0])]) def add_net_mem_blocks(self): for mem in self.mem_block: arena = self.net_def.mem_arena block = arena.mem_block.add() block.mem_id = mem - block.x = self.mem_block[mem][0] + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] block.y = 1 def get_total_origin_mem_size(self): @@ -82,7 +103,7 @@ class MemoryOptimizer(object): def get_total_optimized_mem_size(self): optimized_mem_size = 0 for mem in self.mem_block: - print mem, self.mem_block[mem] + print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block optimized_mem_size += self.mem_size(self.mem_block[mem]) return optimized_mem_size @@ -117,6 +138,8 @@ class MemoryOptimizer(object): best_mem_waste_size = sys.maxint for mid in self.idle_mem: old_mem_block = self.mem_block[mid] + if old_mem_block.mem_type != op_mem_block.mem_type: + continue new_mem_block = self.resize_mem_block( old_mem_block, op_mem_block) add_mem_size = self.sub_mem_block(new_mem_block, @@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer): for arg in op.arg: if arg.name == 'mode' and arg.i == 0: return False - elif op.type == 'Shape': - for i in range(len(op.output)): - mem_id = self.total_cpu_mem_count - self.total_cpu_mem_count += 1 - op_mem_block = self.get_op_mem_block( - op.type, - op.output_shape[i].dims) - self.mem_block[mem_id] = op_mem_block - return False return op.type != 'ImageToBuffer' def get_op_mem_block(self, op_type, output_shape): - mem_block = [0, 0] if op_type == 'WinogradTransform' or op_type == 'MatMul': - mem_block[0] = output_shape[2] - mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4) + buffer_shape = list(output_shape) + [1] + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, + buffer_shape)) elif op_type == 'Shape': - mem_block[0] = output_shape[0] - mem_block[1] = 1 + mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, + [output_shape[0], 1]) else: if len(output_shape) == 2: # only support fc/softmax - mem_block[0] = int((output_shape[1] + 3) / 4) - mem_block[1] = output_shape[0] + buffer_shape = [output_shape[0], 1, 1, output_shape[1]] elif len(output_shape) == 4: - mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4) - mem_block[1] = output_shape[0] * output_shape[1] + buffer_shape = output_shape else: raise Exception('output shape dim size is not 2 or 4.') + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, + buffer_shape)) return mem_block def mem_size(self, memory_block): - 
return memory_block[0] * memory_block[1] * 4 + if memory_block.mem_type == mace_pb2.GPU_IMAGE: + return memory_block.block[0] * memory_block.block[1] * 4 + else: + return memory_block.block[0] def resize_mem_block(self, old_mem_block, op_mem_block): - resize_mem_block = [ - max(old_mem_block[0], op_mem_block[0]), - max(old_mem_block[1], op_mem_block[1]) - ] + resize_mem_block = MemoryBlock( + old_mem_block.mem_type, + [ + max(old_mem_block.block[0], op_mem_block.block[0]), + max(old_mem_block.block[1], op_mem_block.block[1]) + ]) + return resize_mem_block def add_net_mem_blocks(self): + max_image_size_x = 0 + max_image_size_y = 0 for mem in self.mem_block: arena = self.net_def.mem_arena block = arena.mem_block.add() block.mem_id = mem - block.x = self.mem_block[mem][0] - block.y = self.mem_block[mem][1] + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] + block.y = self.mem_block[mem].block[1] + if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: + max_image_size_x = max(max_image_size_x, block.x) + max_image_size_y = max(max_image_size_y, block.y) + + # Update OpenCL max image size + net_ocl_max_img_size_arg = None + for arg in self.net_def.arg: + if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: + net_ocl_max_img_size_arg = arg + max_image_size_x = max(arg.ints[0], max_image_size_x) + max_image_size_y = max(arg.ints[1], max_image_size_y) + break + if net_ocl_max_img_size_arg is None: + net_ocl_max_img_size_arg = self.net_def.arg.add() + net_ocl_max_img_size_arg.name = \ + cvt.MaceKeyword.mace_opencl_max_image_size + + net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, + max_image_size_y] def mem_id_base(self): return 20000 diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 2d2ad8ec..267911b9 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) { mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block(); mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}}); + mem_block{{i}}->set_mem_type(static_cast({{net.mem_arena.mem_block[i].mem_type}})); mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}}); mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}}); diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index b3a09a4d..2280ae35 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -244,6 +244,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < input_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_y(in_mem_block_y); res[input_names[i]] = mem_id; @@ -263,6 +264,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < output_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_y(out_mem_block_y); res[output_names[i]] = mem_id; diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 874e221e..9f929de6 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -245,6 +245,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < input_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + 
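The optimizer now tags every MemoryBlock with a MemoryType and sizes it accordingly: a GPU_IMAGE block covers x * y pixels of four channels, while a CPU_BUFFER block is a flat element count. A small C++ mirror of that sizing rule, using the enum values added to mace.proto (the struct here is a local stand-in, not the generated protobuf class):

#include <cstdint>
#include <iostream>

enum MemoryType { CPU_BUFFER = 0, GPU_BUFFER = 1, GPU_IMAGE = 2 };  // mirrors mace.proto

struct MemBlockSketch {
  MemoryType mem_type;
  uint64_t x;  // image width in pixels, or buffer element count
  uint64_t y;  // image height in pixels (1 for buffers)
};

// GPU image blocks count x * y pixels of 4 channels; CPU buffers count x elements.
uint64_t MemSize(const MemBlockSketch &b) {
  return b.mem_type == GPU_IMAGE ? b.x * b.y * 4 : b.x;
}

int main() {
  std::cout << MemSize({GPU_IMAGE, 128, 64}) << "\n";   // 32768
  std::cout << MemSize({CPU_BUFFER, 1024, 1}) << "\n";  // 1024
  return 0;
}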
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_y(in_mem_block_y); res[input_names[i]] = mem_id; @@ -264,6 +265,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < output_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_y(out_mem_block_y); res[output_names[i]] = mem_id; -- GitLab
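For reference, the IN_OUT_CHANNEL mapping that the new Python helper and the existing C++ helper are expected to agree on takes an NHWC tensor to a 2D image of (ceil(C/4) * W) x (N * H) pixels. A minimal C++ mirror of that rule follows; shape validation is reduced to the assumption that the input is 4-D.

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors roundup_div4 from convert_util.py.
inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

// IN_OUT_CHANNEL rule: {ceil(C/4) * W, N * H} for an NHWC shape.
std::vector<int64_t> InOutChannelImageShape(const std::vector<int64_t> &nhwc) {
  return {RoundUpDiv4(nhwc[3]) * nhwc[2], nhwc[0] * nhwc[1]};
}

int main() {
  const std::vector<int64_t> img = InOutChannelImageShape({1, 224, 224, 32});
  std::cout << img[0] << " x " << img[1] << "\n";  // 1792 x 224
  return 0;
}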