Commit 88120708 authored by liuqi

Add GPU availability check and return a status to the user if a GPU call fails.

Parent d9a58a5e
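With this change, GPU setup failures no longer abort the process via LOG(FATAL); the runtime records availability and kernel-level calls surface a MaceStatus. A minimal caller-side sketch (not part of this diff; it only assumes the OpenCLRuntime::Global() singleton used throughout and the is_opencl_avaliable() accessor added below):

    // Sketch: pick a device, falling back to CPU when the GPU runtime
    // failed to initialize or the device is too weak (max work-group
    // size below the kMinWorkGroupSize = 64 threshold checked below).
    DeviceType device = DeviceType::CPU;
    if (mace::OpenCLRuntime::Global()->is_opencl_avaliable()) {
      device = DeviceType::GPU;
    }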
@@ -37,7 +37,7 @@ int FileStorage::Load() {
   struct stat st;
   if (stat(file_path_.c_str(), &st) == -1) {
     if (errno == ENOENT) {
-      LOG(INFO) << "File " << file_path_
+      VLOG(1) << "File " << file_path_
               << " does not exist";
       return 0;
     } else {
...
@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              offset, nbytes, nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
       nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   auto queue = OpenCLRuntime::Global()->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+  }
 }

 bool OpenCLAllocator::OnHost() const { return false; }
...
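Note that Map() and MapImage() now report failure by returning nullptr instead of aborting through MACE_CHECK_CL_SUCCESS, so callers are expected to test the pointer. A hedged sketch (allocator, buf, and nbytes are placeholder names, not from this diff):

    // Sketch: propagate mapping failure as a status instead of crashing.
    void *ptr = allocator->Map(buf, 0, nbytes);
    if (ptr == nullptr) {
      return MaceStatus::MACE_OUT_OF_RESOURCES;
    }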
@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
 OpenCLRuntime::OpenCLRuntime():
     precompiled_binary_storage_(nullptr),
     cache_storage_(nullptr),
-    is_profiling_enabled_(false) {
+    is_opencl_avaliable_(false),
+    is_profiling_enabled_(false),
+    opencl_version_(CL_VER_UNKNOWN),
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
-    LOG(FATAL) << "No OpenCL platforms found";
+    LOG(ERROR) << "No OpenCL platforms found";
+    return;
   }
   cl::Platform default_platform = all_platforms[0];
   std::stringstream ss;
@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
   std::vector<cl::Device> all_devices;
   default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
   if (all_devices.size() == 0) {
-    LOG(FATAL) << "No OpenCL devices found";
+    LOG(ERROR) << "No OpenCL devices found";
+    return;
   }

   bool gpu_detected = false;
@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = ParseDeviceVersion(device_version);
+      if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
+        return;
+      }

       VLOG(1) << "Using device: " << device_name;
       break;
     }
   }
   if (!gpu_detected) {
-    LOG(FATAL) << "No GPU device found";
+    LOG(ERROR) << "No GPU device found";
+    return;
   }

   cl_command_queue_properties properties = 0;
@@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime():
           new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
     }
   }
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   command_queue_ = std::make_shared<cl::CommandQueue>(*context_,
                                                       *device_,
                                                       properties,
                                                       &err);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
@@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime():
   }

   if (cached_binary_platform_info != platform_info_) {
-    if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      LOG(WARNING) << "There is no precompiled OpenCL binary in"
-                      " all OpenCL binary paths";
-    } else {
+    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
       precompiled_binary_storage_.reset(
           new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
       if (precompiled_binary_storage_->Load() != 0) {
...
@@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
   } else {
     this->out_of_range_check_ = false;
   }
+
+  is_opencl_avaliable_ = true;
 }
 OpenCLRuntime::~OpenCLRuntime() {
@@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
   device_.reset();
 }

+bool OpenCLRuntime::is_opencl_avaliable() {
+  static const uint64_t kMinWorkGroupSize = 64;
+  return is_opencl_avaliable_
+      && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
+}
+
 cl::Context &OpenCLRuntime::context() { return *context_; }

 cl::Device &OpenCLRuntime::device() { return *device_; }
@@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
   return true;
 }

-void OpenCLRuntime::BuildProgramFromSource(
+bool OpenCLRuntime::BuildProgramFromSource(
     const std::string &program_name,
     const std::string &built_program_key,
     const std::string &build_options_str,
...
@@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
     LOG(WARNING) << "Build program "
                  << program_name << " from source failed: "
                  << MakeString(ret);
-    return;
+    return false;
   }
   // Keep built program binary
@@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
                                   sizeof(size_t) * device_list_size,
                                   program_binary_sizes.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
         new std::unique_ptr<unsigned char[]>[device_list_size]);
     for (cl_uint i = 0; i < device_list_size; ++i) {
...
@@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
                            sizeof(unsigned char *) * device_list_size,
                            program_binaries.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::vector<unsigned char> content(
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
...
@@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     VLOG(3) << "Program from source: " << built_program_key;
   }
+  return true;
 }
-void OpenCLRuntime::BuildProgram(const std::string &program_name,
+bool OpenCLRuntime::BuildProgram(const std::string &program_name,
                                  const std::string &built_program_key,
                                  const std::string &build_options,
                                  cl::Program *program) {
...
@@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     ret = BuildProgramFromPrecompiledBinary(built_program_key,
                                             build_options_str, program);
     if (!ret) {
-      BuildProgramFromSource(program_name, built_program_key,
-                             build_options_str, program);
+      ret = BuildProgramFromSource(program_name, built_program_key,
+                                   build_options_str, program);
     }
   }
+  return ret;
 }
-cl::Kernel OpenCLRuntime::BuildKernel(
+MaceStatus OpenCLRuntime::BuildKernel(
     const std::string &program_name,
     const std::string &kernel_name,
-    const std::set<std::string> &build_options) {
+    const std::set<std::string> &build_options,
+    cl::Kernel *kernel) {
   std::string build_options_str;
   for (auto &option : build_options) {
     build_options_str += " " + option;
...
@@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key, build_options_str,
-                       &program);
+    bool ret = this->BuildProgram(program_name, built_program_key,
+                                  build_options_str, &program);
+    if (!ret) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
     built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(program, kernel_name.c_str());
+  cl_int err;
+  *kernel = cl::Kernel(program, kernel_name.c_str(), &err);
+  MACE_CL_RET_STATUS(err);
+  return MaceStatus::MACE_SUCCESS;
 }
 void OpenCLRuntime::SaveBuiltCLProgram() {
@@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
 uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
+bool OpenCLRuntime::IsImageSupport() {
+  cl_bool res;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
+  return res == CL_TRUE;
+}
+
+std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
+  size_t max_height, max_width;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  return {max_height, max_width};
+}
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
@@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
   } else if (words[1] == "1.0") {
     return OpenCLVersion::CL_VER_1_0;
   } else {
-    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
-    return OpenCLVersion::CL_VER_1_0;
+    LOG(ERROR) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_UNKNOWN;
   }
 }
...
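The new IsImageSupport() and GetMaxImage2DSize() accessors let callers validate an image allocation up front; note GetMaxImage2DSize() returns {max_height, max_width} (height first) and an empty vector on error. A minimal sketch (width and height are hypothetical request sizes, not names from this diff):

    // Sketch: check a requested cl::Image2D size against device limits.
    auto *runtime = mace::OpenCLRuntime::Global();
    std::vector<uint64_t> max_size = runtime->GetMaxImage2DSize();
    bool fits = runtime->IsImageSupport() && !max_size.empty()
        && height <= max_size[0] && width <= max_size[1];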
@@ -42,13 +42,23 @@ enum OpenCLVersion {
   CL_VER_1_1,
   CL_VER_1_2,
   CL_VER_2_0,
+  CL_VER_UNKNOWN,
 };

 const std::string OpenCLErrorToString(cl_int error);

-#define MACE_CHECK_CL_SUCCESS(error) \
-  MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)
+#define MACE_CL_RET_ERROR(error)                           \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return error;                                          \
+  }
+
+#define MACE_CL_RET_STATUS(error)                          \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return MaceStatus::MACE_OUT_OF_RESOURCES;              \
+  }
 class OpenCLProfilingTimer : public Timer {
  public:
...
@@ -81,19 +91,23 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  bool is_opencl_avaliable();

   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
+  bool IsImageSupport();
+  std::vector<uint64_t> GetMaxImage2DSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
   bool IsOutOfRangeCheckEnabled() const;
   bool is_profiling_enabled() const;

-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
+  MaceStatus BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options,
+                         cl::Kernel *kernel);

   void SaveBuiltCLProgram();
@@ -103,7 +117,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime &) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

-  void BuildProgram(const std::string &program_file_name,
+  bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);
...
@@ -115,7 +129,7 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  void BuildProgramFromSource(
+  bool BuildProgramFromSource(
       const std::string &program_name,
       const std::string &built_program_key,
       const std::string &build_options_str,
...
@@ -125,6 +139,7 @@ class OpenCLRuntime {
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
   std::unique_ptr<KVStorage> cache_storage_;
+  bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
...
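The two macros added above split error propagation by return type: MACE_CL_RET_ERROR is for code paths that return the raw cl_int (such as the tuning lambdas in the helper changes below), while MACE_CL_RET_STATUS maps any OpenCL failure to MaceStatus::MACE_OUT_OF_RESOURCES. A minimal sketch of the two contexts (RunOnce and RunKernel are illustrative names, not from this diff):

    cl_int RunOnce(cl::CommandQueue &queue, const cl::Kernel &kernel) {
      cl_int error = queue.enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(64), cl::NDRange(8));
      MACE_CL_RET_ERROR(error);  // on failure: log and return the cl_int
      return CL_SUCCESS;
    }

    MaceStatus RunKernel(cl::CommandQueue &queue, const cl::Kernel &kernel) {
      cl_int error = queue.enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(64), cl::NDRange(8));
      MACE_CL_RET_STATUS(error);  // on failure: log, return MACE_OUT_OF_RESOURCES
      return MaceStatus::MACE_SUCCESS;
    }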
@@ -204,28 +204,30 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::GPU) {
-      // TODO(liuqi): refactor based on PB
-      if (mem_block.mem_id() >= 20000) {
-        std::unique_ptr<BufferBase> image_buf(
-            new Image());
-        MACE_RETURN_IF_ERROR(image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(image_buf));
-      }
-    } else {
-      if (mem_block.mem_id() < 20000) {
-        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type)));
-        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(tensor_buf));
-      }
-    }
+    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)
+          + MACE_EXTRA_BUFFER_PAD_SIZE));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+      std::unique_ptr<BufferBase> image_buf(
+          new Image());
+      MACE_RETURN_IF_ERROR(image_buf->Allocate(
+          {mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    }
   }

   VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
     // TODO(liuqi): refactor based on PB
...
@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
 #endif

   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments" << std::endl;
+    std::cerr << "Create engine error, please check the arguments first, "
+              << "if correct, the device may not run the model, "
+              << "please fall back to other strategy."
+              << std::endl;
     exit(1);
   }
...
@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
+                                           lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
              output_tensor->dim(2), output_tensor->dim(3));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     }
   }

-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel(
+      "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("channel_shuffle", kernel_name,
+                             built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
...
@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
     if (input0->dim(3) % 4 == 0) {
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
...
@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
     char *kerror_code = (*kernel_error)->mutable_data<char>();
...
@@ -27,7 +27,11 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...
@@ -45,12 +49,14 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
     const uint32_t lws_size = lws[0] * lws[1];
     lws[2] = std::min<uint32_t>(
-        (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
+        (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
+        gws[2]);
     if (lws[2] == 0) {
       lws[2] = std::min<uint32_t>(gws[2], base);
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = std::max<uint32_t>(
         OpenCLRuntime::Global()->device_compute_units() / 2, 1);
     const uint32_t base =
...
@@ -45,6 +49,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -30,7 +30,11 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...
@@ -54,6 +58,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
          output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
...
@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   }

   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = cache_size / kBaseGPUMemCacheSize;
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     if (lws[1] >= base) {
...
@@ -38,7 +42,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
                                    kwg_size / lws[1]);
       }
     }
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
+    lws[0] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
     const uint32_t lws_size = lws[0] * lws[1];
     lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
                                 gws[2]);
...
@@ -47,6 +52,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel =
-        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("depthwise_conv2d", kernel_name,
+                             built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
       Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));
     if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
...
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("fully_connected", kernel_name,
+                             built_options, kernel));
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
+                                           gws->data(), *lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
...@@ -245,6 +245,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -245,6 +245,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t cache_size = uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size(); OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...@@ -254,10 +257,11 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, ...@@ -254,10 +257,11 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
1); 1);
}
return lws; return lws;
} }
-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 3);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 3; ++i) {
+            MACE_CHECK(params[i] != 0);
            internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
               cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
@@ -388,9 +394,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      }
    };
  }
+  return MaceStatus::MACE_SUCCESS;
 }
-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 2);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 2; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]),
               cl::NDRange(params[0], params[1]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     }
   };
  }
+  return MaceStatus::MACE_SUCCESS;
 }

 }  // namespace kernels
...
@@ -65,13 +65,13 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpstreamCLDt(const DataType dt);

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future);

-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
...
@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       kernel_error_->UnMap();
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options,
+                                            &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
     kernel_error->UnMap();
   }

-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  cl_int error = runtime->BuildKernel("buffer_to_image",
+                                      obfuscated_kernel_name,
+                                      built_options, &b2f_kernel);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, kwg_size / 16};

   cl::Event event;
-  cl_int error;
   if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   runtime->command_queue().finish();
   bool is_out_of_range = false;
...
@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                   output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -25,7 +25,11 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[2] =
@@ -37,6 +41,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   }
   lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
                               1);
+  }
   return lws;
 }
@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::string tuning_key =
       Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws.data(), lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
     }
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -25,7 +25,11 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (lws[1] >= base) {
@@ -44,6 +48,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   }
   lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                               1);
+  }
   return lws;
 }
@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("resize_bilinear",
+                             kernel_name,
+                             built_options,
+                             &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -25,9 +25,13 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (gws[0] < base) {
     lws[0] = gws[0];
@@ -35,9 +39,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     lws[0] = gws[0] / base;
   }
   lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(gws[2],
-                                                 kwg_size / (lws[0] * lws[1])),
-                              1);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(
+      gws[2], kwg_size / (lws[0] * lws[1])), 1);
+  }
   return lws;
 }
@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
   std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
              batch_tensor->dim(2), batch_tensor->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
              output_tensor->dim(0),
              output_tensor->dim(1),
              output_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3), input_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
   MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
              strerror(errno));
 }

+#ifdef MACE_ENABLE_OPENCL
+MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+  // Check OpenCL avaliable
+  auto runtime = OpenCLRuntime::Global();
+  if (!runtime->is_opencl_avaliable()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  // Check whether model max OpenCL image sizes exceed OpenCL limitation.
+  if (net_def == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  if (!runtime->IsImageSupport()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  auto opencl_max_image_size = runtime->GetMaxImage2DSize();
+  if (opencl_max_image_size.empty()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  const std::vector<int64_t> net_max_image_size =
+      ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
+          *net_def, "opencl_max_image_size", {0, 0});
+  if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
+      || static_cast<uint64_t>(net_max_image_size[1])
+          > opencl_max_image_size[1]) {
+    LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
+              << " vs " << MakeString(net_max_image_size);
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+#endif
+
 }  // namespace

 // Mace Tensor
@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
     const std::vector<std::string> &output_nodes,
     const unsigned char *model_data) {
   LOG(INFO) << "Initializing MaceEngine";
+  // Check avalibility
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+  }
+#endif
   // Get input and output information.
   for (auto &input_info : net_def->input_info()) {
     input_info_map_[input_info.name()] = input_info;
...
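With CheckGPUAvalibility wired into MaceEngine::Impl::Init, GPU problems (no OpenCL platform, missing image support, model images larger than the device limit) now reach the caller as a MaceStatus instead of a LOG(FATAL) abort. A minimal caller-side sketch follows; the engine/Init signatures match the public headers of this revision, while the CPU fallback policy is purely illustrative and not part of this commit:

    // Sketch: try the GPU first, retry on CPU if the GPU path is unusable.
    std::unique_ptr<mace::MaceEngine> engine(
        new mace::MaceEngine(mace::DeviceType::GPU));
    mace::MaceStatus status =
        engine->Init(net_def, input_nodes, output_nodes, model_data);
    if (status != mace::MaceStatus::MACE_SUCCESS) {
      LOG(WARNING) << "GPU engine init failed, retrying on CPU";
      engine.reset(new mace::MaceEngine(mace::DeviceType::CPU));
      status = engine->Init(net_def, input_nodes, output_nodes, model_data);
    }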
@@ -20,6 +20,12 @@ enum DataType {
   DT_INT32 = 4;
 }

+enum MemoryType {
+  CPU_BUFFER = 0;
+  GPU_BUFFER = 1;
+  GPU_IMAGE = 2;
+}
+
 message ConstTensor {
   repeated int64 dims = 1;
   optional DataType data_type = 2 [default = DT_FLOAT];
@@ -73,8 +79,9 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional uint32 x = 2;
-  optional uint32 y = 3;
+  optional MemoryType mem_type = 2;
+  optional uint32 x = 3;
+  optional uint32 y = 4;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
...
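The new mem_type field lets the runtime tell image-backed arena blocks apart from flat buffers when it preallocates workspace memory. A sketch of how a consumer might dispatch on it is below; the AllocateImage/AllocateBuffer helpers are hypothetical names for illustration, not functions from this commit:

    // Sketch: dispatch arena preallocation on the new mem_type field.
    for (const mace::MemoryBlock &block : net_def.mem_arena().mem_block()) {
      if (block.mem_type() == mace::MemoryType::GPU_IMAGE) {
        // x and y are the 2D image extents computed by the converter.
        AllocateImage(block.mem_id(), block.x(), block.y());  // hypothetical
      } else {
        // CPU_BUFFER / GPU_BUFFER: x is the flat element count, y is 1.
        AllocateBuffer(block.mem_id(), block.x());            // hypothetical
      }
    }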
@@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };

-// Set Key-Value store factory. (Call Once)
-// Now KVStorage is used to store the built OpenCL binaries to file,
-// which could speed up the GPU initialization and first run.
-// If do not call this API, the initialization maybe slow for GPU.
+/// \brief Set internal storage factory to store internal data. (Call once)
+///
+/// Now the path is used to store the built OpenCL binaries to file,
+/// which could speed up the GPU initialization and first run.
+/// If do not call this API, the initialization maybe slow for GPU.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);

-// Just call once. (Not thread-safe)
-// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
-// if you use gpu of specific soc.
-// Using OpenCL binary will speed up the initialization.
-// OpenCL binary is corresponding to the OpenCL Driver version,
-// you should update the binary when OpenCL Driver changed.
+/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+///
+/// Just call once. (Not thread-safe)
+/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+/// OpenCL binary is corresponding to the OpenCL Driver version,
+/// you should update the binary when OpenCL Driver changed.
+///
+/// \param paths MACE will use first file found in all paths
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);

-// Just call once. (Not thread-safe)
-// Set the path of Generated OpenCL parameter file
-// if you use gpu for specific soc.
-// The parameters is the local work group size tuned for specific SOC, which
-// may be faster than the general parameters.
+/// \brief Set the path of Generated OpenCL parameter file
+///
+/// Just call once. (Not thread-safe)
+/// If you use gpu for specific soc, The parameters is the local work group
+/// size tuned for specific SOC, which may be faster than the
+/// general parameters.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLParameterPath(const std::string &path);

-// Set GPU hints, currently only supports Adreno GPU.
-//
-// Caution: this function may hurt performance if improper parameters provided.
+/// \brief Set GPU hints, currently only supports Adreno GPU.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+///
+/// \param perf_hint performance hint
+/// \param priority_hint priority hint
+/// \return
 __attribute__((visibility("default")))
 void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);

-// Set OpenMP threads number and affinity policy.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
-// the function will set the threads number equaling to the number of
-// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-// (AFFINITY_NONE) cores according to the policy. The threads number will
-// also be truncated to the corresponding cores number when num_threads_hint
-// is larger than it.
-//
-// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
-// suggested to use AFFINITY_NONE to use all cores.
+/// \brief Set OpenMP threads number and affinity policy.
+///
+/// Caution: this function may hurt performance if improper parameters provided.
+/// When num_threads_hint is zero or negative,
+/// the function will set the threads number equaling to the number of
+/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+/// (AFFINITY_NONE) cores according to the policy. The threads number will
+/// also be truncated to the corresponding cores number when num_threads_hint
+/// is larger than it.
+/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
+/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
+///
+/// \param num_threads_hint it is only a hint.
+/// \param policy one of CPUAffinityPolicy
+/// \param status MACE_SUCCESS for successful, or it can't reliabley
+/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+/// suggested to use AFFINITY_NONE to use all cores.
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy);

-// Set OpenMP threads number and processor affinity.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// This function may not work well on some chips (e.g. MTK). Setting thread
-// affinity to offline cores may run very slow or unexpectedly. In such cases,
-// please use SetOpenMPThreadPolicy with default policy instead.
+/// \brief Set OpenMP threads number and processor affinity.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+/// This function may not work well on some chips (e.g. MTK). Setting thread
+/// affinity to offline cores may run very slow or unexpectedly.
+/// In such cases, please use SetOpenMPThreadPolicy with default policy
+/// instead.
+///
+/// \param num_threads
+/// \param cpu_ids
+/// \param status
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
                                    const std::vector<int> &cpu_ids);

-// Get ARM big.LITTLE configuration.
-//
-// This function will detect the max frequencies of all CPU cores, and assume
-// the cores with largest max frequencies as big cores, and all the remaining
-// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-// little_core_ids will both be filled with all cpu core ids.
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect the frequency of big-LITTLE cores (e.g. MTK).
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, and assume
+/// the cores with largest max frequencies as big cores, and all the remaining
+/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
+/// little_core_ids will both be filled with all cpu core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return If successful, it returns MACE_SUCCESS and error if it can't
+///         reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
...
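Taken together, these newly documented setup APIs are meant to be called once, before the engine is created. A sketch of a typical call sequence follows; the paths are example values, and the assumption that FileStorageFactory takes a storage directory path matches its declaration in this header:

    // Sketch: one-time process setup before creating a GPU MaceEngine.
    std::shared_ptr<mace::KVStorageFactory> storage_factory(
        new mace::FileStorageFactory("/data/local/tmp/mace_storage"));  // example path
    mace::SetKVStorageFactory(storage_factory);
    mace::SetOpenCLBinaryPaths({"/data/local/tmp/mace_cl_compiled.bin"});  // example path
    mace::SetOpenCLParameterPath("/data/local/tmp/mace_run.config");       // example path
    mace::SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                      mace::GPUPriorityHint::PRIORITY_LOW);
    mace::MaceStatus status = mace::SetOpenMPThreadPolicy(
        4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);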
@@ -12,7 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import enum
+
 def mace_check(condition, msg):
     if not condition:
         raise Exception(msg)
+
+
+def roundup_div4(value):
+    return int((value + 3) / 4)
+
+
+class OpenCLBufferType(enum.Enum):
+    CONV2D_FILTER = 0
+    IN_OUT_CHANNEL = 1
+    ARGUMENT = 2
+    IN_OUT_HEIGHT = 3
+    IN_OUT_WIDTH = 4
+    WINOGRAD_FILTER = 5
+    DW_CONV2D_FILTER = 6
+    WEIGHT_HEIGHT = 7
+    WEIGHT_WIDTH = 8
+
+
+def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
+    # keep the same with mace/kernel/opencl/helper.cc
+    image_shape = [0, 0]
+    if buffer_type == OpenCLBufferType.CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
+        image_shape[0] = shape[1]
+        image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
+        mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[3]) * shape[2]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.ARGUMENT:
+        mace_check(len(shape) == 1,
+                   "Argument buffer should be 1D not " + str(shape))
+        image_shape[0] = roundup_div4(shape[0])
+        image_shape[1] = 1
+    elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = shape[2] * shape[3]
+        image_shape[1] = shape[0] * roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[2]) * shape[3]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1])
+        image_shape[1] = (shape[0] * (winograd_blk_size + 2)
+                          * (winograd_blk_size + 2))
+    elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = shape[0] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = shape[1] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
+        image_shape[1] = shape[0]
+    else:
+        mace_check(False, "OpenCL Image do not support type "
+                   + str(buffer_type))
+    return image_shape
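As a worked example of the IN_OUT_CHANNEL rule above: an NHWC tensor of shape [1, 224, 224, 32] maps to an image of [roundup_div4(32) * 224, 1 * 224] = [8 * 224, 224] = [1792, 224]. It is the per-dimension maxima over all such per-tensor image shapes that end up in the opencl_max_image_size argument checked by CheckGPUAvalibility at init time.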
@@ -171,6 +171,13 @@ def main(unused_args):
             output_graph_def.op.extend(cpu_graph_def.op)
             output_graph_def.mem_arena.mem_block.extend(
                 cpu_graph_def.mem_arena.mem_block)
+            output_graph_arg_names = set()
+            for arg in output_graph_def.arg:
+                output_graph_arg_names.add(arg.name)
+
+            for arg in cpu_graph_def.arg:
+                if arg.name not in output_graph_arg_names:
+                    output_graph_def.arg.extend(arg)
             print "Merge done"
         else:
             option.device = device_type_map[FLAGS.runtime]
...
@@ -163,6 +163,7 @@ class MaceKeyword(object):
     mace_op_data_type_str = 'T'
     mace_offset_str = 'offset'
     mace_from_caffe_str = 'from_caffe'
+    mace_opencl_max_image_size = "opencl_max_image_size"

 class TransformerRule(Enum):
...
@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import TransformerRule
+from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import mace_check
+from mace.python.tools.convert_util import OpenCLBufferType

 OPENCL_IMAGE_MAX_SIZE = 16384

-class OpenCLBufferType(enum.Enum):
-    CONV2D_FILTER = 0
-    IN_OUT_CHANNEL = 1
-    ARGUMENT = 2
-    IN_OUT_HEIGHT = 3
-    IN_OUT_WIDTH = 4
-    WINOGRAD_FILTER = 5
-    DW_CONV2D_FILTER = 6
-    WEIGHT_HEIGHT = 7
-    WEIGHT_WIDTH = 8

 class Transformer(base_converter.ConverterInterface):
@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
         self._producer = {}
         self._target_data_format = DataFormat.NHWC
         self._input_output_added = False
+        self._opencl_max_image_size = [0, 0]

         if self._option.device == DeviceType.CPU.value:
             self._target_data_format = DataFormat.NCHW
@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
             arg.name = MaceKeyword.mace_mode
             arg.i = 0

+        tensor_shape = list(self._consts[input_name].dims)
         if input_type == OpenCLBufferType.WINOGRAD_FILTER:
             blk_sqr = op.output_shape[0].dims[0]
             wino_blk = int(np.sqrt(blk_sqr)) - 2
             wino_arg = op_def.arg.add()
             wino_arg.name = MaceKeyword.mace_wino_block_size
             wino_arg.i = wino_blk
+            img_shape = calculate_image_shape(input_type, tensor_shape,
+                                              wino_blk)
+        else:
+            img_shape = calculate_image_shape(input_type, tensor_shape)

         op.input[input_idx] = output_name
+        # update OpenCL max image size
+        self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0],
+                                             img_shape[0])
+        self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
+                                             img_shape[1])

     def transform_buffer_image(self):
         if self._option.device != DeviceType.GPU.value:
             return False
@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
                     MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
                 self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)

+        # Add OpenCL max image size
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_max_image_size
+        arg.ints.extend(self._opencl_max_image_size)
+
         for input_node in self._option.input_nodes.values():
             new_input_name = MaceKeyword.mace_input_node_name \
                              + '_' + input_node.name
...
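Note that the tracking is a per-dimension max rather than a max over whole images: if one converted tensor maps to a [1792, 224] image and another to a [512, 896] image, the recorded opencl_max_image_size becomes [1792, 896].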
@@ -16,6 +16,24 @@ import sys
 import operator

 from mace.proto import mace_pb2
+from mace.python.tools.converter_tool import base_converter as cvt
+from mace.python.tools.convert_util import calculate_image_shape
+from mace.python.tools.convert_util import OpenCLBufferType
+
+
+class MemoryBlock(object):
+    def __init__(self, mem_type, block):
+        self._mem_type = mem_type
+        self._block = block
+
+    @property
+    def mem_type(self):
+        return self._mem_type
+
+    @property
+    def block(self):
+        return self._block

 class MemoryOptimizer(object):
     def __init__(self, net_def):
@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
         self.op_mem = {}  # op_name->mem_id
         self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
-        self.total_cpu_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
         return True

     def get_op_mem_block(self, op_type, output_shape):
-        return [reduce(operator.mul, output_shape, 1)]
+        return MemoryBlock(mace_pb2.CPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1)])

     def mem_size(self, memory_block):
-        return memory_block[0]
+        return memory_block.block[0]

     def sub_mem_block(self, mem_block1, mem_block2):
         return self.mem_size(mem_block1) - self.mem_size(mem_block2)

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        return [max(old_mem_block[0], op_mem_block[0])]
+        return MemoryBlock(
+            old_mem_block.mem_type,
+            [max(old_mem_block.block[0], op_mem_block.block[0])])

     def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
             block.y = 1

     def get_total_origin_mem_size(self):
@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
     def get_total_optimized_mem_size(self):
         optimized_mem_size = 0
         for mem in self.mem_block:
-            print mem, self.mem_block[mem]
+            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
             optimized_mem_size += self.mem_size(self.mem_block[mem])
         return optimized_mem_size
@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
                 best_mem_waste_size = sys.maxint
                 for mid in self.idle_mem:
                     old_mem_block = self.mem_block[mid]
+                    if old_mem_block.mem_type != op_mem_block.mem_type:
+                        continue
                     new_mem_block = self.resize_mem_block(
                         old_mem_block, op_mem_block)
                     add_mem_size = self.sub_mem_block(new_mem_block,
@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
-        elif op.type == 'Shape':
-            for i in range(len(op.output)):
-                mem_id = self.total_cpu_mem_count
-                self.total_cpu_mem_count += 1
-                op_mem_block = self.get_op_mem_block(
-                    op.type,
-                    op.output_shape[i].dims)
-                self.mem_block[mem_id] = op_mem_block
-            return False
         return op.type != 'ImageToBuffer'

     def get_op_mem_block(self, op_type, output_shape):
-        mem_block = [0, 0]
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_block[0] = output_shape[2]
-            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+            buffer_shape = list(output_shape) + [1]
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
+                                      buffer_shape))
        elif op_type == 'Shape':
-            mem_block[0] = output_shape[0]
-            mem_block[1] = 1
+            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                    [output_shape[0], 1])
         else:
             if len(output_shape) == 2:  # only support fc/softmax
-                mem_block[0] = int((output_shape[1] + 3) / 4)
-                mem_block[1] = output_shape[0]
+                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
             elif len(output_shape) == 4:
-                mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-                mem_block[1] = output_shape[0] * output_shape[1]
+                buffer_shape = output_shape
             else:
                 raise Exception('output shape dim size is not 2 or 4.')
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
+                                      buffer_shape))
         return mem_block

     def mem_size(self, memory_block):
-        return memory_block[0] * memory_block[1] * 4
+        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
+            return memory_block.block[0] * memory_block.block[1] * 4
+        else:
+            return memory_block.block[0]

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        resize_mem_block = [
-            max(old_mem_block[0], op_mem_block[0]),
-            max(old_mem_block[1], op_mem_block[1])
-        ]
+        resize_mem_block = MemoryBlock(
+            old_mem_block.mem_type,
+            [
+                max(old_mem_block.block[0], op_mem_block.block[0]),
+                max(old_mem_block.block[1], op_mem_block.block[1])
+            ])
         return resize_mem_block

     def add_net_mem_blocks(self):
+        max_image_size_x = 0
+        max_image_size_y = 0
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
-            block.y = self.mem_block[mem][1]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
+            block.y = self.mem_block[mem].block[1]
+            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
+                max_image_size_x = max(max_image_size_x, block.x)
+                max_image_size_y = max(max_image_size_y, block.y)
+
+        # Update OpenCL max image size
+        net_ocl_max_img_size_arg = None
+        for arg in self.net_def.arg:
+            if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
+                net_ocl_max_img_size_arg = arg
+                max_image_size_x = max(arg.ints[0], max_image_size_x)
+                max_image_size_y = max(arg.ints[1], max_image_size_y)
+                break
+        if net_ocl_max_img_size_arg is None:
+            net_ocl_max_img_size_arg = self.net_def.arg.add()
+            net_ocl_max_img_size_arg.name = \
+                cvt.MaceKeyword.mace_opencl_max_image_size
+        net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
+                                            max_image_size_y]

     def mem_id_base(self):
         return 20000
...
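Two consequences of the MemoryBlock wrapper are visible above: blocks of different mem_type are never merged (the new continue in the best-fit loop), and the cost model now differs per type. For example, a GPU_IMAGE block of [1792, 224] is scored as 1792 * 224 * 4 values (the factor of 4 presumably accounting for the four channels packed per image pixel), while a CPU_BUFFER block is scored by its flat element count.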
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
     mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
     mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
+    mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
     mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
     mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
...
@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...
@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...