From 355d1a9d5aca34480ef73d1a989cd217f090584b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= <liyin@xiaomi.com>
Date: Fri, 25 May 2018 14:06:16 +0800
Subject: [PATCH] Add mace status to op run

---
 mace/core/mace.cc                         | 15 ++---
 mace/core/net.cc                          | 16 ++----
 mace/core/net.h                           |  4 +-
 mace/core/operator.h                      |  4 +-
 mace/kernels/activation.h                 | 25 ++++-----
 mace/kernels/addn.h                       |  7 ++-
 mace/kernels/arm/conv_2d_neon.h           | 22 ++++----
 mace/kernels/batch_norm.h                 |  6 +-
 mace/kernels/bias_add.h                   |  6 +-
 mace/kernels/buffer_to_image.h            |  5 +-
 mace/kernels/channel_shuffle.h            | 16 ++++--
 mace/kernels/concat.h                     |  8 ++-
 mace/kernels/conv_2d.h                    | 15 +++--
 mace/kernels/conv_pool_2d_util.cc         | 18 ++++--
 mace/kernels/conv_pool_2d_util.h          |  6 +-
 mace/kernels/deconv_2d.h                  | 10 ++--
 mace/kernels/depth_to_space.h             | 35 +++++++-----
 mace/kernels/depthwise_conv2d.h           |  8 ++-
 mace/kernels/eltwise.h                    |  8 ++-
 mace/kernels/fully_connected.h            |  8 ++-
 mace/kernels/image_to_buffer.h            |  5 +-
 mace/kernels/local_response_norm.h        |  4 +-
 mace/kernels/matmul.h                     |  8 ++-
 mace/kernels/opencl/activation.cc         | 41 +++++++-------
 mace/kernels/opencl/addn.cc               |  7 ++-
 mace/kernels/opencl/batch_norm.cc         |  4 +-
 mace/kernels/opencl/bias_add.cc           |  4 +-
 mace/kernels/opencl/buffer_to_image.cc    |  8 ++-
 mace/kernels/opencl/channel_shuffle.cc    |  6 +-
 mace/kernels/opencl/concat.cc             |  6 +-
 mace/kernels/opencl/conv_2d.cc            |  6 +-
 mace/kernels/opencl/deconv_2d_opencl.cc   |  6 +-
 mace/kernels/opencl/depth_to_space.cc     |  6 +-
 mace/kernels/opencl/depthwise_conv.cc     |  9 +--
 mace/kernels/opencl/eltwise.cc            |  6 +-
 mace/kernels/opencl/fully_connected.cc    |  6 +-
 mace/kernels/opencl/image_to_buffer.cc    |  6 +-
 mace/kernels/opencl/matmul.cc             |  6 +-
 mace/kernels/opencl/pad.cc                |  6 +-
 mace/kernels/opencl/pooling.cc            |  6 +-
 mace/kernels/opencl/resize_bilinear.cc    |  6 +-
 mace/kernels/opencl/slice.cc              |  6 +-
 mace/kernels/opencl/softmax.cc            |  4 +-
 mace/kernels/opencl/space_to_batch.cc     | 10 +++-
 mace/kernels/opencl/winograd_transform.cc | 12 ++--
 mace/kernels/pad.h                        | 24 +++++---
 mace/kernels/pooling.h                    |  8 ++-
 mace/kernels/proposal.h                   | 10 ++--
 mace/kernels/psroi_align.h                | 11 ++--
 mace/kernels/quantize.h                   | 12 +++-
 mace/kernels/reshape.h                    |  4 +-
 mace/kernels/resize_bilinear.h            | 67 +++++++++++++----------
 mace/kernels/slice.h                      |  8 ++-
 mace/kernels/softmax.h                    | 10 +++-
 mace/kernels/space_to_batch.h             |  9 +--
 mace/kernels/transpose.h                  | 22 +++++---
 mace/kernels/winograd_transform.h         | 28 ++++++----
 mace/ops/activation.h                     |  7 +--
 mace/ops/activation_benchmark.cc          | 50 ++++++++---------
 mace/ops/addn.h                           |  6 +-
 mace/ops/batch_norm.h                     |  8 +--
 mace/ops/batch_to_space.h                 |  5 +-
 mace/ops/bias_add.h                       |  7 +--
 mace/ops/buffer_to_image.h                |  5 +-
 mace/ops/channel_shuffle.h                |  6 +-
 mace/ops/concat.h                         |  5 +-
 mace/ops/conv_2d.h                        |  6 +-
 mace/ops/deconv_2d.h                      |  6 +-
 mace/ops/depth_to_space.h                 |  5 +-
 mace/ops/depthwise_conv2d.h               |  5 +-
 mace/ops/eltwise.h                        |  5 +-
 mace/ops/folded_batch_norm.h              |  7 +--
 mace/ops/fully_connected.h                |  5 +-
 mace/ops/image_to_buffer.h                |  5 +-
 mace/ops/local_response_norm.h            |  7 +--
 mace/ops/matmul.h                         |  5 +-
 mace/ops/ops_test_util.h                  | 12 ++--
 mace/ops/pad.h                            |  5 +-
 mace/ops/pooling.h                        |  5 +-
 mace/ops/proposal.h                       |  5 +-
 mace/ops/psroi_align.h                    |  5 +-
 mace/ops/quantize.h                       | 29 +++++-----
 mace/ops/reshape.h                        |  5 +-
 mace/ops/resize_bilinear.h                |  5 +-
 mace/ops/slice.h                          |  5 +-
 mace/ops/softmax.h                        |  5 +-
 mace/ops/space_to_batch.h                 |  5 +-
 mace/ops/space_to_depth.h                 |  5 +-
 mace/ops/transpose.h                      |  7 +--
 mace/ops/winograd_inverse_transform.h     |  5 +-
 mace/ops/winograd_transform.h             |  5 +-
 mace/public/mace.h                        |  9 +++
 92 files changed, 501 insertions(+), 420 deletions(-)

diff --git a/mace/core/mace.cc b/mace/core/mace.cc
index b2a08454..f16f9f4f 100644
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -155,18 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
     }
   } else {
 #endif
-    MaceStatus status =
-      ws_->LoadModelTensor(*net_def, device_type_, model_data);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      return status;
-    }
+    MACE_FAILURE_RETURN(ws_->LoadModelTensor(
+        *net_def, device_type_, model_data));
 
   // Init model
     auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                          NetMode::INIT);
-    if (!net->Run()) {
-      LOG(FATAL) << "Net init run failed";
-    }
+    MACE_FAILURE_RETURN(net->Run());
     net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
 #ifdef MACE_ENABLE_HEXAGON
   }
@@ -226,9 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
     hexagon_controller_->ExecuteGraph(*input_tensors[0], output_tensors[0]);
   } else {
 #endif
-    if (!net_->Run(run_metadata)) {
-      LOG(FATAL) << "Net run failed";
-    }
+    MACE_FAILURE_RETURN(net_->Run(run_metadata));
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 87d9db7a..ccfc4a81 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -57,7 +57,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
   }
 }
 
-bool SerialNet::Run(RunMetadata *run_metadata) {
+MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
@@ -68,11 +68,10 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
 
-    bool ret;
     CallStats call_stats;
     if (future_wait) {
       StatsFuture future;
-      ret = op->Run(&future);
+      MACE_FAILURE_RETURN(op->Run(&future));
       if (run_metadata != nullptr) {
         future.wait_fn(&call_stats);
       } else {
@@ -80,10 +79,10 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
       }
     } else if (run_metadata != nullptr) {
       call_stats.start_micros = NowMicros();
-      ret = op->Run(nullptr);
+      MACE_FAILURE_RETURN(op->Run(nullptr));
       call_stats.end_micros = NowMicros();
     } else {
-      ret = op->Run(nullptr);
+      MACE_FAILURE_RETURN(op->Run(nullptr));
     }
 
     if (run_metadata != nullptr) {
@@ -117,16 +116,11 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
       run_metadata->op_stats.emplace_back(op_stats);
     }
 
-    if (!ret) {
-      LOG(ERROR) << "Operator failed: " << op->debug_def().name();
-      return false;
-    }
-
     VLOG(3) << "Operator " << op->debug_def().name()
             << " has shape: " << MakeString(op->Output(0)->shape());
   }
 
-  return true;
+  return MACE_SUCCESS;
 }
 
 std::unique_ptr<NetBase> CreateNet(
diff --git a/mace/core/net.h b/mace/core/net.h
index 4697c450..efc04d5e 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -36,7 +36,7 @@ class NetBase {
           DeviceType type);
   virtual ~NetBase() noexcept {}
 
-  virtual bool Run(RunMetadata *run_metadata = nullptr) = 0;
+  virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
 
   const std::string &Name() const { return name_; }
 
@@ -55,7 +55,7 @@ class SerialNet : public NetBase {
             DeviceType type,
             const NetMode mode = NetMode::NORMAL);
 
-  bool Run(RunMetadata *run_metadata = nullptr) override;
+  MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
 
  protected:
   std::vector<std::unique_ptr<OperatorBase> > operators_;
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 11b1f889..118279a3 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -73,7 +73,7 @@ class OperatorBase {
   inline const std::vector<Tensor *> &Outputs() { return outputs_; }
 
   // Run Op asynchronously (depends on device), return a future if not nullptr.
-  virtual bool Run(StatsFuture *future) = 0;
+  virtual MaceStatus Run(StatsFuture *future) = 0;
 
   inline const OperatorDef &debug_def() const {
     MACE_CHECK(has_debug_def(), "operator_def was null!");
@@ -130,7 +130,7 @@ class Operator : public OperatorBase {
       }
     }
   }
-  bool Run(StatsFuture *future) override = 0;
+  MaceStatus Run(StatsFuture *future) override = 0;
   ~Operator() noexcept override {}
 };
 
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 961d4884..9979b542 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -132,10 +132,10 @@ class ActivationFunctor<DeviceType::CPU, float> {
   ActivationFunctor(ActivationType type, float relux_max_limit)
       : activation_(type), relux_max_limit_(relux_max_limit) {}
 
-  void operator()(const Tensor *input,
-                  const Tensor *alpha,
-                  Tensor *output,
-                  StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *alpha,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     const float *input_ptr = input->data<float>();
     float *output_ptr = output->mutable_data<float>();
@@ -144,16 +144,13 @@ class ActivationFunctor<DeviceType::CPU, float> {
       const float *alpha_ptr = alpha->data<float>();
       const index_t outer_size = output->dim(0);
       const index_t inner_size = output->dim(2) * output->dim(3);
-      PReLUActivation(input_ptr,
-                      outer_size,
-                      input->dim(1),
-                      inner_size,
-                      alpha_ptr,
-                      output_ptr);
+      PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size,
+                      alpha_ptr, output_ptr);
     } else {
       DoActivation(input_ptr, output_ptr, output->size(), activation_,
                    relux_max_limit_);
     }
+    return MACE_SUCCESS;
   }
 
  private:
@@ -168,10 +165,10 @@ class ActivationFunctor<DeviceType::GPU, T> {
   ActivationFunctor(ActivationType type, T relux_max_limit)
       : activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
 
-  void operator()(const Tensor *input,
-                  const Tensor *alpha,
-                  Tensor *output,
-                  StatsFuture *future);
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *alpha,
+                        Tensor *output,
+                        StatsFuture *future);
 
  private:
   ActivationType activation_;
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index c61be7d2..dd98ee09 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -36,11 +36,11 @@ constexpr int kCostPerGroup = 1024;
 
 template <DeviceType D, typename T>
 struct AddNFunctor {
-  void operator()(const std::vector<const Tensor *> &input_tensors,
+  MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor,
                   StatsFuture *future) {
     MACE_UNUSED(future);
-    output_tensor->ResizeLike(input_tensors[0]);
+    MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensors[0]));
     index_t size = output_tensor->size();
     Tensor::MappingGuard output_map(output_tensor);
     float *output_data = output_tensor->mutable_data<float>();
@@ -89,13 +89,14 @@ struct AddNFunctor {
         }
       }
     }
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct AddNFunctor<DeviceType::GPU, T> {
-  void operator()(const std::vector<const Tensor *> &input_tensors,
+  MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor,
                   StatsFuture *future);
 
diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h
index 59a24dc6..dd0ecde0 100644
--- a/mace/kernels/arm/conv_2d_neon.h
+++ b/mace/kernels/arm/conv_2d_neon.h
@@ -20,7 +20,7 @@
 namespace mace {
 namespace kernels {
 
-extern void Conv2dNeonK1x1S1(const float *input,
+void Conv2dNeonK1x1S1(const float *input,
                              const float *filter,
                              const index_t batch,
                              const index_t height,
@@ -29,61 +29,61 @@ extern void Conv2dNeonK1x1S1(const float *input,
                              const index_t out_channels,
                              float *output);
 
-extern void Conv2dNeonK3x3S1(const float *input,
+void Conv2dNeonK3x3S1(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK3x3S2(const float *input,
+void Conv2dNeonK3x3S2(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK5x5S1(const float *input,
+void Conv2dNeonK5x5S1(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK1x7S1(const float *input,
+void Conv2dNeonK1x7S1(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK7x1S1(const float *input,
+void Conv2dNeonK7x1S1(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK7x7S1(const float *input,
+void Conv2dNeonK7x7S1(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK7x7S2(const float *input,
+void Conv2dNeonK7x7S2(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK7x7S3(const float *input,
+void Conv2dNeonK7x7S3(const float *input,
                              const float *filter,
                              const index_t *in_shape,
                              const index_t *out_shape,
                              float *output);
 
-extern void Conv2dNeonK1x15S1(const float *input,
+void Conv2dNeonK1x15S1(const float *input,
                               const float *filter,
                               const index_t *in_shape,
                               const index_t *out_shape,
                               float *output);
 
-extern void Conv2dNeonK15x1S1(const float *input,
+void Conv2dNeonK15x1S1(const float *input,
                               const float *filter,
                               const index_t *in_shape,
                               const index_t *out_shape,
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 0b6fddb5..6f934e6b 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -56,7 +56,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
                    const float relux_max_limit)
     : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *scale,
                   const Tensor *offset,
                   const Tensor *mean,
@@ -124,6 +124,8 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
     }
     DoActivation(output_ptr, output_ptr, output->size(), activation_,
                  relux_max_limit_);
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -134,7 +136,7 @@ struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
                    const ActivationType activation,
                    const float relux_max_limit)
     : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *scale,
                   const Tensor *offset,
                   const Tensor *mean,
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index 2ce7904d..cf09c8a5 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -34,7 +34,7 @@ struct BiasAddFunctor;
 
 template<>
 struct BiasAddFunctor<DeviceType::CPU, float> {
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
@@ -61,13 +61,15 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct BiasAddFunctor<DeviceType::GPU, T> {
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h
index 8ce3f994..a93af90b 100644
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -33,7 +33,7 @@ struct BufferToImageFunctorBase {
 template <DeviceType D, typename T>
 struct BufferToImageFunctor : BufferToImageFunctorBase {
   BufferToImageFunctor() {}
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future) {
@@ -42,13 +42,14 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
     MACE_UNUSED(output);
     MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
+    return MACE_SUCCESS;
   }
 };
 
 template <typename T>
 struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
   BufferToImageFunctor() {}
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future);
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index 4b89764e..5310377b 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -28,11 +28,11 @@ template<DeviceType D, typename T>
 struct ChannelShuffleFunctor {
   explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
 
-  void operator()(const Tensor *input,
-                  Tensor *output,
-                  StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
-    output->ResizeLike(input);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
 
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
@@ -57,10 +57,12 @@ struct ChannelShuffleFunctor {
         index_t idx = c / groups_;
         for (index_t hw = 0; hw < height * width; ++hw) {
           output_base[c * image_size + hw] = input_base[
-            (g * channels_per_group + idx) * image_size + hw];
+              (g * channels_per_group + idx) * image_size + hw];
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 
   const int groups_;
@@ -71,7 +73,9 @@ template<typename T>
 struct ChannelShuffleFunctor<DeviceType::GPU, T> {
   explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index f3139b58..6425f9f7 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -40,7 +40,7 @@ template <DeviceType D, typename T>
 struct ConcatFunctor : ConcatFunctorBase {
   explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
-  void operator()(const std::vector<const Tensor *> &input_list,
+  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
                   StatsFuture *future) {
     MACE_UNUSED(future);
@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
       outer_sizes[i] = input->size() / inner_size;
       output_shape[axis_] += input->dim(axis_);
     }
-    output->Resize(output_shape);
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
 
     T *output_ptr = output->mutable_data<T>();
 
@@ -89,6 +89,8 @@ struct ConcatFunctor : ConcatFunctorBase {
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -97,7 +99,7 @@ template <typename T>
 struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
   explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
-  void operator()(const std::vector<const Tensor *> &input_list,
+  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 53531324..85db043c 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -256,7 +256,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     }  // b
   }
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          RoundType::FLOOR,
                          output_shape.data());
     }
-    output->Resize(output_shape);
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
 
     index_t batch = output->dim(0);
     index_t channels = output->dim(1);
@@ -497,7 +497,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
         if (is_filter_transformed_) {
           transformed_filter_ptr = filter_data;
         } else {
-          transformed_filter_.Resize(transformed_filter_shape);
+          MACE_FAILURE_RETURN(transformed_filter_.Resize(
+              transformed_filter_shape));
           switch (winograd_out_tile_size) {
             case 2:
               TransformFilter4x4(filter_data,
@@ -643,12 +644,12 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     const Tensor *pad_input_ptr = input;
     if (extra_input_height != input_height
       || extra_input_width != input_width) {
-      ConstructNCHWInputWithSpecificPadding(input,
+      MACE_FAILURE_RETURN(ConstructNCHWInputWithSpecificPadding(input,
                                             pad_top,
                                             pad_bottom,
                                             pad_left,
                                             pad_right,
-                                            &padded_input);
+                                            &padded_input));
       pad_input_ptr = &padded_input;
     }
 
@@ -701,6 +702,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 
     DoActivation(output_data, output_data, output->size(), activation_,
                  relux_max_limit_);
+
+    return MACE_SUCCESS;
   }
 
   Tensor transformed_filter_;
@@ -729,7 +732,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
     MACE_UNUSED(scratch);
   }
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc
index 07c72cb3..b7a63a91 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -286,7 +286,7 @@ void CalPaddingSize(const index_t *input_shape,   // NCHW
 }
 
 
-void ConstructNCHWInputWithPadding(const Tensor *input_tensor,
+MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
                                    const int *paddings,
                                    Tensor *output_tensor,
                                    bool padding_same_value) {
@@ -306,7 +306,7 @@ void ConstructNCHWInputWithPadding(const Tensor *input_tensor,
   const int padded_top = paddings[0] / 2;
   const int padded_left = paddings[1] / 2;
 
-  output_tensor->Resize(output_shape);
+  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
 
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
@@ -356,9 +356,11 @@ void ConstructNCHWInputWithPadding(const Tensor *input_tensor,
       }
     }
   }
+
+  return MACE_SUCCESS;
 }
 
-void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
+MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
                                            const int pad_top,
                                            const int pad_bottom,
                                            const int pad_left,
@@ -376,7 +378,7 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
   const int pad_width = pad_left + pad_right;
   std::vector<index_t> output_shape(
     {batch, channels, height + pad_height, width + pad_width});
-  output_tensor->Resize(output_shape);
+  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
   output_tensor->Clear();
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
@@ -400,10 +402,12 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
       // Skip the padded bottom in this channel and top in the next channel
     }
   }
+
+  return MACE_SUCCESS;
 }
 
 
-void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
+MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
                                    const int *paddings,
                                    Tensor *output_tensor,
                                    bool padding_same_value) {
@@ -424,7 +428,7 @@ void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
   const int padded_top = paddings[0] / 2;
   const int padded_left = paddings[1] / 2;
 
-  output_tensor->Resize(output_shape);
+  MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
 
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
@@ -450,6 +454,8 @@ void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
       }
     }
   }
+
+  return MACE_SUCCESS;
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h
index 8c7420a1..0f0909a3 100644
--- a/mace/kernels/conv_pool_2d_util.h
+++ b/mace/kernels/conv_pool_2d_util.h
@@ -71,17 +71,17 @@ void CalPaddingSize(const index_t *input_shape,   // NCHW
                     Padding padding,
                     int *padding_size);
 
-void ConstructNCHWInputWithSpecificPadding(const Tensor *input,
+MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input,
                                const int pad_top, const int pad_bottom,
                                const int pad_left, const int pad_right,
                                Tensor *output_tensor);
 
-void ConstructNCHWInputWithPadding(const Tensor *input,
+MaceStatus ConstructNCHWInputWithPadding(const Tensor *input,
                                    const int *paddings,
                                    Tensor *output_tensor,
                                    bool padding_same_value = false);
 
-void ConstructNHWCInputWithPadding(const Tensor *input,
+MaceStatus ConstructNHWCInputWithPadding(const Tensor *input,
                                    const int *paddings,
                                    Tensor *output_tensor,
                                    bool padding_same_value = false);
diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index 7c20addd..7ccef5e9 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -226,7 +226,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                             activation,
                             relux_max_limit) {}
 
-  void operator()(const Tensor *input,   // NCHW
+  MaceStatus operator()(const Tensor *input,   // NCHW
                   const Tensor *filter,  // OIHW
                   const Tensor *bias,
                   Tensor *output,
@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
           strides_, padding_type_,
           output_shape.data(),
           paddings_.data(), true);
-      output->Resize(output_shape);
+      MACE_FAILURE_RETURN(output->Resize(output_shape));
     } else {
       output_shape_.clear();
       output_shape_ = std::vector<index_t>(4, 0);
@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                            strides_,
                            output_shape_.data(),
                            paddings_.data(), true);
-      output->Resize(output_shape_);
+      MACE_FAILURE_RETURN(output->Resize(output_shape_));
     }
     index_t kernel_h = filter->dim(2);
     index_t kernel_w = filter->dim(3);
@@ -298,6 +298,8 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                  output->size(),
                  activation_,
                  relux_max_limit_);
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -317,7 +319,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
                             activation,
                             relux_max_limit) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 2afd905b..cdd4a91f 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -31,8 +31,10 @@ namespace kernels {
 template<DeviceType D, typename T>
 struct DepthToSpaceOpFunctor {
   explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
-    : block_size_(block_size), d2s_(d2s) {}
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+      : block_size_(block_size), d2s_(d2s) {}
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     const index_t batch_size = input->dim(0);
     const index_t input_depth = input->dim(1);
@@ -53,7 +55,7 @@ struct DepthToSpaceOpFunctor {
     std::vector<index_t> output_shape = {batch_size, output_depth,
                                          output_height, output_width};
 
-    output->Resize(output_shape);
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
 
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
@@ -71,14 +73,15 @@ struct DepthToSpaceOpFunctor {
               const index_t in_w = w / block_size_;
               const index_t offset_w = w % block_size_;
               const index_t offset_d =
-                (offset_h * block_size_ + offset_w) * output_depth;
+                  (offset_h * block_size_ + offset_w) * output_depth;
 
               const index_t in_d = d + offset_d;
               const index_t o_index =
-                ((b * output_depth + d) * output_height + h) * output_width + w;
+                  ((b * output_depth + d) * output_height + h) * output_width
+                      + w;
               const index_t i_index =
-                ((b * input_depth + in_d) * input_height + in_h) * input_width
-                  + in_w;
+                  ((b * input_depth + in_d) * input_height + in_h) * input_width
+                      + in_w;
               output_ptr[o_index] = input_ptr[i_index];
             }
           }
@@ -95,21 +98,23 @@ struct DepthToSpaceOpFunctor {
               const index_t out_w = w / block_size_;
               const index_t offset_w = (w % block_size_);
               const index_t offset_d =
-                (offset_h * block_size_ + offset_w) * input_depth;
+                  (offset_h * block_size_ + offset_w) * input_depth;
 
               const index_t out_d = d + offset_d;
               const index_t o_index =
-                ((b * output_depth + out_d) * output_height + out_h)
-                  * output_width + out_w;
+                  ((b * output_depth + out_d) * output_height + out_h)
+                      * output_width + out_w;
               const index_t i_index =
-                ((b * input_depth + d) * input_height + h) * input_width
-                  + w;
+                  ((b * input_depth + d) * input_height + h) * input_width
+                      + w;
               output_ptr[o_index] = input_ptr[i_index];
             }
           }
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 
   const int block_size_;
@@ -120,8 +125,10 @@ struct DepthToSpaceOpFunctor {
 template<typename T>
 struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
   DepthToSpaceOpFunctor(const int block_size, bool d2s)
-    : block_size_(block_size), d2s_(d2s) {}
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+      : block_size_(block_size), d2s_(d2s) {}
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future);
 
   const int block_size_;
   bool d2s_;
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index a276b504..c864ceca 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -127,7 +127,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
     }
   }
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
                          RoundType::FLOOR,
                          output_shape.data());
     }
-    output->Resize(output_shape);
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
     output->Clear();
 
     index_t batch = output->dim(0);
@@ -275,6 +275,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
 
     DoActivation(output_data, output_data, output->size(), activation_,
                  relux_max_limit_);
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -295,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
                                  activation,
                                  relux_max_limit) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 2e7bb769..d5d0e77e 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -466,7 +466,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
                  const float value)
       : EltwiseFunctorBase(type, coeff, value) {}
 
-  void operator()(const Tensor *input0,
+  MaceStatus operator()(const Tensor *input0,
                   const Tensor *input1,
                   Tensor *output,
                   StatsFuture *future) {
@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
         }
       }
     }
-    output->ResizeLike(input0);
+    MACE_FAILURE_RETURN(output->ResizeLike(input0));
 
     Tensor::MappingGuard input0_guard(input0);
     Tensor::MappingGuard output_guard(output);
@@ -530,6 +530,8 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -541,7 +543,7 @@ struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
                  const float value)
       : EltwiseFunctorBase(type, coeff, value) {}
 
-  void operator()(const Tensor *input0,
+  MaceStatus operator()(const Tensor *input0,
                   const Tensor *input1,
                   Tensor *output,
                   StatsFuture *future);
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index e67603a5..005dd150 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -50,14 +50,14 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
                         const float relux_max_limit)
       : FullyConnectedBase(activation, relux_max_limit) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *weight,
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
-    output->Resize(output_shape);
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
     const index_t N = output->dim(0);
     const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3);
     const index_t output_size = weight->dim(0);
@@ -80,6 +80,8 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
 
     DoActivation(output_ptr, output_ptr, output->size(), activation_,
                  relux_max_limit_);
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -90,7 +92,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
                         const float relux_max_limit)
       : FullyConnectedBase(activation, relux_max_limit) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *weight,
                   const Tensor *bias,
                   Tensor *output,
diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h
index b6a7370d..22ce6af5 100644
--- a/mace/kernels/image_to_buffer.h
+++ b/mace/kernels/image_to_buffer.h
@@ -33,7 +33,7 @@ struct ImageToBufferFunctorBase {
 template <DeviceType D, typename T>
 struct ImageToBufferFunctor : ImageToBufferFunctorBase {
   ImageToBufferFunctor() {}
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future) {
@@ -42,13 +42,14 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
     MACE_UNUSED(output);
     MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
+    return MACE_SUCCESS;
   }
 };
 
 template <typename T>
 struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
   ImageToBufferFunctor() {}
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future);
diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.h
index df156042..0af86327 100644
--- a/mace/kernels/local_response_norm.h
+++ b/mace/kernels/local_response_norm.h
@@ -35,7 +35,7 @@ struct LocalResponseNormFunctor;
 
 template<>
 struct LocalResponseNormFunctor<DeviceType::CPU, float> {
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   int depth_radius,
                   float bias,
                   float alpha,
@@ -74,6 +74,8 @@ struct LocalResponseNormFunctor<DeviceType::CPU, float> {
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 3b189261..303c34c9 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -38,13 +38,13 @@ namespace kernels {
 
 template<DeviceType D, typename T>
 struct MatMulFunctor {
-  void operator()(const Tensor *A,
+  MaceStatus operator()(const Tensor *A,
                   const Tensor *B,
                   Tensor *C,
                   StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
-    C->Resize(c_shape);
+    MACE_FAILURE_RETURN(C->Resize(c_shape));
 
     Tensor::MappingGuard guarda(A);
     Tensor::MappingGuard guardb(B);
@@ -63,13 +63,15 @@ struct MatMulFunctor {
     memset(c_ptr_base, 0, batch * height * width * sizeof(T));
 
     Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
+
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct MatMulFunctor<DeviceType::GPU, T> {
-  void operator()(const Tensor *A,
+  MaceStatus operator()(const Tensor *A,
                   const Tensor *B,
                   Tensor *C,
                   StatsFuture *future);
diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc
index 6b556966..3288127a 100644
--- a/mace/kernels/opencl/activation.cc
+++ b/mace/kernels/opencl/activation.cc
@@ -21,11 +21,12 @@
 
 namespace mace {
 namespace kernels {
-template <typename T>
-void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
-                                                          const Tensor *alpha,
-                                                          Tensor *output,
-                                                          StatsFuture *future) {
+template<typename T>
+MaceStatus ActivationFunctor<DeviceType::GPU,
+                             T>::operator()(const Tensor *input,
+                                            const Tensor *alpha,
+                                            Tensor *output,
+                                            StatsFuture *future) {
   const index_t batch = input->dim(0);
   const index_t height = input->dim(1);
   const index_t width = input->dim(2);
@@ -45,7 +46,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsOutOfRangeCheckEnabled()) {
       built_options.emplace("-DOUT_OF_RANGE_CHECK");
       kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::GPU))));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU))));
       kernel_error_->Allocate(1);
       kernel_error_->Map(nullptr);
       *(kernel_error_->mutable_data<char>()) = 0;
@@ -55,28 +56,22 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     switch (activation_) {
-      case RELU:
-        tuning_key_prefix_ = "relu_opencl_kernel";
+      case RELU:tuning_key_prefix_ = "relu_opencl_kernel";
         built_options.emplace("-DUSE_RELU");
         break;
-      case RELUX:
-        tuning_key_prefix_ = "relux_opencl_kernel";
+      case RELUX:tuning_key_prefix_ = "relux_opencl_kernel";
         built_options.emplace("-DUSE_RELUX");
         break;
-      case PRELU:
-        tuning_key_prefix_ = "prelu_opencl_kernel";
+      case PRELU:tuning_key_prefix_ = "prelu_opencl_kernel";
         built_options.emplace("-DUSE_PRELU");
         break;
-      case TANH:
-        tuning_key_prefix_ = "tanh_opencl_kernel";
+      case TANH:tuning_key_prefix_ = "tanh_opencl_kernel";
         built_options.emplace("-DUSE_TANH");
         break;
-      case SIGMOID:
-        tuning_key_prefix_ = "sigmoid_opencl_kernel";
+      case SIGMOID:tuning_key_prefix_ = "sigmoid_opencl_kernel";
         built_options.emplace("-DUSE_SIGMOID");
         break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
+      default:LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
 
@@ -92,7 +87,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     int idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_.setArg(idx++,
-          *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
+                     *(static_cast<cl::Buffer *>(kernel_error_->buffer())));
     }
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
@@ -122,9 +117,13 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
-template struct ActivationFunctor<DeviceType::GPU, float>;
-template struct ActivationFunctor<DeviceType::GPU, half>;
+template
+struct ActivationFunctor<DeviceType::GPU, float>;
+template
+struct ActivationFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index c47213f5..5325f9cd 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void AddNFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_tensors,
     Tensor *output_tensor,
     StatsFuture *future) {
@@ -87,7 +87,8 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                     &output_image_shape);
-    output_tensor->ResizeImage(output_shape, output_image_shape);
+    MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape,
+                                                   output_image_shape));
 
     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -118,6 +119,8 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct AddNFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc
index 80fafdbc..01bb3399 100644
--- a/mace/kernels/opencl/batch_norm.cc
+++ b/mace/kernels/opencl/batch_norm.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *scale,
                                                          const Tensor *offset,
                                                          const Tensor *mean,
@@ -132,6 +132,8 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct BatchNormFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc
index e50dcf58..136cd114 100644
--- a/mace/kernels/opencl/bias_add.cc
+++ b/mace/kernels/opencl/bias_add.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                        const Tensor *bias,
                                                        Tensor *output,
                                                        StatsFuture *future) {
@@ -115,6 +115,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct BiasAddFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index bf629e37..7a23ad89 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -20,7 +20,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *buffer,
     const BufferType type,
     Tensor *image,
@@ -30,9 +30,9 @@ void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
   CalImage2DShape(buffer->shape(), type, &image_shape);
   if (type == WINOGRAD_FILTER) {
     std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
-    image->ResizeImage(new_shape, image_shape);
+    MACE_FAILURE_RETURN(image->ResizeImage(new_shape, image_shape));
   } else {
-    image->ResizeImage(buffer->shape(), image_shape);
+    MACE_FAILURE_RETURN(image->ResizeImage(buffer->shape(), image_shape));
   }
 
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
@@ -175,6 +175,8 @@ void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct BufferToImageFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index d16a3d8a..d6715e1f 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -23,11 +23,11 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     Tensor *output,
     StatsFuture *future) {
-  output->ResizeLike(input);
+  MACE_FAILURE_RETURN(output->ResizeLike(input));
 
   const index_t batch = input->dim(0);
   const index_t height = input->dim(1);
@@ -103,6 +103,8 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 23904100..32343c11 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -235,7 +235,7 @@ static void ConcatN(cl::Kernel *kernel,
 }
 
 template <typename T>
-void ConcatFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
@@ -266,7 +266,7 @@ void ConcatFunctor<DeviceType::GPU, T>::operator()(
       "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  output->ResizeImage(output_shape, image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
 
   switch (inputs_count) {
     case 2:
@@ -281,6 +281,8 @@ void ConcatFunctor<DeviceType::GPU, T>::operator()(
         MACE_NOT_IMPLEMENTED;
       }
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct ConcatFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc
index 9a66d4b9..ce15dad0 100644
--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          std::unique_ptr<BufferBase> *kernel_error);
 
 template <typename T>
-void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       const Tensor *filter,
                                                       const Tensor *bias,
                                                       Tensor *output,
@@ -111,7 +111,7 @@ void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1] != nullptr) {
@@ -126,6 +126,8 @@ void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                  DataTypeToEnum<T>::value, &input_shape_, output, future,
                  &kwg_size_, &kernel_error_);
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct Conv2dFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc
index abb4b43e..946b77af 100644
--- a/mace/kernels/opencl/deconv_2d_opencl.cc
+++ b/mace/kernels/opencl/deconv_2d_opencl.cc
@@ -154,7 +154,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
 }  // namespace
 
 template <typename T>
-void Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       const Tensor *filter,
                                                       const Tensor *bias,
                                                       Tensor *output,
@@ -185,13 +185,15 @@ void Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  output->ResizeImage(output_shape_, output_image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape_, output_image_shape));
 
   Deconv2dOpencl(&kernel_, input, filter, bias,
                  strides_[0], paddings_.data(),
                  activation_, relux_max_limit_,
                  DataTypeToEnum<T>::value, &input_shape_,
                  output, future, &kwg_size_, &kernel_error_);
+
+  return MACE_SUCCESS;
 }
 
 template struct Deconv2dFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc
index 609ad205..ab713161 100644
--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
   const index_t batch = input->dim(0);
   const index_t input_height = input->dim(1);
@@ -70,7 +70,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
 
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  output->ResizeImage(output_shape, image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
 
   auto runtime = OpenCLRuntime::Global();
 
@@ -144,6 +144,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index c7800d0a..2711ee97 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -194,7 +194,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
 }
 
 template <typename T>
-void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const Tensor *filter, /* MIHW */
     const Tensor *bias,
@@ -209,10 +209,9 @@ void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
                  << " stride " << strides_[0] << "x" << strides_[1]
                  << " is not implemented yet, using slow version";
     // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    DepthwiseConv2dFunctor<DeviceType::CPU, float>(
+    return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
         strides_, padding_type_, paddings_, dilations_, activation_,
         relux_max_limit_)(input, filter, bias, output, future);
-    return;
   }
 
   // Create a fake conv_2d filter to calculate the paddings and output size
@@ -238,12 +237,14 @@ void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                   dilations_, activation_, relux_max_limit_,
                   DataTypeToEnum<T>::value, &input_shape_, output, future,
                   &kwg_size_, &kernel_error_);
+
+  return MACE_SUCCESS;
 }
 
 template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc
index 4f059046..4cedb051 100644
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
+MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                                                        const Tensor *input1,
                                                        Tensor *output,
                                                        StatsFuture *future) {
@@ -60,7 +60,7 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   CalImage2DShape(output_shape,
                   BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -151,6 +151,8 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct EltwiseFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc
index 6e0678da..6ebfdef0 100644
--- a/mace/kernels/opencl/fully_connected.cc
+++ b/mace/kernels/opencl/fully_connected.cc
@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel,
 }  // namespace
 
 template <typename T>
-void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const Tensor *weight,
     const Tensor *bias,
@@ -292,11 +292,13 @@ void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
   FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
                 activation_, &gws_, &lws_, relux_max_limit_, future,
                 &kernel_error_);
+
+  return MACE_SUCCESS;
 }
 
 template struct FullyConnectedFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc
index 1cefff9e..8b83b88e 100644
--- a/mace/kernels/opencl/image_to_buffer.cc
+++ b/mace/kernels/opencl/image_to_buffer.cc
@@ -20,7 +20,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *image,
     const BufferType type,
     Tensor *buffer,
@@ -28,7 +28,7 @@ void ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
 
   std::vector<size_t> image_shape;
   CalImage2DShape(image->shape(), type, &image_shape);
-  buffer->Resize(image->shape());
+  MACE_FAILURE_RETURN(buffer->Resize(image->shape()));
 
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
@@ -163,6 +163,8 @@ void ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct ImageToBufferFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index cc63ed04..e47698a8 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
+MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                       const Tensor *B,
                                                       Tensor *C,
                                                       StatsFuture *future) {
@@ -29,7 +29,7 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
   std::vector<size_t> c_image_shape;
   CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
-  C->ResizeImage(c_shape, c_image_shape);
+  MACE_FAILURE_RETURN(C->ResizeImage(c_shape, c_image_shape));
 
   const index_t batch = C->dim(0);
   const index_t height = C->dim(1);
@@ -98,6 +98,8 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct MatMulFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index 34fbf659..fe2a51ef 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {
 
 template<typename T>
-void PadFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     Tensor *output,
     StatsFuture *future) {
@@ -39,7 +39,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
 
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  output->ResizeImage(output_shape, image_shape);
+  MACE_FAILURE_RETURN(output->ResizeImage(output_shape, image_shape));
 
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -114,6 +114,8 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template
diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc
index 8a9f91e9..b208c529 100644
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -44,7 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
 template <typename T>
-void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
+MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                        Tensor *output,
                                                        StatsFuture *future) {
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
@@ -108,7 +108,7 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                     &output_image_shape);
-    output->ResizeImage(output_shape, output_image_shape);
+    MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
     index_t batch = output->dim(0);
     index_t out_height = output->dim(1);
@@ -169,6 +169,8 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct PoolingFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc
index 0c86cae8..5fba4af2 100644
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -51,7 +51,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
 template <typename T>
-void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
   const index_t batch = input->dim(0);
   const index_t in_height = input->dim(1);
@@ -100,7 +100,7 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                     &output_image_shape);
-    output->ResizeImage(output_shape, output_image_shape);
+    MACE_FAILURE_RETURN(output->ResizeImage(output_shape, output_image_shape));
 
     float height_scale =
         CalculateResizeScale(in_height, out_height, align_corners_);
@@ -140,6 +140,8 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 21fdbca1..b77a0bdb 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {
 
 template<typename T>
-void SliceFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const std::vector<Tensor *> &output_list,
     StatsFuture *future) {
@@ -36,7 +36,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   for (size_t i= 0; i < outputs_count; ++i) {
-    output_list[i]->ResizeImage(output_shape, image_shape);
+    MACE_FAILURE_RETURN(output_list[i]->ResizeImage(output_shape, image_shape));
   }
 
   auto runtime = OpenCLRuntime::Global();
@@ -131,6 +131,8 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
       }
     };
   }
+
+  return MACE_SUCCESS;
 }
 
 template
diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc
index 8e5be845..b1748ee3 100644
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -44,7 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
 }  // namespace
 
 template <typename T>
-void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
+MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                                                        Tensor *output,
                                                        StatsFuture *future) {
   const index_t batch = logits->dim(0);
@@ -115,6 +115,8 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct SoftmaxFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc
index c3c45f0b..456434b7 100644
--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
@@ -25,7 +25,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     Tensor *space_tensor,
     Tensor *batch_tensor,
     StatsFuture *future) {
@@ -45,10 +45,12 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
   if (b2s_) {
-    space_tensor->ResizeImage(output_shape, output_image_shape);
+    MACE_FAILURE_RETURN(space_tensor->ResizeImage(output_shape,
+                                                  output_image_shape));
     kernel_name = "batch_to_space";
   } else {
-    batch_tensor->ResizeImage(output_shape, output_image_shape);
+    MACE_FAILURE_RETURN(batch_tensor->ResizeImage(output_shape,
+                                                  output_image_shape));
     kernel_name = "space_to_batch";
   }
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
@@ -129,6 +131,8 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index da7dea0b..70e4dcc5 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
 
   auto runtime = OpenCLRuntime::Global();
@@ -78,7 +78,7 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     output_shape = {16, input_tensor->dim(3), out_width, 1};
     std::vector<size_t> image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
-    output_tensor->ResizeImage(output_shape, image_shape);
+    MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape));
 
     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -115,10 +115,12 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template <typename T>
-void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
+MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor,
     const Tensor *bias,
     Tensor *output_tensor,
@@ -186,7 +188,7 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
                                          input_tensor->dim(1)};
     std::vector<size_t> image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-    output_tensor->ResizeImage(output_shape, image_shape);
+    MACE_FAILURE_RETURN(output_tensor->ResizeImage(output_shape, image_shape));
 
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
@@ -230,6 +232,8 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
+
+  return MACE_SUCCESS;
 }
 
 template struct WinogradTransformFunctor<DeviceType::GPU, float>;
diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h
index 1e0e5ba4..105cf242 100644
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -38,23 +38,27 @@ struct PadFunctorBase {
   float constant_value_;
 };
 
-template <DeviceType D, typename T>
+template<DeviceType D, typename T>
 struct PadFunctor : public PadFunctorBase {
   PadFunctor(const std::vector<int> &paddings,
              const float constant_value)
       : PadFunctorBase(paddings, constant_value) {}
 
-  void operator()(const Tensor *input,
-                  Tensor *output,
-                  StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     MACE_CHECK(
         this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
     auto input_shape = input->shape();
-    output->Resize({input_shape[0] + this->paddings_[0] + this->paddings_[1],
-                    input_shape[1] + this->paddings_[2] + this->paddings_[3],
-                    input_shape[2] + this->paddings_[4] + this->paddings_[5],
-                    input_shape[3] + this->paddings_[6] + this->paddings_[7]});
+    MACE_FAILURE_RETURN(output->Resize({input_shape[0] + this->paddings_[0]
+                                            + this->paddings_[1],
+                                        input_shape[1] + this->paddings_[2]
+                                            + this->paddings_[3],
+                                        input_shape[2] + this->paddings_[4]
+                                            + this->paddings_[5],
+                                        input_shape[3] + this->paddings_[6]
+                                            + this->paddings_[7]}));
 
     Tensor::MappingGuard input_guard(input);
     Tensor::MappingGuard output_guard(output);
@@ -81,6 +85,8 @@ struct PadFunctor : public PadFunctorBase {
         }
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -91,7 +97,7 @@ struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
              const float constant_value)
       : PadFunctorBase(paddings, constant_value) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   Tensor *output,
                   StatsFuture *future);
 
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 9c510b34..2632966f 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -167,7 +167,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
     }
   }
 
-  void operator()(const Tensor *input_tensor,
+  MaceStatus operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future) {
     MACE_UNUSED(future);
@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
                          RoundType::CEIL,
                          output_shape.data());
     }
-    output_tensor->Resize(output_shape);
+    MACE_FAILURE_RETURN(output_tensor->Resize(output_shape));
 
     Tensor::MappingGuard input_guard(input_tensor);
     Tensor::MappingGuard output_guard(output_tensor);
@@ -220,6 +220,8 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -235,7 +237,7 @@ struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
       : PoolingFunctorBase(
             pooling_type, kernels, strides, padding_type, paddings, dilations) {
   }
-  void operator()(const Tensor *input_tensor,
+  MaceStatus operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future);
 
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index 273d17b8..c61a031b 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -136,7 +136,7 @@ struct ProposalFunctor {
       feat_stride_(feat_stride),
       anchors_(GenerateAnchors(scales, ratios, base_size)) {}
 
-  void operator()(const Tensor *rpn_cls_prob,
+  MaceStatus operator()(const Tensor *rpn_cls_prob,
                   const Tensor *rpn_bbox_pred,
                   const Tensor *img_info_tensor,
                   Tensor *output,
@@ -180,7 +180,7 @@ struct ProposalFunctor {
     for (int h_idx = 0; h_idx < feat_height; ++h_idx) {
       for (int w_idx = 0; w_idx < feat_width; ++w_idx) {
         for (int a_idx = 0; a_idx < anchors_size; ++a_idx) {
-          const int sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
+          const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
               + a_idx;
           const float width = proposals[sanc_idx][2] -
               proposals[sanc_idx][0] + 1;
@@ -216,7 +216,7 @@ struct ProposalFunctor {
     for (int h_idx = 0; h_idx < feat_height; ++h_idx) {
       for (int w_idx = 0; w_idx < feat_width; ++w_idx) {
         for (int a_idx = 0; a_idx < anchors_size; ++a_idx) {
-          const int sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
+          const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size
               + a_idx;
           const float width = proposals[sanc_idx][2]
               - proposals[sanc_idx][0] + 1;
@@ -267,7 +267,7 @@ struct ProposalFunctor {
     // Our RPN implementation only supports a single input image, so all
     // batch inds are 0
     size = static_cast<int>(nms_result.size());
-    output->Resize({size, 1, 1, 5});
+    MACE_FAILURE_RETURN(output->Resize({size, 1, 1, 5}));
     auto output_ptr = output->mutable_data<float>();
 #pragma omp parallel for
     for (int i = 0; i < size; ++i) {
@@ -279,6 +279,8 @@ struct ProposalFunctor {
       output_ptr[out_idx + 3] = nms_proposals[nms_idx + 2];
       output_ptr[out_idx + 4] = nms_proposals[nms_idx + 3];
     }
+
+    return MACE_SUCCESS;
   }
 
   const int min_size_;
diff --git a/mace/kernels/psroi_align.h b/mace/kernels/psroi_align.h
index 4417fb1a..1830ff5d 100644
--- a/mace/kernels/psroi_align.h
+++ b/mace/kernels/psroi_align.h
@@ -34,7 +34,7 @@ struct PSROIAlignFunctor {
       output_dim_(output_dim),
       group_size_(group_size) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *rois,
                   Tensor *output,
                   StatsFuture *future) {
@@ -47,10 +47,11 @@ struct PSROIAlignFunctor {
     const T *input_ptr = input->data<T>();
     const T *rois_ptr = rois->data<T>();
     // Number of ROIs
-    const int num_rois = rois->dim(0);
-    const int batch_size = input->dim(0);
+    const index_t num_rois = rois->dim(0);
+    const index_t batch_size = input->dim(0);
 
-    output->Resize({num_rois, pooled_height, pooled_width, output_dim_});
+    MACE_FAILURE_RETURN(output->Resize({num_rois, pooled_height, pooled_width,
+                                        output_dim_}));
     T *output_ptr = output->mutable_data<T>();
 
     for (int n = 0; n < num_rois; ++n) {
@@ -176,6 +177,8 @@ struct PSROIAlignFunctor {
       rois_ptr += 5;
       output_ptr += pooled_height * pooled_width * output_dim_;
     }
+
+    return MACE_SUCCESS;
   }
 
   const T spatial_scale_;
diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h
index 5483d067..1369f3cb 100644
--- a/mace/kernels/quantize.h
+++ b/mace/kernels/quantize.h
@@ -74,7 +74,7 @@ template<>
 struct QuantizeFunctor<CPU, uint8_t> {
   QuantizeFunctor() {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *in_min,
                   const Tensor *in_max,
                   Tensor *output,
@@ -95,6 +95,8 @@ struct QuantizeFunctor<CPU, uint8_t> {
       output_data[i] = Saturate<uint8_t>(roundf(
         (input_data[i] - in_min_data) * recip_stepsize));
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -105,7 +107,7 @@ template<>
 struct DequantizeFunctor<CPU, uint8_t> {
   DequantizeFunctor() {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *in_min,
                   const Tensor *in_max,
                   Tensor *output,
@@ -120,6 +122,8 @@ struct DequantizeFunctor<CPU, uint8_t> {
     for (int i = 0; i < input->size(); ++i) {
       output_data[i] = in_min_data + stepsize * input_data[i];
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -130,7 +134,7 @@ template<>
 struct RequantizeFunctor<CPU, uint8_t> {
   RequantizeFunctor() {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const Tensor *in_min,
                   const Tensor *in_max,
                   const Tensor *rerange_min,
@@ -189,6 +193,8 @@ struct RequantizeFunctor<CPU, uint8_t> {
         Saturate<uint8_t>(roundf(
           quantized_out_zero + input_data[i] * step_ratio));
     }
+
+    return MACE_SUCCESS;
   }
 };
 
diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h
index 221064cc..87519bc9 100644
--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -31,12 +31,14 @@ template <DeviceType D, typename T>
 struct ReshapeFunctor {
   ReshapeFunctor() {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const std::vector<index_t> &out_shape,
                   Tensor *output,
                   StatsFuture *future) {
     MACE_UNUSED(future);
     output->ResizeWithBuffer(out_shape, input->UnderlyingBuffer());
+
+    return MACE_SUCCESS;
   }
 };
 
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index c312fbd2..2c7ff3ef 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -46,10 +46,10 @@ inline float CalculateResizeScale(index_t in_size,
 }
 
 inline void ComputeInterpolationWeights(
-  const index_t out_size,
-  const index_t in_size,
-  const float scale,
-  CachedInterpolation *interpolation) {
+    const index_t out_size,
+    const index_t in_size,
+    const float scale,
+    CachedInterpolation *interpolation) {
   interpolation[out_size].lower = 0;
   interpolation[out_size].upper = 0;
   for (index_t i = out_size - 1; i >= 0; --i) {
@@ -72,29 +72,30 @@ inline float ComputeLerp(const float top_left,
 }
 
 inline void ResizeImage(const float *images,
-                 const index_t batch_size,
-                 const index_t in_height,
-                 const index_t in_width,
-                 const index_t out_height,
-                 const index_t out_width,
-                 const index_t channels,
-                 const std::vector<CachedInterpolation> &xs_vec,
-                 const std::vector<CachedInterpolation> &ys,
-                 float *output) {
+                        const index_t batch_size,
+                        const index_t in_height,
+                        const index_t in_width,
+                        const index_t out_height,
+                        const index_t out_width,
+                        const index_t channels,
+                        const std::vector<CachedInterpolation> &xs_vec,
+                        const std::vector<CachedInterpolation> &ys,
+                        float *output) {
   const CachedInterpolation *xs = xs_vec.data();
 
 #pragma omp parallel for collapse(2)
   for (index_t b = 0; b < batch_size; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const float
-        *channel_input_ptr = images + (b * channels + c) * in_height * in_width;
+          *channel_input_ptr =
+          images + (b * channels + c) * in_height * in_width;
       float *channel_output_ptr =
-        output + (b * channels + c) * out_height * out_width;
+          output + (b * channels + c) * out_height * out_width;
       for (index_t y = 0; y < out_height; ++y) {
         const float *y_lower_input_ptr =
-          channel_input_ptr + ys[y].lower * in_width;
+            channel_input_ptr + ys[y].lower * in_width;
         const float *y_upper_input_ptr =
-          channel_input_ptr + ys[y].upper * in_width;
+            channel_input_ptr + ys[y].upper * in_width;
         const float ys_lerp = ys[y].lerp;
 
         for (index_t x = 0; x < out_width; ++x) {
@@ -104,8 +105,8 @@ inline void ResizeImage(const float *images,
           const float bottom_left = y_upper_input_ptr[xs[x].lower];
           const float bottom_right = y_upper_input_ptr[xs[x].upper];
           channel_output_ptr[y * out_width + x] =
-            ComputeLerp(top_left, top_right, bottom_left,
-                        bottom_right, xs_lerp, ys_lerp);
+              ComputeLerp(top_left, top_right, bottom_left,
+                          bottom_right, xs_lerp, ys_lerp);
         }
       }
     }
@@ -115,7 +116,7 @@ inline void ResizeImage(const float *images,
 struct ResizeBilinearFunctorBase {
   ResizeBilinearFunctorBase(const std::vector<index_t> &size,
                             bool align_corners)
-    : align_corners_(align_corners) {
+      : align_corners_(align_corners) {
     MACE_CHECK(size.size() == 2);
     out_height_ = size[0];
     out_width_ = size[1];
@@ -132,11 +133,13 @@ struct ResizeBilinearFunctor;
 
 template<>
 struct ResizeBilinearFunctor<DeviceType::CPU, float>
-  : ResizeBilinearFunctorBase {
+    : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-    : ResizeBilinearFunctorBase(size, align_corners) {}
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t channels = input->dim(1);
@@ -147,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
     index_t out_width = out_width_;
     MACE_CHECK(out_height > 0 && out_width > 0);
     std::vector<index_t> out_shape{batch, channels, out_height, out_width};
-    output->Resize(out_shape);
+    MACE_FAILURE_RETURN(output->Resize(out_shape));
 
     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard output_mapper(output);
@@ -158,13 +161,13 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
       std::copy(input_data,
                 input_data + batch * channels * in_height * in_width,
                 output_data);
-      return;
+      return MACE_SUCCESS;
     }
 
     float height_scale =
-      CalculateResizeScale(in_height, out_height, align_corners_);
+        CalculateResizeScale(in_height, out_height, align_corners_);
     float width_scale =
-      CalculateResizeScale(in_width, out_width, align_corners_);
+        CalculateResizeScale(in_width, out_width, align_corners_);
 
     std::vector<CachedInterpolation> ys(out_height + 1);
     std::vector<CachedInterpolation> xs(out_width + 1);
@@ -175,17 +178,21 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
 
     ResizeImage(input_data, batch, in_height, in_width, out_height, out_width,
                 channels, xs, ys, output_data);
+
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct ResizeBilinearFunctor<DeviceType::GPU, T>
-  : ResizeBilinearFunctorBase {
+    : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-    : ResizeBilinearFunctorBase(size, align_corners) {}
+      : ResizeBilinearFunctorBase(size, align_corners) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 16248fde..02396ce3 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -41,7 +41,7 @@ template<DeviceType D, typename T>
 struct SliceFunctor : SliceFunctorBase {
   explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
     MACE_UNUSED(future);
@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
                                                1,
                                                std::multiplies<index_t>());
     for (size_t i= 0; i < outputs_count; ++i) {
-      output_list[i]->Resize(output_shape);
+      MACE_FAILURE_RETURN(output_list[i]->Resize(output_shape));
       output_ptrs[i] = output_list[i]->mutable_data<T>();
     }
     const T *input_ptr = input->data<T>();
@@ -82,6 +82,8 @@ struct SliceFunctor : SliceFunctorBase {
         input_idx += output_channels * inner_size;
       }
     }
+
+    return MACE_SUCCESS;
   }
 };
 
@@ -90,7 +92,7 @@ template<typename T>
 struct SliceFunctor<DeviceType::GPU, T> : SliceFunctorBase {
   explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
 
-  void operator()(const Tensor *input,
+  MaceStatus operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future);
   cl::Kernel kernel_;
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index ac8c9913..ebcb7b40 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -38,7 +38,9 @@ struct SoftmaxFunctor;
 
 template<>
 struct SoftmaxFunctor<DeviceType::CPU, float> {
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t class_count = input->dim(1);
@@ -82,13 +84,17 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
         }
       }  // k
     }  // b
+
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct SoftmaxFunctor<DeviceType::GPU, T> {
-  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
+  MaceStatus operator()(const Tensor *logits,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 46a0e031..204fe44b 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -140,7 +140,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
                       bool b2s)
     : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
 
-  void operator()(Tensor *space_tensor,
+  MaceStatus operator()(Tensor *space_tensor,
                   Tensor *batch_tensor,
                   StatsFuture *future) {
     MACE_UNUSED(future);
@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       CalculateBatchToSpaceOutputShape(batch_tensor,
                                        DataFormat::NCHW,
                                        output_shape.data());
-      space_tensor->Resize(output_shape);
+      MACE_FAILURE_RETURN(space_tensor->Resize(output_shape));
     } else {
       CalculateSpaceToBatchOutputShape(space_tensor,
                                        DataFormat::NCHW,
                                        output_shape.data());
-      batch_tensor->Resize(output_shape);
+      MACE_FAILURE_RETURN(batch_tensor->Resize(output_shape));
     }
 
     Tensor::MappingGuard input_guard(space_tensor);
@@ -312,6 +312,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
         }  // block_h
       }  // c
     }
+    return MACE_SUCCESS;
   }
 };
 
@@ -323,7 +324,7 @@ struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
                       bool b2s)
       : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
 
-  void operator()(Tensor *space_tensor,
+  MaceStatus operator()(Tensor *space_tensor,
                   Tensor *batch_tensor,
                   StatsFuture *future);
 
diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h
index d57e0228..5faa67c1 100644
--- a/mace/kernels/transpose.h
+++ b/mace/kernels/transpose.h
@@ -107,7 +107,9 @@ template<DeviceType D, typename T>
 struct TransposeFunctor {
   explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     Tensor::MappingGuard input_guard(input);
     Tensor::MappingGuard output_guard(output);
@@ -137,7 +139,7 @@ struct TransposeFunctor {
                                 input->dim(2));
         }
       } else if (dims_ == transpose_order_from_NCHW_to_NHWC
-        && input->dim(1) == 2) {
+          && input->dim(1) == 2) {
         for (index_t b = 0; b < input->dim(0); ++b) {
           TransposeNCHWToNHWCC2(input_data + b * batch_size,
                                 output_data + b * batch_size,
@@ -146,11 +148,11 @@ struct TransposeFunctor {
         }
       } else {
         std::vector<index_t>
-          in_stride{input_shape[1] * input_shape[2] * input_shape[3],
-                    input_shape[2] * input_shape[3], input_shape[3], 1};
+            in_stride{input_shape[1] * input_shape[2] * input_shape[3],
+                      input_shape[2] * input_shape[3], input_shape[3], 1};
         std::vector<index_t>
-          out_stride{output_shape[1] * output_shape[2] * output_shape[3],
-                     output_shape[2] * output_shape[3], output_shape[3], 1};
+            out_stride{output_shape[1] * output_shape[2] * output_shape[3],
+                       output_shape[2] * output_shape[3], output_shape[3], 1};
 
         std::vector<index_t> idim(4, 0);
         std::vector<index_t> odim(4, 0);
@@ -164,9 +166,9 @@ struct TransposeFunctor {
                 idim[dims_[3]] = odim[3];
 
                 output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1]
-                  + odim[2] * out_stride[2] + odim[3]] =
-                  input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1]
-                    + idim[2] * in_stride[2] + idim[3]];
+                    + odim[2] * out_stride[2] + odim[3]] =
+                    input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+                        + idim[2] * in_stride[2] + idim[3]];
               }
             }
           }
@@ -175,6 +177,8 @@ struct TransposeFunctor {
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+
+    return MACE_SUCCESS;
   }
 
   std::vector<int> dims_;
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index 4e53ee7a..0cdde365 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -44,29 +44,34 @@ struct WinogradTransformFunctorBase {
   std::vector<int> paddings_;
 };
 
-template <DeviceType D, typename T>
+template<DeviceType D, typename T>
 struct WinogradTransformFunctor : WinogradTransformFunctorBase {
   WinogradTransformFunctor(const Padding &padding_type,
                            const std::vector<int> &paddings)
       : WinogradTransformFunctorBase(padding_type, paddings) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(input);
     MACE_UNUSED(output);
     MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
+    return MACE_SUCCESS;
   }
 };
 
 #ifdef MACE_ENABLE_OPENCL
-template <typename T>
+template<typename T>
 struct WinogradTransformFunctor<DeviceType::GPU, T>
     : WinogradTransformFunctorBase {
   WinogradTransformFunctor(const Padding &padding_type,
                            const std::vector<int> &paddings)
       : WinogradTransformFunctorBase(padding_type, paddings) {}
 
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+  MaceStatus operator()(const Tensor *input,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
@@ -94,7 +99,7 @@ struct WinogradInverseTransformFunctorBase {
   const float relux_max_limit_;
 };
 
-template <DeviceType D, typename T>
+template<DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctor(const int batch,
                                   const int height,
@@ -102,17 +107,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
                                   const ActivationType activation,
                                   const float relux_max_limit)
       : WinogradInverseTransformFunctorBase(
-            batch, height, width, activation, relux_max_limit) {}
+      batch, height, width, activation, relux_max_limit) {}
 
-  void operator()(const Tensor *input,
-                  const Tensor *bias,
-                  Tensor *output,
-                  StatsFuture *future) {
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(input);
     MACE_UNUSED(bias);
     MACE_UNUSED(output);
     MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
+    return MACE_SUCCESS;
   }
 };
 
@@ -128,7 +134,7 @@ struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
       : WinogradInverseTransformFunctorBase(
             batch, height, width, activation, relux_max_limit) {}
 
-  void operator()(const Tensor *input,
-                  const Tensor *bias,
-                  Tensor *output,
-                  StatsFuture *future);
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future);
diff --git a/mace/ops/activation.h b/mace/ops/activation.h
index 7c6d3b56..ce148054 100644
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -34,15 +34,14 @@ class ActivationOp : public Operator<D, T> {
                  static_cast<T>(OperatorBase::GetSingleArgument<float>(
                      "max_limit", 0.0f))) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(0);
     const Tensor *alpha_tensor =
         this->InputSize() >= 2 ? this->Input(1) : nullptr;
     Tensor *output_tensor = this->Output(0);
-    output_tensor->ResizeLike(input_tensor);
+    MACE_FAILURE_RETURN(output_tensor->ResizeLike(input_tensor));
 
-    functor_(input_tensor, alpha_tensor, output_tensor, future);
-    return true;
+    return functor_(input_tensor, alpha_tensor, output_tensor, future);
   }
 
  private:
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 9c95b9ec..e4ff005a 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -24,8 +24,7 @@ namespace test {
 
 namespace {
 template <DeviceType D, typename T>
-void ReluBenchmark(
-    int iters, int batch, int channels, int height, int width) {
+void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -41,10 +40,10 @@ void ReluBenchmark(
 
   if (D == DeviceType::CPU) {
     OpDefBuilder("Activation", "ReluBM")
-      .Input("Input")
-      .Output("Output")
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
+        .Input("Input")
+        .Output("Output")
+        .AddStringArg("activation", "RELU")
+        .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
                             kernels::BufferType::IN_OUT_CHANNEL);
@@ -81,8 +80,8 @@ void ReluBenchmark(
   }                                                                          \
   BENCHMARK(BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELU(N, C, H, W)                 \
-  BM_RELU_MACRO(N, C, H, W, float, CPU);    \
+#define BM_RELU(N, C, H, W)              \
+  BM_RELU_MACRO(N, C, H, W, float, CPU); \
   BM_RELU_MACRO(N, C, H, W, float, GPU); \
   BM_RELU_MACRO(N, C, H, W, half, GPU);
 
@@ -94,8 +93,7 @@ BM_RELU(1, 64, 256, 256);
 
 namespace {
 template <DeviceType D, typename T>
-void ReluxBenchmark(
-    int iters, int batch, int channels, int height, int width) {
+void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -149,8 +147,8 @@ void ReluxBenchmark(
   }                                                                           \
   BENCHMARK(BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_RELUX(N, C, H, W)                 \
-  BM_RELUX_MACRO(N, C, H, W, float, CPU);    \
+#define BM_RELUX(N, C, H, W)              \
+  BM_RELUX_MACRO(N, C, H, W, float, CPU); \
   BM_RELUX_MACRO(N, C, H, W, float, GPU); \
   BM_RELUX_MACRO(N, C, H, W, half, GPU);
 
@@ -162,8 +160,7 @@ BM_RELUX(1, 64, 256, 256);
 
 namespace {
 template <DeviceType D, typename T>
-void PreluBenchmark(
-    int iters, int batch, int channels, int height, int width) {
+void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -180,11 +177,11 @@ void PreluBenchmark(
 
   if (D == DeviceType::CPU) {
     OpDefBuilder("Activation", "PreluBM")
-      .Input("Input")
-      .Input("Alpha")
-      .Output("Output")
-      .AddStringArg("activation", "PRELU")
-      .Finalize(net.NewOperatorDef());
+        .Input("Input")
+        .Input("Alpha")
+        .Output("Output")
+        .AddStringArg("activation", "PRELU")
+        .Finalize(net.NewOperatorDef());
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
                             kernels::BufferType::IN_OUT_CHANNEL);
@@ -224,8 +221,8 @@ void PreluBenchmark(
   }                                                                           \
   BENCHMARK(BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_PRELU(N, C, H, W)                 \
-  BM_PRELU_MACRO(N, C, H, W, float, CPU);    \
+#define BM_PRELU(N, C, H, W)              \
+  BM_PRELU_MACRO(N, C, H, W, float, CPU); \
   BM_PRELU_MACRO(N, C, H, W, float, GPU); \
   BM_PRELU_MACRO(N, C, H, W, half, GPU);
 
@@ -237,8 +234,7 @@ BM_PRELU(1, 64, 256, 256);
 
 namespace {
 template <DeviceType D, typename T>
-void TanhBenchmark(
-    int iters, int batch, int channels, int height, int width) {
+void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
   OpsTestNet net;
@@ -290,8 +286,8 @@ void TanhBenchmark(
   }                                                                          \
   BENCHMARK(BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_TANH(N, C, H, W)                 \
-  BM_TANH_MACRO(N, C, H, W, float, CPU);    \
+#define BM_TANH(N, C, H, W)              \
+  BM_TANH_MACRO(N, C, H, W, float, CPU); \
   BM_TANH_MACRO(N, C, H, W, float, GPU); \
   BM_TANH_MACRO(N, C, H, W, half, GPU);
 
@@ -357,8 +353,8 @@ void SigmoidBenchmark(
   }                                                                  \
   BENCHMARK(BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
 
-#define BM_SIGMOID(N, C, H, W)                 \
-  BM_SIGMOID_MACRO(N, C, H, W, float, CPU);    \
+#define BM_SIGMOID(N, C, H, W)              \
+  BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
   BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
   BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
 
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index 55d1c0c7..64373343 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -29,7 +29,7 @@ class AddNOp : public Operator<D, T> {
   AddNOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     Tensor *output_tensor = this->Output(0);
     int n = this->inputs_.size();
     std::vector<const Tensor *> inputs(n, nullptr);
@@ -42,9 +42,7 @@ class AddNOp : public Operator<D, T> {
           << ", size: " << inputs[0]->size() << ". Input " << i << ": "
           << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
     }
-
-    functor_(inputs, output_tensor, future);
-    return true;
+    return functor_(inputs, output_tensor, future);
   }
 
  private:
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
index 11a89fb4..4712353e 100644
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -32,7 +32,7 @@ class BatchNormOp : public Operator<D, T> {
                                                       static_cast<float>(1e-4));
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *scale = this->Input(SCALE);
     const Tensor *offset = this->Input(OFFSET);
@@ -51,10 +51,8 @@ class BatchNormOp : public Operator<D, T> {
                var->dim_size());
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(input);
-
-    functor_(input, scale, offset, mean, var, epsilon_, output, future);
-    return true;
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
+    return functor_(input, scale, offset, mean, var, epsilon_, output, future);
   }
 
  private:
diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h
index eacce531..05fc676e 100644
--- a/mace/ops/batch_to_space.h
+++ b/mace/ops/batch_to_space.h
@@ -33,12 +33,11 @@ class BatchToSpaceNDOp : public Operator<D, T> {
                  OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
                  true) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *batch_tensor = this->Input(INPUT);
     Tensor *space_tensor = this->Output(OUTPUT);
-    functor_(space_tensor, const_cast<Tensor *>(batch_tensor),
-             future);
-    return true;
+    return functor_(space_tensor, const_cast<Tensor *>(batch_tensor),
+                    future);
   }
 
  private:
diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h
index 54dfee0d..f69f18e6 100644
--- a/mace/ops/bias_add.h
+++ b/mace/ops/bias_add.h
@@ -27,7 +27,7 @@ class BiasAddOp : public Operator<D, T> {
   BiasAddOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws), functor_() {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *bias = this->Input(BIAS);
 
@@ -37,10 +37,9 @@ class BiasAddOp : public Operator<D, T> {
                bias->dim_size());
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(input);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
 
-    functor_(input, bias, output, future);
-    return true;
+    return functor_(input, bias, output, future);
   }
 
  private:
diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h
index 84763b19..1c32fd14 100644
--- a/mace/ops/buffer_to_image.h
+++ b/mace/ops/buffer_to_image.h
@@ -27,7 +27,7 @@ class BufferToImageOp : public Operator<D, T> {
   BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
 
     kernels::BufferType type =
@@ -35,8 +35,7 @@ class BufferToImageOp : public Operator<D, T> {
             "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input_tensor, type, output, future);
-    return true;
+    return functor_(input_tensor, type, output, future);
   }
 
  private:
diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h
index 562d5ac2..22e1e211 100644
--- a/mace/ops/channel_shuffle.h
+++ b/mace/ops/channel_shuffle.h
@@ -31,7 +31,7 @@ class ChannelShuffleOp : public Operator<D, T> {
         group_(OperatorBase::GetSingleArgument<int>("group", 1)),
         functor_(this->group_) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     int channels;
@@ -45,9 +45,7 @@ class ChannelShuffleOp : public Operator<D, T> {
     MACE_CHECK(channels % group_ == 0,
                "input channels must be an integral multiple of group. ",
                input->dim(3));
-    functor_(input, output, future);
-
-    return true;
+    return functor_(input, output, future);
   }
 
  protected:
diff --git a/mace/ops/concat.h b/mace/ops/concat.h
index 8902548d..b2fcc37b 100644
--- a/mace/ops/concat.h
+++ b/mace/ops/concat.h
@@ -30,7 +30,7 @@ class ConcatOp : public Operator<D, T> {
       : Operator<D, T>(op_def, ws),
         functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     MACE_CHECK(this->InputSize() >= 2)
         << "There must be at least two inputs to concat";
     const std::vector<const Tensor *> input_list = this->Inputs();
@@ -44,8 +44,7 @@ class ConcatOp : public Operator<D, T> {
 
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input_list, output, future);
-    return true;
+    return functor_(input_list, output, future);
   }
 
  private:
diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h
index f7fc157d..9c353ca0 100644
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -42,14 +42,12 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
                      "is_filter_transformed", false)),
                  ws->GetScratchBuffer(D)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
     const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);
-    functor_(input, filter, bias, output, future);
-
-    return true;
+    return functor_(input, filter, bias, output, future);
   }
 
  private:
diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h
index 1796655b..33d934e3 100644
--- a/mace/ops/deconv_2d.h
+++ b/mace/ops/deconv_2d.h
@@ -36,15 +36,13 @@ class Deconv2dOp : public ConvPool2dOpBase<D, T> {
                  kernels::ActivationType::NOOP,
                  0.0f) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
     const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input, filter, bias, output, future);
-
-    return true;
+    return functor_(input, filter, bias, output, future);
   }
 
  private:
diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h
index 1e923edc..e96ac897 100644
--- a/mace/ops/depth_to_space.h
+++ b/mace/ops/depth_to_space.h
@@ -32,7 +32,7 @@ class DepthToSpaceOp : public Operator<D, T> {
         block_size_(OperatorBase::GetSingleArgument<int>("block_size", 1)),
         functor_(this->block_size_, true) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
@@ -50,8 +50,7 @@ class DepthToSpaceOp : public Operator<D, T> {
                input_depth);
     MACE_CHECK((input_depth % 4) == 0,
                "input channel should be dividable by 4");
-    functor_(input, output, future);
-    return true;
+    return functor_(input, output, future);
   }
 
  protected:
diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h
index 7e0deebd..37b82720 100644
--- a/mace/ops/depthwise_conv2d.h
+++ b/mace/ops/depthwise_conv2d.h
@@ -40,7 +40,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
                                                                   "NOOP")),
                  OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
     const Tensor *bias = nullptr;
@@ -48,8 +48,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
       bias = this->Input(BIAS);
     }
     Tensor *output = this->Output(OUTPUT);
-    functor_(input, filter, bias, output, future);
-    return true;
+    return functor_(input, filter, bias, output, future);
   }
 
  private:
diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h
index 57f73e26..3c63c080 100644
--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -32,12 +32,11 @@ class EltwiseOp : public Operator<D, T> {
                  OperatorBase::GetRepeatedArgument<float>("coeff"),
                  OperatorBase::GetSingleArgument<float>("x", 1.0)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor* input0 = this->Input(0);
     const Tensor* input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
     Tensor *output = this->Output(OUTPUT);
-    functor_(input0, input1, output, future);
-    return true;
+    return functor_(input0, input1, output, future);
   }
 
  private:
diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h
index 75f61c3a..40e3e113 100644
--- a/mace/ops/folded_batch_norm.h
+++ b/mace/ops/folded_batch_norm.h
@@ -34,7 +34,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
                                                                   "NOOP")),
                  OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *scale = this->Input(SCALE);
     const Tensor *offset = this->Input(OFFSET);
@@ -47,10 +47,9 @@ class FoldedBatchNormOp : public Operator<D, T> {
                offset->dim_size());
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(input);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
 
-    functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
-    return true;
+    return functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
   }
 
  private:
diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h
index 2d54a70e..0c21efc3 100644
--- a/mace/ops/fully_connected.h
+++ b/mace/ops/fully_connected.h
@@ -33,7 +33,7 @@ class FullyConnectedOp : public Operator<D, T> {
                                                               "NOOP")),
                OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *weight = this->Input(WEIGHT);  // OIHW
     const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
@@ -65,8 +65,7 @@ class FullyConnectedOp : public Operator<D, T> {
                  " don't match.");
     }
 
-    functor_(input, weight, bias, output, future);
-    return true;
+    return functor_(input, weight, bias, output, future);
   }
 
  private:
diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h
index 1af0b15f..88265948 100644
--- a/mace/ops/image_to_buffer.h
+++ b/mace/ops/image_to_buffer.h
@@ -27,15 +27,14 @@ class ImageToBufferOp : public Operator<D, T> {
   ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
 
     kernels::BufferType type =
         static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
             "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
-    functor_(input, type, output, future);
-    return true;
+    return functor_(input, type, output, future);
   }
 
  private:
diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h
index 980a59e6..6938de65 100644
--- a/mace/ops/local_response_norm.h
+++ b/mace/ops/local_response_norm.h
@@ -33,17 +33,16 @@ class LocalResponseNormOp : public Operator<D, T> {
     beta_ = OperatorBase::GetSingleArgument<float>("beta", 0.5f);
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
 
     MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
                input->dim_size());
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(input);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
 
-    functor_(input, depth_radius_, bias_, alpha_, beta_, output, future);
-    return true;
+    return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future);
   }
 
  private:
diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h
index 0a5cd27d..10a43578 100644
--- a/mace/ops/matmul.h
+++ b/mace/ops/matmul.h
@@ -27,7 +27,7 @@ class MatMulOp : public Operator<D, T> {
   MatMulOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *A = this->Input(0);
     const Tensor *B = this->Input(1);
     Tensor *C = this->Output(0);
@@ -38,8 +38,7 @@ class MatMulOp : public Operator<D, T> {
         << "the number of A's column " << A->dim(2)
         << " must be equal to B's row " << B->dim(1);
 
-    functor_(A, B, C, future);
-    return true;
+    return functor_(A, B, C, future);
   }
 
  private:
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 8900b3c7..ad7d2ee6 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -354,7 +354,7 @@ class OpsTestNet {
     return net_ != nullptr;
   }
 
-  bool Run() {
+  MaceStatus Run() {
     MACE_CHECK_NOTNULL(net_);
     return net_->Run();
   }
@@ -362,7 +362,7 @@ class OpsTestNet {
   // DEPRECATED(liyin):
   // Test and benchmark should setup model once and run multiple times.
   // Setup time should not be counted during benchmark.
-  bool RunOp(DeviceType device) {
+  MaceStatus RunOp(DeviceType device) {
     Setup(device);
     return Run();
   }
@@ -370,16 +370,14 @@ class OpsTestNet {
   // DEPRECATED(liyin):
   // Test and benchmark should setup model once and run multiple times.
   // Setup time should not be counted during benchmark.
-  bool RunOp() {
+  MaceStatus RunOp() {
     return RunOp(DeviceType::CPU);
   }
 
-  bool RunNet(const NetDef &net_def, const DeviceType device) {
+  MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
     device_ = device;
     net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT);
-    if (!net_->Run()) {
-      return false;
-    }
+    MACE_FAILURE_RETURN(net_->Run());
     net_ = CreateNet(op_registry_, net_def, &ws_, device);
     return net_->Run();
   }
diff --git a/mace/ops/pad.h b/mace/ops/pad.h
index 35e68482..843cf6fe 100644
--- a/mace/ops/pad.h
+++ b/mace/ops/pad.h
@@ -32,11 +32,10 @@ class PadOp : public Operator<D, T> {
                  OperatorBase::GetSingleArgument<float>("constant_value", 0.0))
   {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(0);
     Tensor *output_tensor = this->Output(0);
-    functor_(input_tensor, output_tensor, future);
-    return true;
+    return functor_(input_tensor, output_tensor, future);
   }
 
  private:
diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h
index 6e2c5056..a0f95d08 100644
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -40,12 +40,11 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
                  this->paddings_,
                  this->dilations_.data()) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input, output, future);
-    return true;
+    return functor_(input, output, future);
   };
 
  protected:
diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h
index 07b60162..1c1b280f 100644
--- a/mace/ops/proposal.h
+++ b/mace/ops/proposal.h
@@ -35,15 +35,14 @@ class ProposalOp : public Operator<D, T> {
                  OperatorBase::GetRepeatedArgument<int>("scales"),
                  OperatorBase::GetRepeatedArgument<float>("ratios")) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *rpn_cls_prob = this->Input(RPN_CLS_PROB);
     const Tensor *rpn_bbox_pred = this->Input(RPN_BBOX_PRED);
     const Tensor *img_info = this->Input(IMG_INFO);
 
     Tensor *output = this->Output(ROIS);
 
-    functor_(rpn_cls_prob, rpn_bbox_pred, img_info, output, future);
-    return true;
+    return functor_(rpn_cls_prob, rpn_bbox_pred, img_info, output, future);
   }
 
  private:
diff --git a/mace/ops/psroi_align.h b/mace/ops/psroi_align.h
index a8fa024b..1f60bc30 100644
--- a/mace/ops/psroi_align.h
+++ b/mace/ops/psroi_align.h
@@ -30,14 +30,13 @@ class PSROIAlignOp : public Operator<D, T> {
                  OperatorBase::GetSingleArgument<int>("output_dim", 0),
                  OperatorBase::GetSingleArgument<int>("group_size", 0)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *rois = this->Input(ROIS);
 
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input, rois, output, future);
-    return true;
+    return functor_(input, rois, output, future);
   }
 
  private:
diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h
index cee215f1..212d3593 100644
--- a/mace/ops/quantize.h
+++ b/mace/ops/quantize.h
@@ -28,7 +28,7 @@ class QuantizeOp : public Operator<D, T> {
     : Operator<D, T>(operator_def, ws) {
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *in_min = this->Input(IN_MIN);
     const Tensor *in_max = this->Input(IN_MAX);
@@ -39,12 +39,11 @@ class QuantizeOp : public Operator<D, T> {
     Tensor *output = this->Output(OUTPUT);
     Tensor *out_min = this->Output(OUT_MIN);
     Tensor *out_max = this->Output(OUT_MAX);
-    output->ResizeLike(input);
-    out_min->ResizeLike(in_min);
-    out_max->ResizeLike(in_max);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
+    MACE_FAILURE_RETURN(out_min->ResizeLike(in_min));
+    MACE_FAILURE_RETURN(out_max->ResizeLike(in_max));
 
-    functor_(input, in_min, in_max, output, out_min, out_max, future);
-    return true;
+    return functor_(input, in_min, in_max, output, out_min, out_max, future);
   }
 
  private:
@@ -62,7 +61,7 @@ class DequantizeOp : public Operator<D, T> {
     : Operator<D, T>(operator_def, ws) {
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *in_min = this->Input(IN_MIN);
     const Tensor *in_max = this->Input(IN_MAX);
@@ -71,10 +70,9 @@ class DequantizeOp : public Operator<D, T> {
     MACE_CHECK(in_max->size() == 1, "max val tensor has more than 1 value");
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(input);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
 
-    functor_(input, in_min, in_max, output, future);
-    return true;
+    return functor_(input, in_min, in_max, output, future);
   }
 
  private:
@@ -92,7 +90,7 @@ class RequantizeOp : public Operator<D, T> {
     : Operator<D, T>(operator_def, ws) {
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *in_min = this->Input(IN_MIN);
     const Tensor *in_max = this->Input(IN_MAX);
@@ -114,11 +112,11 @@ class RequantizeOp : public Operator<D, T> {
     Tensor *output = this->Output(OUTPUT);
     Tensor *out_min = this->Output(OUT_MIN);
     Tensor *out_max = this->Output(OUT_MAX);
-    output->ResizeLike(input);
-    out_min->ResizeLike(in_min);
-    out_max->ResizeLike(out_max);
+    MACE_FAILURE_RETURN(output->ResizeLike(input));
+    MACE_FAILURE_RETURN(out_min->ResizeLike(in_min));
+    MACE_FAILURE_RETURN(out_max->ResizeLike(in_max));
 
-    functor_(input,
+    return functor_(input,
              in_min,
              in_max,
              rerange_min,
@@ -127,7 +125,6 @@ class RequantizeOp : public Operator<D, T> {
              out_min,
              out_max,
              future);
-    return true;
   }
 
  private:
diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h
index e8aded23..2d145d09 100644
--- a/mace/ops/reshape.h
+++ b/mace/ops/reshape.h
@@ -30,7 +30,7 @@ class ReshapeOp : public Operator<D, T> {
       : Operator<D, T>(op_def, ws),
         shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     const index_t num_dims = shape_.size();
     int unknown_idx = -1;
@@ -61,8 +61,7 @@ class ReshapeOp : public Operator<D, T> {
 
     Tensor *output = this->Output(OUTPUT);
 
-    functor_(input, out_shape, output, future);
-    return true;
+    return functor_(input, out_shape, output, future);
   }
 
  private:
diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h
index c2e1f0ae..6e186f7d 100644
--- a/mace/ops/resize_bilinear.h
+++ b/mace/ops/resize_bilinear.h
@@ -30,15 +30,14 @@ class ResizeBilinearOp : public Operator<D, T> {
             OperatorBase::GetRepeatedArgument<index_t>("size", {-1, -1}),
             OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
 
     MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.",
                input->dim_size());
 
-    functor_(input, output, future);
-    return true;
+    return functor_(input, output, future);
   }
 
  private:
diff --git a/mace/ops/slice.h b/mace/ops/slice.h
index a0640621..0dd36b1d 100644
--- a/mace/ops/slice.h
+++ b/mace/ops/slice.h
@@ -30,7 +30,7 @@ class SliceOp : public Operator<D, T> {
       : Operator<D, T>(op_def, ws),
         functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     MACE_CHECK(this->OutputSize() >= 2)
       << "There must be at least two outputs for slicing";
     const Tensor *input = this->Input(INPUT);
@@ -39,8 +39,7 @@ class SliceOp : public Operator<D, T> {
     MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
       << "Outputs do not split input equally.";
 
-    functor_(input, output_list, future);
-    return true;
+    return functor_(input, output_list, future);
   }
 
  private:
diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h
index 260e10e5..a4459e52 100644
--- a/mace/ops/softmax.h
+++ b/mace/ops/softmax.h
@@ -27,14 +27,14 @@ class SoftmaxOp : public Operator<D, T> {
   SoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *logits = this->Input(LOGITS);
 
     Tensor *output = this->Output(OUTPUT);
-    output->ResizeLike(logits);
+    // Stop here if the output cannot be resized to match the logits.
+    MACE_FAILURE_RETURN(output->ResizeLike(logits));
 
-    functor_(logits, output, future);
-    return true;
+    return functor_(logits, output, future);
   }
 
  private:
diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h
index fd7f2c05..a67e868c 100644
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
@@ -34,12 +34,11 @@ class SpaceToBatchNDOp : public Operator<D, T> {
             OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
             false) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *space_tensor = this->Input(INPUT);
     Tensor *batch_tensor = this->Output(OUTPUT);
-    functor_(const_cast<Tensor *>(space_tensor), batch_tensor,
+    return functor_(const_cast<Tensor *>(space_tensor), batch_tensor,
              future);
-    return true;
   }
 
  private:
diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h
index 1b593faf..bccf8b07 100644
--- a/mace/ops/space_to_depth.h
+++ b/mace/ops/space_to_depth.h
@@ -32,7 +32,7 @@ class SpaceToDepthOp : public Operator<D, T> {
       functor_(OperatorBase::GetSingleArgument<int>("block_size", 1), false) {
   }
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
@@ -58,8 +58,7 @@ class SpaceToDepthOp : public Operator<D, T> {
       (input_width % block_size == 0) && (input_height % block_size == 0),
       "input width and height should be dividable by block_size",
       input->dim(3));
-    functor_(input, output, future);
-    return true;
+    return functor_(input, output, future);
   }
 
  protected:
diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h
index ff7315bb..626adbe5 100644
--- a/mace/ops/transpose.h
+++ b/mace/ops/transpose.h
@@ -31,7 +31,7 @@ class TransposeOp : public Operator<D, T> {
       dims_(OperatorBase::GetRepeatedArgument<int>("dims")),
       functor_(dims_) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     const std::vector<index_t> &input_shape = input->shape();
@@ -42,9 +42,8 @@ class TransposeOp : public Operator<D, T> {
     for (size_t i = 0; i < dims_.size(); ++i) {
       output_shape.push_back(input_shape[dims_[i]]);
     }
-    output->Resize(output_shape);
-    functor_(input, output, future);
-    return true;
+    MACE_FAILURE_RETURN(output->Resize(output_shape));
+    return functor_(input, output, future);
   }
 
  protected:
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index cb9fcf59..dfcc0fd1 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -38,12 +38,11 @@ class WinogradInverseTransformOp : public Operator<D, T> {
                                                                   "NOOP")),
                  OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
     const Tensor *bias = this->InputSize() == 2 ? this->Input(BIAS) : nullptr;
     Tensor *output_tensor = this->Output(OUTPUT);
-    functor_(input_tensor, bias, output_tensor, future);
-    return true;
+    return functor_(input_tensor, bias, output_tensor, future);
   }
 
  private:
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index b32398ec..a5795b12 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -32,12 +32,11 @@ class WinogradTransformOp : public Operator<D, T> {
                      "padding", static_cast<int>(VALID))),
                  OperatorBase::GetRepeatedArgument<int>("padding_values")) {}
 
-  bool Run(StatsFuture *future) override {
+  MaceStatus Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
     Tensor *output_tensor = this->Output(OUTPUT);
 
-    functor_(input_tensor, output_tensor, future);
-    return true;
+    return functor_(input_tensor, output_tensor, future);
   }
 
  private:
diff --git a/mace/public/mace.h b/mace/public/mace.h
index 2ebf72a0..b4511481 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -65,6 +65,19 @@ enum MaceStatus {
   MACE_OUT_OF_RESOURCES = 2
 };
 
+// Evaluates `stmt` once; if the result is not MACE_SUCCESS, logs the call
+// site and returns that status from the enclosing function. Expands to a
+// single statement, so write a trailing semicolon; it is then safe inside
+// an unbraced if/else.
+#define MACE_FAILURE_RETURN(stmt)                                          \
+  do {                                                                     \
+    const MaceStatus mace_status_ = (stmt);                                \
+    if (mace_status_ != MACE_SUCCESS) {                                    \
+      VLOG(0) << "Mace runtime failure: " << __FILE__ << ":" << __LINE__;  \
+      return mace_status_;                                                 \
+    }                                                                      \
+  } while (0)
+
 // MACE input/output tensor
 class MaceTensor {
  public:
-- 
GitLab