From 1fcd812b1a0c86b96fa3b86538248c9a3a5dca0c Mon Sep 17 00:00:00 2001 From: liutuo Date: Wed, 16 May 2018 19:39:41 +0800 Subject: [PATCH] fix ndk-r15c openmp bugs --- mace/core/runtime/cpu/cpu_runtime.cc | 1 + mace/kernels/arm/conv_2d_neon.h | 54 ++---- mace/kernels/arm/conv_2d_neon_3x3.cc | 53 +++--- mace/kernels/arm/conv_2d_neon_5x5.cc | 26 +-- mace/kernels/arm/conv_2d_neon_7x7.cc | 78 ++++----- mace/kernels/arm/depthwise_conv2d_neon.h | 24 +-- mace/kernels/arm/depthwise_conv2d_neon_3x3.cc | 79 +++++---- mace/kernels/conv_2d.h | 158 +++++++----------- mace/kernels/deconv_2d.h | 109 ++++++------ mace/kernels/depth_to_space.h | 38 ++--- mace/kernels/depthwise_conv2d.h | 92 ++++------ mace/kernels/image_to_buffer.h | 4 + mace/kernels/opencl/bias_add.cc | 3 +- mace/kernels/pooling.h | 153 +++++++---------- mace/kernels/winograd_transform.h | 1 + mace/ops/activation.h | 2 +- mace/ops/ops_test_util.h | 2 + mace/utils/logging.cc | 1 + 18 files changed, 371 insertions(+), 507 deletions(-) diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 09891e8c..8fb3ee5b 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,6 +18,7 @@ #include #endif +#include #include #include #include diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h index 3d3c907e..5d2d5f9a 100644 --- a/mace/kernels/arm/conv_2d_neon.h +++ b/mace/kernels/arm/conv_2d_neon.h @@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input, extern void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); extern void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output); } // namespace kernels diff --git 
a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc index 0e4ac0eb..58b28ddc 100644 --- a/mace/kernels/arm/conv_2d_neon_3x3.cc +++ b/mace/kernels/arm/conv_2d_neon_3x3.cc @@ -24,22 +24,22 @@ namespace kernels { // Ho = 2, Wo = 4, Co = 2 void Conv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 2) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 2) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 1 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input, void Conv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t c = 0; c < in_channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { + for (index_t c = 0; c < in_shape[1]; ++c) { + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; const float *in_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr = filter + m * in_channels * 9 + c * 9; diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc index f4fe7ce7..3d77d8f6 100644 --- a/mace/kernels/arm/conv_2d_neon_5x5.cc +++ b/mace/kernels/arm/conv_2d_neon_5x5.cc @@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK5x5S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const 
index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc index 057b9313..4432f2a0 100644 --- a/mace/kernels/arm/conv_2d_neon_7x7.cc +++ b/mace/kernels/arm/conv_2d_neon_7x7.cc @@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - 
const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) @@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input, // Ho = 1, Wo = 4, Co = 4 void Conv2dNeonK7x7S3(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, + const index_t *in_shape, + const index_t *out_shape, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; m += 4) { + const index_t out_channels = out_shape[1]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_channels = in_shape[1]; + const index_t in_width = in_shape[3]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; #if defined(MACE_ENABLE_NEON) diff --git a/mace/kernels/arm/depthwise_conv2d_neon.h b/mace/kernels/arm/depthwise_conv2d_neon.h index 130cd360..119867bf 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon.h +++ b/mace/kernels/arm/depthwise_conv2d_neon.h @@ -22,15 +22,9 @@ namespace kernels { void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *pad_hw, const index_t valid_h_start, 
const index_t valid_h_stop, const index_t valid_w_start, diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc index fb36bdad..443e57b7 100644 --- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc @@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base, // Ho = 2, Wo = 4, Co = 1 void DepthwiseConv2dNeonK3x3S1(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, } // h #else for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { - for (index_t iw = 0; iw < out_width; ++iw) { + for (index_t iw = 0; iw < out_shape[3]; ++iw) { DepthwiseConv2dPixel(in_base, filter_ptr, ih, @@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, @@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, void DepthwiseConv2dNeonK3x3S2(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int pad_top, - const int pad_left, + const index_t* in_shape, + const index_t* out_shape, + const int* pad_hw, 
const index_t valid_h_start, const index_t valid_h_stop, const index_t valid_w_start, @@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, MACE_UNUSED(valid_w_start); MACE_UNUSED(valid_w_stop); #endif - const index_t multiplier = out_channels / in_channels; - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; + const index_t multiplier = out_shape[1] / in_shape[1]; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; index_t multi_index = m % multiplier; const float *in_base = input + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9; float *out_base = output + b * out_batch_size + m * out_image_size; index_t h, w; - + const index_t pad_top = pad_hw[0]; + const index_t pad_left = pad_hw[1]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; // top for (h = 0; h < valid_h_start; ++h) { for (w = 0; w < out_width; ++w) { @@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, #endif // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < out_shape[2]; ++h) { + for (w = 0; w < out_shape[3]; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index dfe5540d..7f825828 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -84,49 +84,46 @@ struct Conv2dFunctor : Conv2dFunctorBase { void Conv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t filter_size = filter_height * filter_width; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; m += 4) { + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 
4) { + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + + const int stride_h = stride_hw[0]; + const int stride_w = stride_hw[1]; + const int dilation_h = dilation_hw[0]; + const int dilation_w = dilation_hw[1]; if (m + 3 < out_channels) { float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = - output + b * out_batch_size + (m + 1) * out_image_size; - float *out_ptr2_base = - output + b * out_batch_size + (m + 2) * out_image_size; - float *out_ptr3_base = - output + b * out_batch_size + (m + 3) * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; for (index_t c = 0; c < in_channels; ++c) { const float *in_ptr_base = input + b * in_batch_size + c * in_image_size; const float *filter_ptr0 = filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = - filter + (m + 1) * in_channels * filter_size + c * filter_size; - const float *filter_ptr2 = - filter + (m + 2) * in_channels * filter_size + c * filter_size; - const float *filter_ptr3 = - filter + (m + 3) * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w + 3 < out_width; w += 4) { // input offset @@ -144,8 +141,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { vo3[ow] = out_ptr3_base[out_offset + ow]; } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -185,10 +182,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; - filter_ptr1 += filter_width; - filter_ptr2 += filter_width; - filter_ptr3 += filter_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -230,8 +227,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // calc by row - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { // outch 0 vo0[0] += in_ptr_base[in_offset + kw * dilation_w] * filter_ptr0[kw]; @@ -244,7 +241,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // kw in_offset += dilation_h * in_width; - filter_ptr0 += filter_width; + filter_ptr0 += filter_shape[3]; } // kh for (index_t ow = 0; ow < 4; ++ow) { @@ -325,6 +322,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { index_t dilation_h = dilations_[0]; index_t dilation_w = dilations_[1]; + const index_t filter_hw[2] = {filter_h, filter_w}; + MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); index_t padded_input_height = input_height + paddings[0]; @@ -478,6 +477,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { 
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT); Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT); Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT); + const index_t extra_input_shape[4] = + {batch, input_channels, extra_input_height, extra_input_width}; + const index_t extra_output_shape[4] = + {batch, channels, extra_output_height, extra_output_width}; // decide which convolution function to call if (use_winograd) { @@ -512,6 +515,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { float *transformed_input_data = transformed_input.mutable_data(); float *transformed_output_data = transformed_output.mutable_data(); + conv_func = [=](const float *pad_input, float *pad_output) { WinoGradConv3x3s1(pad_input, transformed_filter_ptr, @@ -529,26 +533,16 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_3x3_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_1x1_s1) { @@ -566,71 +560,43 @@ struct Conv2dFunctor : Conv2dFunctorBase { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK5x5S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S1(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S2(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else if (use_neon_7x7_s3) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK7x7S3(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, + extra_input_shape, + extra_output_shape, pad_output); }; } else { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dGeneral(pad_input, filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - extra_output_height, - extra_output_width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, + extra_input_shape, + extra_output_shape, + filter_shape.data(), + strides_, + dilations_, pad_output); }; } diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 14d78a45..7c20addd 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -41,48 +41,40 @@ template void Deconv2dNCHW(const T *input, const T *filter, const T *bias, - const index_t batch, - const index_t in_height, - const index_t 
in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const index_t filter_height, - const index_t filter_width, - const index_t stride_h, - const index_t stride_w, - const int padding_top, - const int padding_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *kernel_hw, + const int *strides, + const int *padding, float *output) { #pragma omp parallel for collapse(4) - for (index_t b = 0; b < batch; ++b) { - for (index_t oc = 0; oc < out_channels; ++oc) { - for (index_t oh = 0; oh < out_height; ++oh) { - for (index_t ow = 0; ow < out_width; ++ow) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t oc = 0; oc < out_shape[1]; ++oc) { + for (index_t oh = 0; oh < out_shape[2]; ++oh) { + for (index_t ow = 0; ow < out_shape[3]; ++ow) { index_t filter_start_y, filter_start_x; - index_t start_x = std::max(0, ow + stride_w -1 - padding_left); - index_t start_y = std::max(0, oh + stride_h -1 - padding_top); - start_x /= stride_w; - start_y /= stride_h; - filter_start_x = padding_left + stride_w * start_x - ow; - filter_start_y = padding_top + stride_h * start_y - oh; - filter_start_x = filter_width - 1 - filter_start_x; - filter_start_y = filter_height - 1 - filter_start_y; + index_t start_x = std::max(0, ow + strides[1] -1 - padding[1]); + index_t start_y = std::max(0, oh + strides[0] -1 - padding[0]); + start_x /= strides[1]; + start_y /= strides[0]; + filter_start_x = padding[1] + strides[1] * start_x - ow; + filter_start_y = padding[0] + strides[0] * start_y - oh; + filter_start_x = kernel_hw[1] - 1 - filter_start_x; + filter_start_y = kernel_hw[0] - 1 - filter_start_y; T out_value = 0; index_t out_pos = - ((b * out_channels + oc) * out_height + oh) * out_width + ow; - for (index_t ic = 0; ic < in_channels; ++ic) { + ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow; + for (index_t ic = 0; ic < in_shape[1]; ++ic) { for (index_t f_y = filter_start_y, ih = start_y; - f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) { + f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) { for (index_t f_x = filter_start_x, iw = start_x; - f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) { + f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) { index_t weight_pos = - ((oc * in_channels + ic) * filter_height + f_y) - * filter_width + f_x; + ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y) + * kernel_hw[1] + f_x; index_t in_pos = - ((b * in_channels + ic) * in_height + ih) - * in_width + iw; + ((b * in_shape[1] + ic) * in_shape[2] + ih) + * in_shape[3] + iw; out_value += input[in_pos] * filter[weight_pos]; } } @@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { paddings_.data(), true); output->Resize(output_shape_); } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); - - index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); - index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); - MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - index_t stride_h = strides_[0]; - index_t stride_w = strides_[1]; - - MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); + const index_t *in_shape = input->shape().data(); + const index_t *out_shape = 
output->shape().data(); + const index_t kernel_hw[2] = {kernel_h, kernel_w}; + + MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ", + output_shape[1]); + MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", + in_shape[1]); + MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch"); Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard bias_mapper(bias); @@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { auto filter_data = filter->data(); auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - int padding_top = (paddings_[0] + 1) >> 1; - int padding_left = (paddings_[1] + 1) >> 1; - - deconv::Deconv2dNCHW(input_data, filter_data, bias_data, - batch, input_height, input_width, input_channels, - height, width, channels, - kernel_h, kernel_w, - stride_h, stride_w, padding_top, padding_left, + int padding[2]; + padding[0] = (paddings_[0] + 1) >> 1; + padding[1] = (paddings_[1] + 1) >> 1; + deconv::Deconv2dNCHW(input_data, + filter_data, + bias_data, + in_shape, + out_shape, + kernel_hw, + strides_, + padding, output_data); - DoActivation(output_data, output_data, output->size(), activation_, + DoActivation(output_data, + output_data, + output->size(), + activation_, relux_max_limit_); } }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 733591a5..2afd905b 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor { : block_size_(block_size), d2s_(d2s) {} void operator()(const Tensor *input, Tensor *output, StatsFuture *future) { MACE_UNUSED(future); - const int batch_size = input->dim(0); - const int input_depth = input->dim(1); - const int input_height = input->dim(2); - const int input_width = input->dim(3); + const index_t batch_size = input->dim(0); + const index_t input_depth = input->dim(1); + const index_t input_height = input->dim(2); + const index_t input_width = input->dim(3); index_t output_depth, output_width, output_height; @@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor { if (d2s_) { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < output_depth; ++d) { - for (int h = 0; h < output_height; ++h) { - const int in_h = h / block_size_; - const int offset_h = (h % block_size_); + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < output_depth; ++d) { + for (index_t h = 0; h < output_height; ++h) { + const index_t in_h = h / block_size_; + const index_t offset_h = (h % block_size_); for (int w = 0; w < output_width; ++w) { const index_t in_w = w / block_size_; const index_t offset_w = w % block_size_; @@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor { } } else { #pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - for (int d = 0; d < input_depth; ++d) { - for (int h = 0; h < input_height; ++h) { - const int out_h = h / block_size_; - const int offset_h = (h % block_size_); - for (int w = 0; w < input_width; ++w) { - const int out_w = w / block_size_; - const int offset_w = (w % block_size_); - const int offset_d = + for (index_t b = 0; b < batch_size; ++b) { + for (index_t d = 0; d < input_depth; ++d) { + for (index_t h = 0; h < input_height; ++h) { + const index_t out_h = h / block_size_; + const index_t offset_h = (h % block_size_); + for (index_t w = 0; w < input_width; ++w) { + const index_t out_w = w / block_size_; + const index_t offset_w = (w % 
block_size_); + const index_t offset_d = (offset_h * block_size_ + offset_w) * input_depth; - const int out_d = d + offset_d; + const index_t out_d = d + offset_d; const index_t o_index = ((b * output_depth + out_d) * output_height + out_h) * output_width + out_w; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index ce3c1e48..a276b504 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor void DepthwiseConv2dGeneral(const float *input, const float *filter, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t in_channels, - const index_t out_height, - const index_t out_width, - const index_t out_channels, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t multiplier = out_channels / in_channels; + const index_t multiplier = filter_shape[0] / filter_shape[1]; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t m = 0; m < out_channels; ++m) { - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < filter_shape[0]; ++m) { + for (index_t h = 0; h < out_shape[2]; ++h) { + for (index_t w = 0; w < out_shape[3]; ++w) { + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; index_t out_offset = ((b * out_channels + m) * out_height + h) * out_width + w; index_t c = m / multiplier; @@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor float sum = 0; for (index_t kh = 0; kh < filter_height; ++kh) { for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_h + kh * dilation_h - pad_top; - index_t iw = w * stride_w + kw * dilation_w - pad_left; + index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; + index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { index_t in_offset = ((b * in_channels + c) * in_height + ih) * in_width + iw; @@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor auto bias_data = bias == nullptr ? 
nullptr : bias->data(); auto output_data = output->mutable_data(); + const int pad_hw[2] = {pad_top, pad_left}; + const index_t input_shape[4] = + {batch, input_channels, input_height, input_width}; + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S1(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S2(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - pad_top, - pad_left, + input_shape, + output_shape.data(), + pad_hw, valid_h_start, valid_h_stop, valid_w_start, @@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor conv_func = [=](const float *input, float *output) { DepthwiseConv2dGeneral(input, filter_data, - batch, - input_height, - input_width, - input_channels, - height, - width, - channels, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + filter_shape.data(), + strides_, + dilations_, + pad_hw, output); }; } diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index ce08e51f..b6a7370d 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { const BufferType type, Tensor *output, StatsFuture *future) { + MACE_UNUSED(input); + MACE_UNUSED(type); + MACE_UNUSED(output); + MACE_UNUSED(future); MACE_NOT_IMPLEMENTED; } }; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index b6d2b4b1..05b9d169 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -90,7 +90,8 @@ void BiasAddFunctor::operator()(const Tensor *input, } else { std::vector roundup_gws(lws.size()); for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); + if (lws[i] != 0) + roundup_gws[i] = RoundUp(gws[i], lws[i]); } error = runtime->command_queue().enqueueNDRangeKernel( diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 97a65f1e..9c510b34 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -75,39 +75,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void MaxPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t 
out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res = std::max(res, input[input_offset]); @@ -122,40 +121,38 @@ struct PoolingFunctor: PoolingFunctorBase { } void AvgPooling(const float *input, - const index_t batch, - const index_t in_height, - const index_t in_width, - const index_t channels, - const index_t out_height, - const index_t out_width, - const int filter_height, - const int filter_width, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - const int pad_top, - const int pad_left, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, float *output) { - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = channels * in_image_size; - const index_t out_batch_size = channels * out_image_size; + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; #pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { const index_t out_offset = out_base + h * out_width + w; float res = 0; int block_size = 0; - for (int fh = 0; fh < filter_height; ++fh) { - for (int fw = 0; fw < filter_width; ++fw) { - int inh = h * stride_h + dilation_h * fh - pad_top; - int inw = w * stride_w + dilation_w * fw - pad_left; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; if 
(inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { index_t input_offset = in_base + inh * in_width + inw; res += input[input_offset]; @@ -200,59 +197,25 @@ struct PoolingFunctor: PoolingFunctorBase { const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t height = output_shape[2]; - index_t width = output_shape[3]; - - index_t input_height = input_shape[2]; - index_t input_width = input_shape[3]; - - int filter_h = kernels_[0]; - int filter_w = kernels_[1]; - - int stride_h = strides_[0]; - int stride_w = strides_[1]; - - int dilation_h = dilations_[0]; - int dilation_w = dilations_[1]; - - int pad_top = paddings[0] / 2; - int pad_left = paddings[1] / 2; + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; if (pooling_type_ == PoolingType::MAX) { MaxPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else if (pooling_type_ == PoolingType::AVG) { AvgPooling(input, - batch, - input_height, - input_width, - channels, - height, - width, - filter_h, - filter_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - pad_top, - pad_left, + input_shape, + output_shape.data(), + kernels_, + strides_, + dilations_, + pad_hw, output); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 06b6182e..4e53ee7a 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { MACE_UNUSED(input); MACE_UNUSED(bias); MACE_UNUSED(output); + MACE_UNUSED(future); MACE_NOT_IMPLEMENTED; } }; diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 5f07425a..7c6d3b56 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -38,7 +38,7 @@ class ActivationOp : public Operator { const Tensor *input_tensor = this->Input(0); const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr; - Tensor *output_tensor = this->outputs_[0]; + Tensor *output_tensor = this->Output(0); output_tensor->ResizeLike(input_tensor); functor_(input_tensor, alpha_tensor, output_tensor, future); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index bf9006fb..0f46e7a0 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -618,6 +618,8 @@ struct Expector { static void Near(const Tensor &x, const Tensor &y, const double rel_err, const double abs_err) { + MACE_UNUSED(rel_err); + MACE_UNUSED(abs_err); Equal(x, y); } }; diff --git a/mace/utils/logging.cc b/mace/utils/logging.cc index 52ddca5f..a8b06e69 100644 --- a/mace/utils/logging.cc +++ b/mace/utils/logging.cc @@ -15,6 +15,7 @@ #include "mace/utils/logging.h" #include +#include #if defined(ANDROID) || defined(__ANDROID__) #include #include -- GitLab
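Note on the recurring change above: across the conv, depthwise-conv, deconv and pooling kernels, this patch works around an OpenMP problem observed with android-ndk-r15c by changing the kernel signatures. Functions that previously took many const index_t scalar arguments (batch, in_height, in_width, in_channels, out_height, out_width, out_channels, ...) and referenced them inside "#pragma omp parallel for collapse(2)" regions now take NCHW shape arrays (const index_t *in_shape / *out_shape, plus pad_hw, stride_hw, dilation_hw arrays) and re-read the individual dimensions inside the loop body. The sketch below is not part of the patch; it is a minimal illustration of the resulting calling convention on a trivial kernel, with made-up names (ScaleNCHW) and a locally defined index_t.

    // Minimal sketch, assuming an NCHW float tensor layout as in the kernels above.
    // ScaleNCHW and this standalone index_t typedef are illustrative only.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    typedef int64_t index_t;

    // Shapes are passed as 4-element NCHW arrays instead of seven scalar
    // parameters; per-dimension locals are re-read inside the parallel loop
    // body, mirroring the pattern the patch applies to the MACE kernels.
    void ScaleNCHW(const float *input,
                   const index_t *in_shape,   // {batch, channels, height, width}
                   const index_t *out_shape,  // same layout as in_shape
                   float scale,
                   float *output) {
      const index_t in_image_size = in_shape[2] * in_shape[3];
      const index_t out_image_size = out_shape[2] * out_shape[3];
      const index_t in_batch_size = in_shape[1] * in_image_size;
      const index_t out_batch_size = out_shape[1] * out_image_size;
    #pragma omp parallel for collapse(2)
      for (index_t b = 0; b < out_shape[0]; ++b) {
        for (index_t c = 0; c < out_shape[1]; ++c) {
          // Dimensions read inside the loop body rather than captured as
          // function-scope scalars, as in the patched kernels.
          const index_t height = out_shape[2];
          const index_t width = out_shape[3];
          const float *in_base = input + b * in_batch_size + c * in_image_size;
          float *out_base = output + b * out_batch_size + c * out_image_size;
          for (index_t i = 0; i < height * width; ++i) {
            out_base[i] = in_base[i] * scale;
          }
        }
      }
    }

    int main() {
      const index_t shape[4] = {1, 2, 3, 4};  // N, C, H, W
      std::vector<float> in(1 * 2 * 3 * 4, 1.0f), out(in.size(), 0.0f);
      ScaleNCHW(in.data(), shape, shape, 2.0f, out.data());
      std::printf("out[0] = %f\n", out[0]);  // expect 2.000000
      return 0;
    }

The same two-pointer convention is what the conv_func lambdas in conv_2d.h and depthwise_conv2d.h now forward, which also keeps their by-value captures to a couple of shape arrays instead of seven separate integers.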