fix ndk-r15c openmp bugs

1fcd812b · liutuo · 3b11a1b2 · 1fcd812b · 1fcd812b · 1fcd812b
18 changed file
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -18,6 +18,7 @@
 #include <omp.h>
 #endif

+#include <errno.h>
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>

--- a/mace/kernels/arm/conv_2d_neon.h
+++ b/mace/kernels/arm/conv_2d_neon.h
@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,

 extern void Conv2dNeonK3x3S1(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 extern void Conv2dNeonK3x3S2(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 extern void Conv2dNeonK5x5S1(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 extern void Conv2dNeonK7x7S1(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 extern void Conv2dNeonK7x7S2(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 extern void Conv2dNeonK7x7S3(const float *input,
                             const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                             float *output);

 }  // namespace kernels

--- a/mace/kernels/arm/conv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/conv_2d_neon_3x3.cc
@@ -24,22 +24,22 @@ namespace kernels {
 // Ho = 2, Wo = 4, Co = 2
 void Conv2dNeonK3x3S1(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 2) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 2) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
      if (m + 1 < out_channels) {
        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,

 void Conv2dNeonK3x3S2(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
-      for (index_t c = 0; c < in_channels; ++c) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
+      for (index_t c = 0; c < in_shape[1]; ++c) {
+        const index_t in_channels = in_shape[1];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
        const float *in_base = input + b * in_batch_size + c * in_image_size;
        const float
          *filter_ptr = filter + m * in_channels * 9 + c * 9;

--- a/mace/kernels/arm/conv_2d_neon_5x5.cc
+++ b/mace/kernels/arm/conv_2d_neon_5x5.cc
@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK5x5S1(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)

--- a/mace/kernels/arm/conv_2d_neon_7x7.cc
+++ b/mace/kernels/arm/conv_2d_neon_7x7.cc
@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S1(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S2(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S3(const float *input,
                      const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                      float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
      if (m + 3 < out_channels) {
        float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)

--- a/mace/kernels/arm/depthwise_conv2d_neon.h
+++ b/mace/kernels/arm/depthwise_conv2d_neon.h
@@ -22,15 +22,9 @@ namespace kernels {

 void DepthwiseConv2dNeonK3x3S1(const float *input,
                               const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                               const index_t valid_h_start,
                               const index_t valid_h_stop,
                               const index_t valid_w_start,
@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,

 void DepthwiseConv2dNeonK3x3S2(const float *input,
                          const float *filter,
-                          const index_t batch,
-                          const index_t in_height,
-                          const index_t in_width,
-                          const index_t in_channels,
-                          const index_t out_height,
-                          const index_t out_width,
-                          const index_t out_channels,
-                          const int pad_top,
-                          const int pad_left,
+                          const index_t *in_shape,
+                          const index_t *out_shape,
+                          const int *pad_hw,
                          const index_t valid_h_start,
                          const index_t valid_h_stop,
                          const index_t valid_w_start,

--- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
 // Ho = 2, Wo = 4, Co = 1
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                               const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                               const index_t valid_h_start,
                               const index_t valid_h_stop,
                               const index_t valid_w_start,
@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
  MACE_UNUSED(valid_w_start);
  MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
      index_t c = m / multiplier;
      index_t multi_index = m % multiplier;
      const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
      float *out_base = output + b * out_batch_size + m * out_image_size;
      index_t h, w;
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];

      // top
      for (h = 0; h < valid_h_start; ++h) {
-        for (w = 0; w < out_width; ++w) {
+        for (w = 0; w < out_shape[3]; ++w) {
          DepthwiseConv2dPixel(in_base,
                               filter_ptr,
                               h,
@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
      }  // h
 #else
      for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
-        for (index_t iw = 0; iw < out_width; ++iw) {
+        for (index_t iw = 0; iw < out_shape[3]; ++iw) {
          DepthwiseConv2dPixel(in_base,
                               filter_ptr,
                               ih,
@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 #endif

      // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
          DepthwiseConv2dPixel(in_base,
                               filter_ptr,
                               h,
@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,

 void DepthwiseConv2dNeonK3x3S2(const float *input,
                               const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                               const index_t valid_h_start,
                               const index_t valid_h_stop,
                               const index_t valid_w_start,
@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
  MACE_UNUSED(valid_w_start);
  MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
      index_t c = m / multiplier;
      index_t multi_index = m % multiplier;
      const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
      float *out_base = output + b * out_batch_size + m * out_image_size;
      index_t h, w;
-
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];
      // top
      for (h = 0; h < valid_h_start; ++h) {
        for (w = 0; w < out_width; ++w) {
@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
 #endif

      // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
          DepthwiseConv2dPixel(in_base,
                               filter_ptr,
                               h,

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -84,49 +84,46 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {

  void Conv2dGeneral(const float *input,
                     const float *filter,
-                     const index_t batch,
-                     const index_t in_height,
-                     const index_t in_width,
-                     const index_t in_channels,
-                     const index_t out_height,
-                     const index_t out_width,
-                     const index_t out_channels,
-                     const int filter_height,
-                     const int filter_width,
-                     const int stride_h,
-                     const int stride_w,
-                     const int dilation_h,
-                     const int dilation_w,
+                     const index_t *in_shape,
+                     const index_t *out_shape,
+                     const index_t *filter_shape,
+                     const int *stride_hw,
+                     const int *dilation_hw,
                     float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = in_channels * in_image_size;
-    const index_t out_batch_size = out_channels * out_image_size;
-    const index_t filter_size = filter_height * filter_width;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = filter_shape[1] * in_image_size;
+    const index_t out_batch_size = filter_shape[0] * out_image_size;
+    const index_t filter_size = filter_shape[2] * filter_shape[3];

 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; m += 4) {
+    for (index_t b = 0; b < in_shape[0]; b++) {
+      for (index_t m = 0; m < filter_shape[0]; m += 4) {
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t out_channels = filter_shape[0];
+        const index_t in_channels = filter_shape[1];
+
+        const int stride_h = stride_hw[0];
+        const int stride_w = stride_hw[1];
+        const int dilation_h = dilation_hw[0];
+        const int dilation_w = dilation_hw[1];
        if (m + 3 < out_channels) {
          float *out_ptr0_base =
              output + b * out_batch_size + m * out_image_size;
-          float *out_ptr1_base =
-              output + b * out_batch_size + (m + 1) * out_image_size;
-          float *out_ptr2_base =
-              output + b * out_batch_size + (m + 2) * out_image_size;
-          float *out_ptr3_base =
-              output + b * out_batch_size + (m + 3) * out_image_size;
+          float *out_ptr1_base = out_ptr0_base + out_image_size;
+          float *out_ptr2_base = out_ptr1_base + out_image_size;
+          float *out_ptr3_base = out_ptr2_base + out_image_size;
          for (index_t c = 0; c < in_channels; ++c) {
            const float *in_ptr_base =
                input + b * in_batch_size + c * in_image_size;
            const float *filter_ptr0 =
                filter + m * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr1 =
-                filter + (m + 1) * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr2 =
-                filter + (m + 2) * in_channels * filter_size + c * filter_size;
-            const float *filter_ptr3 =
-                filter + (m + 3) * in_channels * filter_size + c * filter_size;
+            const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
+            const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
+            const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
            for (index_t h = 0; h < out_height; ++h) {
              for (index_t w = 0; w + 3 < out_width; w += 4) {
                 // input offset
@@ -144,8 +141,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                   vo3[ow] = out_ptr3_base[out_offset + ow];
                }
                // calc by row
-                for (index_t kh = 0; kh < filter_height; ++kh) {
-                  for (index_t kw = 0; kw < filter_width; ++kw) {
+                for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                  for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                    // outch 0
                    vo0[0] += in_ptr_base[in_offset
                        + kw * dilation_w] * filter_ptr0[kw];
@@ -185,10 +182,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                  }  // kw

                  in_offset += dilation_h * in_width;
-                  filter_ptr0 += filter_width;
-                  filter_ptr1 += filter_width;
-                  filter_ptr2 += filter_width;
-                  filter_ptr3 += filter_width;
+                  filter_ptr0 += filter_shape[3];
+                  filter_ptr1 += filter_shape[3];
+                  filter_ptr2 += filter_shape[3];
+                  filter_ptr3 += filter_shape[3];
                }  // kh

                for (index_t ow = 0; ow < 4; ++ow) {
@@ -230,8 +227,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                  }

                  // calc by row
-                  for (index_t kh = 0; kh < filter_height; ++kh) {
-                    for (index_t kw = 0; kw < filter_width; ++kw) {
+                  for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                    for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                      // outch 0
                      vo0[0] += in_ptr_base[in_offset
                          + kw * dilation_w] * filter_ptr0[kw];
@@ -244,7 +241,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                    }  // kw

                    in_offset += dilation_h * in_width;
-                    filter_ptr0 += filter_width;
+                    filter_ptr0 += filter_shape[3];
                  }  // kh

                  for (index_t ow = 0; ow < 4; ++ow) {
@@ -325,6 +322,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
    index_t dilation_h = dilations_[0];
    index_t dilation_w = dilations_[1];

+    const index_t filter_hw[2] = {filter_h, filter_w};
+
    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");

    index_t padded_input_height = input_height + paddings[0];
@@ -478,6 +477,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
    Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
    Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+    const index_t extra_input_shape[4] =
+        {batch, input_channels, extra_input_height, extra_input_width};
+    const index_t extra_output_shape[4] =
+        {batch, channels, extra_output_height, extra_output_width};

    // decide which convolution function to call
    if (use_winograd) {
@@ -512,6 +515,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {

      float *transformed_input_data = transformed_input.mutable_data<float>();
      float *transformed_output_data = transformed_output.mutable_data<float>();
+
      conv_func = [=](const float *pad_input, float *pad_output) {
        WinoGradConv3x3s1(pad_input,
                          transformed_filter_ptr,
@@ -529,26 +533,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK3x3S1(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_3x3_s2) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK3x3S2(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_1x1_s1) {
@@ -566,71 +560,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK5x5S1(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S1(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s2) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S2(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s3) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S3(pad_input,
                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         extra_output_height,
-                         extra_output_width,
-                         channels,
+                         extra_input_shape,
+                         extra_output_shape,
                         pad_output);
      };
    } else {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dGeneral(pad_input,
                      filter_data,
-                      batch,
-                      extra_input_height,
-                      extra_input_width,
-                      input_channels,
-                      extra_output_height,
-                      extra_output_width,
-                      channels,
-                      filter_h,
-                      filter_w,
-                      stride_h,
-                      stride_w,
-                      dilation_h,
-                      dilation_w,
+                      extra_input_shape,
+                      extra_output_shape,
+                      filter_shape.data(),
+                      strides_,
+                      dilations_,
                      pad_output);
      };
    }

--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -41,48 +41,40 @@ template<typename T>
 void Deconv2dNCHW(const T *input,
                  const T *filter,
                  const T *bias,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t in_channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const index_t out_channels,
-                  const index_t filter_height,
-                  const index_t filter_width,
-                  const index_t stride_h,
-                  const index_t stride_w,
-                  const int padding_top,
-                  const int padding_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const index_t *kernel_hw,
+                  const int *strides,
+                  const int *padding,
                  float *output) {
 #pragma omp parallel for collapse(4)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t oc = 0; oc < out_channels; ++oc) {
-      for (index_t oh = 0; oh < out_height; ++oh) {
-        for (index_t ow = 0; ow < out_width; ++ow) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < out_shape[1]; ++oc) {
+      for (index_t oh = 0; oh < out_shape[2]; ++oh) {
+        for (index_t ow = 0; ow < out_shape[3]; ++ow) {
          index_t filter_start_y, filter_start_x;
-          index_t start_x = std::max<int>(0, ow + stride_w -1 - padding_left);
-          index_t start_y = std::max<int>(0, oh + stride_h -1 - padding_top);
-          start_x /= stride_w;
-          start_y /= stride_h;
-          filter_start_x = padding_left + stride_w * start_x - ow;
-          filter_start_y = padding_top + stride_h * start_y - oh;
-          filter_start_x = filter_width - 1 - filter_start_x;
-          filter_start_y = filter_height - 1 - filter_start_y;
+          index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
+          index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
+          start_x /= strides[1];
+          start_y /= strides[0];
+          filter_start_x = padding[1] + strides[1] * start_x - ow;
+          filter_start_y = padding[0] + strides[0] * start_y - oh;
+          filter_start_x = kernel_hw[1] - 1 - filter_start_x;
+          filter_start_y = kernel_hw[0] - 1 - filter_start_y;
          T out_value = 0;
          index_t out_pos =
-              ((b * out_channels + oc) * out_height + oh) * out_width + ow;
-          for (index_t ic = 0; ic < in_channels; ++ic) {
+              ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
+          for (index_t ic = 0; ic < in_shape[1]; ++ic) {
            for (index_t f_y = filter_start_y, ih = start_y;
-                 f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) {
+                 f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
              for (index_t f_x = filter_start_x, iw = start_x;
-                  f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) {
+                  f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
                  index_t weight_pos =
-                      ((oc * in_channels + ic) * filter_height + f_y)
-                          * filter_width + f_x;
+                      ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
+                          * kernel_hw[1] + f_x;
                  index_t in_pos =
-                      ((b * in_channels + ic) * in_height + ih)
-                          * in_width + iw;
+                      ((b * in_shape[1] + ic) * in_shape[2] + ih)
+                          * in_shape[3] + iw;
                  out_value += input[in_pos] * filter[weight_pos];
              }
            }
@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                           paddings_.data(), true);
      output->Resize(output_shape_);
    }
-    index_t batch = output->dim(0);
-    index_t channels = output->dim(1);
-    index_t height = output->dim(2);
-    index_t width = output->dim(3);
-
-    index_t input_batch = input->dim(0);
-    index_t input_channels = input->dim(1);
-    index_t input_height = input->dim(2);
-    index_t input_width = input->dim(3);
-
    index_t kernel_h = filter->dim(2);
    index_t kernel_w = filter->dim(3);
-    MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
-    MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
-               input_channels);
-
-    index_t stride_h = strides_[0];
-    index_t stride_w = strides_[1];
-
-    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
+    const index_t *in_shape = input->shape().data();
+    const index_t *out_shape = output->shape().data();
+    const index_t kernel_hw[2] = {kernel_h, kernel_w};
+
+    MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
+               output_shape[1]);
+    MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
+               in_shape[1]);
+    MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
    Tensor::MappingGuard input_mapper(input);
    Tensor::MappingGuard filter_mapper(filter);
    Tensor::MappingGuard bias_mapper(bias);
@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
    auto filter_data = filter->data<T>();
    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
    auto output_data = output->mutable_data<T>();
-    int padding_top = (paddings_[0] + 1) >> 1;
-    int padding_left = (paddings_[1] + 1) >> 1;
-
-    deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
-                         batch, input_height, input_width, input_channels,
-                         height, width, channels,
-                         kernel_h, kernel_w,
-                         stride_h, stride_w, padding_top, padding_left,
+    int padding[2];
+    padding[0] = (paddings_[0] + 1) >> 1;
+    padding[1] = (paddings_[1] + 1) >> 1;
+    deconv::Deconv2dNCHW(input_data,
+                         filter_data,
+                         bias_data,
+                         in_shape,
+                         out_shape,
+                         kernel_hw,
+                         strides_,
+                         padding,
                         output_data);

-    DoActivation(output_data, output_data, output->size(), activation_,
+    DoActivation(output_data,
+                 output_data,
+                 output->size(),
+                 activation_,
                 relux_max_limit_);
  }
 };

--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
    : block_size_(block_size), d2s_(d2s) {}
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
    MACE_UNUSED(future);
-    const int batch_size = input->dim(0);
-    const int input_depth = input->dim(1);
-    const int input_height = input->dim(2);
-    const int input_width = input->dim(3);
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(1);
+    const index_t input_height = input->dim(2);
+    const index_t input_width = input->dim(3);

    index_t output_depth, output_width, output_height;

@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {

    if (d2s_) {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < output_depth; ++d) {
-          for (int h = 0; h < output_height; ++h) {
-            const int in_h = h / block_size_;
-            const int offset_h = (h % block_size_);
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < output_depth; ++d) {
+          for (index_t h = 0; h < output_height; ++h) {
+            const index_t in_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
            for (int w = 0; w < output_width; ++w) {
              const index_t in_w = w / block_size_;
              const index_t offset_w = w % block_size_;
@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
      }
    } else {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < input_depth; ++d) {
-          for (int h = 0; h < input_height; ++h) {
-            const int out_h = h / block_size_;
-            const int offset_h = (h % block_size_);
-            for (int w = 0; w < input_width; ++w) {
-              const int out_w = w / block_size_;
-              const int offset_w = (w % block_size_);
-              const int offset_d =
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < input_depth; ++d) {
+          for (index_t h = 0; h < input_height; ++h) {
+            const index_t out_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
+            for (index_t w = 0; w < input_width; ++w) {
+              const index_t out_w = w / block_size_;
+              const index_t offset_w = (w % block_size_);
+              const index_t offset_d =
                (offset_h * block_size_ + offset_w) * input_depth;

-              const int out_d = d + offset_d;
+              const index_t out_d = d + offset_d;
              const index_t o_index =
                ((b * output_depth + out_d) * output_height + out_h)
                  * output_width + out_w;

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>

  void DepthwiseConv2dGeneral(const float *input,
                              const float *filter,
-                              const index_t batch,
-                              const index_t in_height,
-                              const index_t in_width,
-                              const index_t in_channels,
-                              const index_t out_height,
-                              const index_t out_width,
-                              const index_t out_channels,
-                              const int filter_height,
-                              const int filter_width,
-                              const int stride_h,
-                              const int stride_w,
-                              const int dilation_h,
-                              const int dilation_w,
-                              const int pad_top,
-                              const int pad_left,
+                              const index_t *in_shape,
+                              const index_t *out_shape,
+                              const index_t *filter_shape,
+                              const int *stride_hw,
+                              const int *dilation_hw,
+                              const int *pad_hw,
                              float *output) {
-    const index_t multiplier = out_channels / in_channels;
+    const index_t multiplier = filter_shape[0] / filter_shape[1];
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; ++m) {
-        for (index_t h = 0; h < out_height; ++h) {
-          for (index_t w = 0; w < out_width; ++w) {
+    for (index_t b = 0; b < in_shape[0]; ++b) {
+      for (index_t m = 0; m < filter_shape[0]; ++m) {
+        for (index_t h = 0; h < out_shape[2]; ++h) {
+          for (index_t w = 0; w < out_shape[3]; ++w) {
+            const index_t out_channels = filter_shape[0];
+            const index_t in_channels = filter_shape[1];
+            const index_t filter_height = filter_shape[2];
+            const index_t filter_width = filter_shape[3];
+            const index_t in_height = in_shape[2];
+            const index_t in_width = in_shape[3];
+            const index_t out_height = out_shape[2];
+            const index_t out_width = out_shape[3];
            index_t out_offset =
              ((b * out_channels + m) * out_height + h) * out_width + w;
            index_t c = m / multiplier;
@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
            float sum = 0;
            for (index_t kh = 0; kh < filter_height; ++kh) {
              for (index_t kw = 0; kw < filter_width; ++kw) {
-                index_t ih = h * stride_h + kh * dilation_h - pad_top;
-                index_t iw = w * stride_w + kw * dilation_w - pad_left;
+                index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0];
+                index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1];
                if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                  index_t in_offset =
                    ((b * in_channels + c) * in_height + ih) * in_width + iw;
@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
    auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
    auto output_data = output->mutable_data<float>();

+    const int pad_hw[2] = {pad_top, pad_left};
+    const index_t input_shape[4] =
+        {batch, input_channels, input_height, input_width};
+
    if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
      && dilation_h == 1 && dilation_w == 1) {
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dNeonK3x3S1(input,
                                  filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                  valid_h_start,
                                  valid_h_stop,
                                  valid_w_start,
@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dNeonK3x3S2(input,
                                  filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                  valid_h_start,
                                  valid_h_stop,
                                  valid_w_start,
@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dGeneral(input,
                               filter_data,
-                               batch,
-                               input_height,
-                               input_width,
-                               input_channels,
-                               height,
-                               width,
-                               channels,
-                               filter_h,
-                               filter_w,
-                               stride_h,
-                               stride_w,
-                               dilation_h,
-                               dilation_w,
-                               pad_top,
-                               pad_left,
+                               input_shape,
+                               output_shape.data(),
+                               filter_shape.data(),
+                               strides_,
+                               dilations_,
+                               pad_hw,
                               output);
      };
    }

--- a/mace/kernels/image_to_buffer.h
+++ b/mace/kernels/image_to_buffer.h
@@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
                  const BufferType type,
                  Tensor *output,
                  StatsFuture *future) {
+    MACE_UNUSED(input);
+    MACE_UNUSED(type);
+    MACE_UNUSED(output);
+    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
  }
 };

--- a/mace/kernels/opencl/bias_add.cc
+++ b/mace/kernels/opencl/bias_add.cc
@@ -90,7 +90,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
  } else {
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+      if (lws[i] != 0)
+        roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }

    error = runtime->command_queue().enqueueNDRangeKernel(

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
  }

  void MaxPooling(const float *input,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const int filter_height,
-                  const int filter_width,
-                  const int stride_h,
-                  const int stride_w,
-                  const int dilation_h,
-                  const int dilation_w,
-                  const int pad_top,
-                  const int pad_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const int *filter_hw,
+                  const int *stride_hw,
+                  const int *dilation_hw,
+                  const int *pad_hw,
                  float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
        const index_t out_base = b * out_batch_size + c * out_image_size;
        const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+
        for (index_t h = 0; h < out_height; ++h) {
          for (index_t w = 0; w < out_width; ++w) {
            const index_t out_offset = out_base + h * out_width + w;
            float res = std::numeric_limits<float>::lowest();
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                  index_t input_offset = in_base + inh * in_width + inw;
                  res = std::max(res, input[input_offset]);
@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
  }

  void AvgPooling(const float *input,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const int filter_height,
-                  const int filter_width,
-                  const int stride_h,
-                  const int stride_w,
-                  const int dilation_h,
-                  const int dilation_w,
-                  const int pad_top,
-                  const int pad_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const int *filter_hw,
+                  const int *stride_hw,
+                  const int *dilation_hw,
+                  const int *pad_hw,
                  float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;

 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
        const index_t out_base = b * out_batch_size + c * out_image_size;
        const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
        for (index_t h = 0; h < out_height; ++h) {
          for (index_t w = 0; w < out_width; ++w) {
            const index_t out_offset = out_base + h * out_width + w;
            float res = 0;
            int block_size = 0;
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                  index_t input_offset = in_base + inh * in_width + inw;
                  res += input[input_offset];
@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
    const float *input = input_tensor->data<float>();
    float *output = output_tensor->mutable_data<float>();
    const index_t *input_shape = input_tensor->shape().data();
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
-
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
-
-    int filter_h = kernels_[0];
-    int filter_w = kernels_[1];
-
-    int stride_h = strides_[0];
-    int stride_w = strides_[1];
-
-    int dilation_h = dilations_[0];
-    int dilation_w = dilations_[1];
-
-    int pad_top = paddings[0] / 2;
-    int pad_left = paddings[1] / 2;
+    int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};

    if (pooling_type_ == PoolingType::MAX) {
      MaxPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                 output);
    } else if (pooling_type_ == PoolingType::AVG) {
      AvgPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                 output);
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
    MACE_UNUSED(input);
    MACE_UNUSED(bias);
    MACE_UNUSED(output);
+    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
  }
 };

--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
    const Tensor *input_tensor = this->Input(0);
    const Tensor *alpha_tensor =
        this->InputSize() >= 2 ? this->Input(1) : nullptr;
-    Tensor *output_tensor = this->outputs_[0];
+    Tensor *output_tensor = this->Output(0);
    output_tensor->ResizeLike(input_tensor);

    functor_(input_tensor, alpha_tensor, output_tensor, future);

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -618,6 +618,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
  static void Near(const Tensor &x, const Tensor &y,
                   const double rel_err,
                   const double abs_err) {
+    MACE_UNUSED(rel_err);
+    MACE_UNUSED(abs_err);
    Equal(x, y);
  }
 };

--- a/mace/utils/logging.cc
+++ b/mace/utils/logging.cc
@@ -15,6 +15,7 @@
 #include "mace/utils/logging.h"

 #include <stdlib.h>
+#include <string.h>
 #if defined(ANDROID) || defined(__ANDROID__)
 #include <android/log.h>
 #include <iostream>