Finish max/avg pooling opencl kernel.

c899d438 · liuqi · e19514a9 · c899d438 · c899d438 · c899d438
10 changed file
--- a/mace/kernels/neon/pooling_neon.cc
+++ b/mace/kernels/neon/pooling_neon.cc
@@ -58,19 +58,27 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input_tensor,
    Tensor *output_tensor) {

+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  std::vector<index_t> filter_shape(4);
+  filter_shape[0] = input_tensor->shape()[1];
+  filter_shape[1] = input_tensor->shape()[1];
+  filter_shape[2] = kernels_[0];
+  filter_shape[3] = kernels_[1];
+
+  kernels::CalcPaddingAndOutputSize(
+      input_tensor->shape().data(), filter_shape.data(), this->dilations_,
+      strides_, this->padding_, output_shape.data(),
+      paddings.data());
+  output_tensor->Resize(output_shape);
+
  const float *input = input_tensor->data<float>();
  float *output = output_tensor->mutable_data<float>();
  const index_t *input_shape = input_tensor->shape().data();
-  const index_t *output_shape = output_tensor->shape().data();

-  int paddings[2];
-  std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                       kernels_[0], kernels_[1]};
-  kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                          strides_, this->padding_, paddings);
 #ifdef __COPY_MAKE_PADDING
  Tensor padded_input;
-  ConstructInputWithPadding(input_tensor, paddings, &padded_input);
+  ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input);
  input = padded_input.data<float>();
  input_shape = padded_input.shape().data();
 #endif
@@ -80,17 +88,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
    // kernel_size: 2x2, strides: 2x2
    if (pooling_type_ == MAX) {  // MAX_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
    } else {  // AVG_POOL_2x2s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
    }
  } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 &&
@@ -98,17 +106,17 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
    // kernel_size: 3x3, strides: 2x2
    if (pooling_type_ == MAX) {  // MAX_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
    } else {  // AVG_POOL_3x3s2x2
 #ifdef __COPY_MAKE_PADDING
-      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape);
+      PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data());
 #else
-      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape,
-                             paddings);
+      PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(),
+                             paddings.data());
 #endif
    }
  } else {  // not implement yet

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,4 +18,7 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)

+
+__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -57,15 +57,14 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  // Unrolling this loop hurt perfmance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
-
-        in_idx = in_ch_blk * in_width;

-        int in_hb_value = height_idx + hb_idx;
-        in_hb_value = select(in_hb_value + batch_idx,
-                             -1,
-                             (in_hb_value < 0 || in_hb_value >= in_height));
+      int in_hb_value = height_idx + hb_idx;
+      in_hb_value = select(in_hb_value + batch_idx,
+                           -1,
+                           (in_hb_value < 0 || in_hb_value >= in_height));

+      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
+        in_idx = in_ch_blk * in_width;
        int in_width_value;
 #define READ_INPUT(i)                                                                \
        in_width_value = in_width##i + width_idx;                                    \

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  // Unrolling this loop hurt perfmance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
+      int in_hb_value = height_idx + hb_idx;
+      in_hb_value = select(in_hb_value + batch_idx,
+                           -1,
+                           (in_hb_value < 0 || in_hb_value >= in_height));
      for (short width_idx = 0; width_idx < 3; ++width_idx) {

        in_idx = in_ch_blk * in_width;
-
-        int in_hb_value = height_idx + hb_idx;
-        in_hb_value = select(in_hb_value + batch_idx,
-                             -1,
-                             (in_hb_value < 0 || in_hb_value >= in_height));
-
        int in_width_value;
 #define READ_INPUT(i)                                                                \
        in_width_value = in_width##i + width_idx;                                    \

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
 #include <common.h>

-VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
-  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
-  VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
-  VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
-  VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
-  VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
-  VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
-
-  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
-  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
-  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
-  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
-  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
-  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
-
-  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
-  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
-
-  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
-                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
-  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
-
-  return res;
-}
-
-VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
-  VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
-  DATA_TYPE row01 = *(input_ptr + 8);
-  VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
-  DATA_TYPE row11 = *(input_ptr + in_width + 8);
-  VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
-  DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
-
-  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
-  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
-  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
-  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
-  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
-  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
-
-  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
-  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
-
-  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
-                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
-  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
-
-  return res;
-}
-
-DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
-  VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
-  VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
-  VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
-
-  VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
-
-  DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
-  return res;
-}
-
-// Supported data type: half/float
-__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
-                       __private const int in_height,
-                       __private const int in_width,
-                       __private const int out_chan_num,
-                       __private const int out_height,
-                       __private const int out_width,
-                       __private const int stride,
-                       __global DATA_TYPE *output) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
-
-  const int round_out_width = (out_width + 3) / 4;
-  const int out_pixel_height = out_pixel_blk / round_out_width;
-  const int out_pixel_width = out_pixel_blk % round_out_width;
-
-  const int out_chan_begin = out_chan_blk * 4;
-  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
-  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
-
-  const int in_pixel = in_height * in_width;
-  const int out_pixel = out_height * out_width;
-
-  const int in_offset = batch * out_chan_num * in_pixel;
-  const int out_offset = batch * out_chan_num * out_pixel;
-  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
-
-  const int pixels = out_pixel_end - out_pixel_begin;
-
-  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    const DATA_TYPE *input_ptr = input_base + i * in_pixel;
-    DATA_TYPE *output_ptr = output_base + i * out_pixel;
-    if (pixels == 4) {
-      VEC_DATA_TYPE(DATA_TYPE,4) res;
-#ifdef STRIDE_1
-      res = vec_pooling_3_s1(input_ptr, in_width);
+#ifdef FP16
+#define MIN_VALUE -HALF_MAX
 #else
-      res = vec_pooling_3_s2(input_ptr, in_width);
+#define MIN_VALUE -FLT_MAX
 #endif
-      vstore4(res, 0, output_ptr);
-    } else {
-      for (int p = 0; p < pixels; ++p) {
-        output_ptr[p] = inner_pooling_3(input_ptr, in_width);
-        input_ptr += stride;
-      }
-    }
-  }
-}

-int calculate_avg_block_size(const int pos_h,
-                             const int pos_w,
-                             const int pool_size,
-                             const int pad_h,
-                             const int pad_w,
-                             const int h_size,
-                             const int w_size) {
-  const int h_start = max(0, pos_h - pad_h);
-  const int w_start = max(0, pos_w - pad_w);
-  const int h_end = min(pos_h + pool_size - pad_h, h_size);
-  const int w_end = min(pos_w + pool_size - pad_w, w_size);
+inline int calculate_avg_block_size(const int pool_size,
+                                    const int pos_h,
+                                    const int pos_w,
+                                    const int h_size,
+                                    const int w_size) {
+  const int h_start = max(0, pos_h);
+  const int w_start = max(0, pos_w);
+  const int h_end = min(pos_h + pool_size, h_size);
+  const int w_end = min(pos_w + pool_size, w_size);
  return (h_end - h_start) * (w_end - w_start);
 }

 // Supported data type: half/float
-__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */
-                       __private const int in_height,
-                       __private const int in_width,
-                       __private const int out_chan_num,
-                       __private const int out_height,
-                       __private const int out_width,
-                       __private const int stride,
-                       __private const int pad_h,
-                       __private const int pad_w,
-                       __private const int pooling_size,
-                       __global DATA_TYPE *output) {
-  int batch = get_global_id(0);
-  int out_chan_idx = get_global_id(1);
-  int out_pixel_idx = get_global_id(2);
-
-  const int out_pixel_height = out_pixel_idx / out_width;
-  const int out_pixel_width = out_pixel_idx % out_width;
-
-  const int out_chan_begin = out_chan_idx * 4;
-  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const int in_pixel_idx = out_pixel_height * stride * in_width
-                             + out_pixel_width * stride;
-
-  const int in_pixel = in_height * in_width;
-  const int out_pixel = out_height * out_width;
-
-  const int in_offset = batch * out_chan_num * in_pixel;
-  const int out_offset = batch * out_chan_num * out_pixel;
-  const DATA_TYPE *input_base = input + in_offset + in_pixel_idx;
-  DATA_TYPE *output_base = output + out_offset + out_pixel_idx;
-
-  const int block_size = calculate_avg_block_size(
-                            out_pixel_height * stride,
-                            out_pixel_width * stride,
-                            pooling_size,
-                            pad_h/2,
-                            pad_w/2,
-                            in_height - pad_h,
-                            in_width - pad_w);
-  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f;
-    DATA_TYPE sum1 = 0.0f;
-    DATA_TYPE *output_ptr = output_base + i * out_pixel;
-    for (int y = 0; y < pooling_size; ++y) {
-      const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width;
-      int x = 0;
-      for (; x < (pooling_size-8); x += 8) {
-        VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr);
-        sum8 += data;
-        input_ptr += 8;
-      }
-      for (; x < pooling_size; ++x) {
-        sum1 += *input_ptr;
-        input_ptr++;
+__kernel void pooling(__read_only image2d_t input,
+                      __private const int in_height,
+                      __private const int in_width,
+                      __private const int out_height,
+                      __private const int pad_top,
+                      __private const int pad_left,
+                      __private const int stride,
+                      __private const int pooling_size,
+                      __write_only image2d_t output) {
+  const int out_chan_idx = get_global_id(0);
+  const int out_width_idx = get_global_id(1);
+  const int out_width = get_global_size(1);
+  const int out_hb_idx = get_global_id(2);
+
+  const int batch_idx = (out_hb_idx / out_height) * in_height;
+  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
+  const int in_width_start = out_width_idx * stride - pad_left;
+  const int in_channel_offset = out_chan_idx * in_width;
+
+
+#ifdef POOL_AVG
+  DATA_TYPE4 res = 0;
+  for (int height = 0; height < pooling_size; ++height) {
+    int in_height_idx = in_height_start + height;
+    in_height_idx = select(batch_idx + in_height_idx,
+                       -1,
+                       (in_height_idx < 0 || in_height_idx >= in_height));
+    for (int width = 0; width < pooling_size; ++width) {
+      int in_width_idx = in_width_start + width;
+      in_width_idx = select(in_channel_offset + in_width_idx,
+                            -1,
+                            (in_width_idx < 0 || in_width_idx >= in_width));
+
+      DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
+      res = res + in;
+    }
+  }
+  const int block_size = calculate_avg_block_size(pooling_size,
+                                                  in_height_start, in_width_start,
+                                                  in_height, in_width);
+  res /= block_size;
+#else
+  DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
+  for (int height = 0; height < pooling_size; ++height) {
+    int in_height_idx = in_height_start + height;
+    in_height_idx = select(batch_idx + in_height_idx,
+                           -1,
+                           (in_height_idx < 0 || in_height_idx >= in_height));
+    if (in_height_idx != -1) {
+      for (int width = 0; width < pooling_size; ++width) {
+        int in_width_idx = in_width_start + width;
+        in_width_idx = select(in_channel_offset + in_width_idx,
+                              -1,
+                              (in_width_idx < 0 || in_width_idx >= in_width));
+
+        if (in_width_idx != -1) {
+          DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx));
+          res = fmax(res, in);
+        }
      }
    }
-    VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
-    VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
-
-    *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
  }
+#endif
+
+  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
 }
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  auto program = runtime->program();

  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);

  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -10,131 +10,94 @@
 namespace mace {
 namespace kernels {

-static void Pooling3(const Tensor *input,
-                     const int *stride,
-                     const PoolingType type,
-                     Tensor *output) {
-  if (type != MAX) {
-    MACE_NOT_IMPLEMENTED;
-  }
+static void Pooling(const Tensor *input,
+                    const int *stride,
+                    const int *paddings,
+                    const int pooling_size,
+                    const PoolingType type,
+                    const DataType dt,
+                    Tensor *output) {
  index_t batch = output->dim(0);
-  index_t channels = output->dim(1);
-  index_t out_height = output->dim(2);
-  index_t out_width = output->dim(3);
+  index_t out_height = output->dim(1);
+  index_t out_width = output->dim(2);
+  index_t channels = output->dim(3);

-  index_t channel_blk = (channels + 3) / 4;
-  const index_t pixel_width = (out_width + 3) / 4 ;
+  index_t channel_blocks = (channels + 3) / 4;
  const uint32_t gws[3] = {
-      static_cast<uint32_t>(batch),
-      static_cast<uint32_t>(channel_blk),
-      static_cast<uint32_t>(pixel_width * out_height),
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
  };

  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
-  built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
-  auto pooling_kernel  = runtime->BuildKernel("pooling", "pooling3", built_options);
+  if (type == MAX && input->dtype() == output->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  }
+  if (type == AVG) {
+    built_options.emplace("-DPOOL_AVG");
+  }
+  auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);

+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);

-  const uint32_t lws[3] = {1, 8, 128};
+  uint32_t lws[3];
+  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));

  uint32_t idx = 0;
-  pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
+  pooling_kernel.setArg(idx++, paddings[0] / 2);
+  pooling_kernel.setArg(idx++, paddings[1] / 2);
  pooling_kernel.setArg(idx++, stride[0]);
-  pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  pooling_kernel.setArg(idx++, pooling_size);
+  pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      pooling_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]),
      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+  MACE_CHECK(error == CL_SUCCESS) << error;
 }

-static void PoolingN(const Tensor *input,
-                     const int *stride,
-                     const int *paddings,
-                     const int pooling_size,
-                     const PoolingType type,
-                     Tensor *output) {
-  if (type != AVG) {
-    MACE_NOT_IMPLEMENTED;
-  }
-  index_t batch = output->dim(0);
-  index_t channels = output->dim(1);
-  index_t out_height = output->dim(2);
-  index_t out_width = output->dim(3);
-
-  index_t channel_blk = (channels + 3) / 4;
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(batch),
-      static_cast<uint32_t>(channel_blk),
-      static_cast<uint32_t>(out_height * out_width),
+template<typename T>
+void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+                                                       Tensor *output) {
+  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  std::vector<index_t> filter_shape = {
+      kernels_[0], kernels_[1],
+      input->dim(3), input->dim(3)
  };

-  auto runtime = OpenCLRuntime::Get();
-  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
-  auto pooling_kernel  = runtime->BuildKernel("pooling", "poolingn", built_options);
+  kernels::CalcNHWCPaddingAndOutputSize(
+      input->shape().data(), filter_shape.data(),
+      dilations_, strides_, this->padding_,
+      output_shape.data(), paddings.data());

-  const uint32_t lws[3] = {1, 8, 128};
+  std::vector<size_t> output_image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+  output->ResizeImage(output_shape, output_image_shape);

-  uint32_t idx = 0;
-  pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
-  pooling_kernel.setArg(idx++, stride[0]);
-  pooling_kernel.setArg(idx++, paddings[0]);
-  pooling_kernel.setArg(idx++, paddings[1]);
-  pooling_kernel.setArg(idx++, pooling_size);
-  pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
+          DataTypeToEnum<T>::value, output);

-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      pooling_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
-}
-
-template <>
-void PoolingFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                          Tensor *output) {
-  int paddings[2];
-  std::vector<index_t> filter_shape = {input->dim(1), input->dim(0),
-                                       kernels_[0], kernels_[1]};
-  kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_,
-                          strides_, this->padding_, paddings);
-#define POOLING_HELPER                                               \
-  switch(kernels_[0]) {                                              \
-    case 3:                                                          \
-      Pooling3(input, strides_, pooling_type_, output);              \
-      break;                                                         \
-    default:                                                         \
-      PoolingN(input, strides_, paddings, kernels_[0],               \
-               pooling_type_, output);                               \
-      break;                                                         \
-  }
-
-  if (paddings[0] > 0 || paddings[1] > 0) {
-    Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
-    ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX);
-    input = &padded_input;
-    POOLING_HELPER
-  } else {
-    POOLING_HELPER
-  }
-#undef POOLING_HELPER
 }

+template
+struct PoolingFunctor<DeviceType::OPENCL, float>;
+template
+struct PoolingFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -18,36 +18,66 @@ enum PoolingType {

 namespace kernels {

-template <DeviceType D, typename T>
-struct PoolingFunctor {
-  PoolingFunctor(const PoolingType pooling_type,
-                 const int *kernels,
-                 const int *strides,
-                 const Padding padding,
-                 const int *dilations)
+struct PoolingFunctorBase {
+  PoolingFunctorBase(const PoolingType pooling_type,
+                     const int *kernels,
+                     const int *strides,
+                     const Padding padding,
+                     const int *dilations)
      : pooling_type_(pooling_type),
        kernels_(kernels),
        strides_(strides),
        padding_(padding),
        dilations_(dilations) {}

+  const PoolingType pooling_type_;
+  const int *kernels_;
+  const int *strides_;
+  const Padding padding_;
+  const int *dilations_;
+};
+
+template<DeviceType D, typename T>
+struct PoolingFunctor : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+
  void operator()(const Tensor *input_tensor,
                  Tensor *output_tensor) {
+
+    std::vector<index_t> output_shape(4);
+    std::vector<int> paddings(2);
+    std::vector<index_t> filter_shape = {
+        kernels_[0], kernels_[1],
+        input_tensor->dim(3), input_tensor->dim(3)
+    };
+
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input_tensor->shape().data(), filter_shape.data(),
+        dilations_, strides_, this->padding_,
+        output_shape.data(), paddings.data());
+    output_tensor->Resize(output_shape);
+
    Tensor::MappingGuard in_guard(input_tensor);
    Tensor::MappingGuard out_guard(output_tensor);
    const T *input = input_tensor->data<T>();
    T *output = output_tensor->mutable_data<T>();
    const index_t *input_shape = input_tensor->shape().data();
-    const index_t *output_shape = output_tensor->shape().data();
    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t height = output_shape[1];
+    index_t width = output_shape[2];
+    index_t channels = output_shape[3];
    index_t out_image_size = height * width;

-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_height = input_shape[1];
+    index_t input_width = input_shape[2];
+    index_t input_channels = input_shape[3];
    index_t in_image_size = input_height * input_width;

    int kernel_h = kernels_[0];
@@ -59,11 +89,6 @@ struct PoolingFunctor {
    int dilation_h = dilations_[0];
    int dilation_w = dilations_[1];

-    int paddings[2];
-    std::vector<index_t> filter_shape = {input_shape[1], input_shape[0],
-                                         kernels_[0], kernels_[1]};
-    kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_,
-                            strides_, this->padding_, paddings);
    // The left-upper most offset of the padded input
    int padded_h_start = 0 - paddings[0] / 2;
    int padded_w_start = 0 - paddings[1] / 2;
@@ -71,25 +96,24 @@ struct PoolingFunctor {
    if (pooling_type_ == MAX) {
 #pragma omp parallel for collapse(2)
      for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
-              T max = std::numeric_limits<T>::lowest();
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
+              T res = std::numeric_limits<T>::lowest();
              for (int kh = 0; kh < kernel_h; ++kh) {
                for (int kw = 0; kw < kernel_w; ++kw) {
                  int inh = padded_h_start + h * stride_h + dilation_h * kh;
                  int inw = padded_w_start + w * stride_w + dilation_w * kw;
                  if (inh >= 0 && inh < input_height && inw >= 0 &&
                      inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
-                    max = std::max(max, input[input_offset]);
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
+                    res = std::max(res, input[input_offset]);
                  }
                }
              }
-              output[out_offset] = max;
-              out_offset += 1;
+              *output = res;
+              output++;
            }
          }
        }
@@ -97,11 +121,10 @@ struct PoolingFunctor {
    } else if (pooling_type_ == AVG) {
 #pragma omp parallel for collapse(2)
      for (int b = 0; b < batch; ++b) {
-        for (int c = 0; c < channels; ++c) {
-          index_t out_offset = (b * channels + c) * out_image_size;
-          index_t in_offset = (b * input_channels + c) * in_image_size;
-          for (int h = 0; h < height; ++h) {
-            for (int w = 0; w < width; ++w) {
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            for (int c = 0; c < channels; ++c) {
+              index_t in_offset = b * in_image_size * input_channels + c;
              T sum = 0;
              int block_size = 0;
              for (int kh = 0; kh < kernel_h; ++kh) {
@@ -110,14 +133,14 @@ struct PoolingFunctor {
                  int inw = padded_w_start + w * stride_w + dilation_w * kw;
                  if (inh >= 0 && inh < input_height && inw >= 0 &&
                      inw < input_width) {
-                    index_t input_offset = in_offset + inh * input_width + inw;
+                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
                    sum += input[input_offset];
                    block_size += 1;
                  }
                }
              }
-              output[out_offset] = sum / block_size;
-              out_offset += 1;
+              *output = sum / block_size;
+              output++;
            }
          }
        }
@@ -125,22 +148,26 @@ struct PoolingFunctor {
    }
  }

-  const PoolingType pooling_type_;
-  const int *kernels_;
-  const int *strides_;
-  const Padding padding_;
-  const int *dilations_;
 };

-template <>
+template<>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input_tensor,
    Tensor *output_tensor);

-template <>
-void PoolingFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input_tensor,
-    Tensor *output_tensor);
+template<typename T>
+struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding,
+                 const int *dilations)
+      : PoolingFunctorBase(pooling_type, kernels,
+                           strides, padding,
+                           dilations) {}
+  void operator()(const Tensor *input_tensor,
+                  Tensor *output_tensor);
+};

 }  //  namespace kernels
 }  //  namespace mace

--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);

-    std::vector<index_t> output_shape(4);
-    std::vector<int> paddings(2);
-    std::vector<index_t> filter_shape(4);
-    // TODO(chenghui): is it kind of a hack?
-    filter_shape[0] = input->shape()[1];
-    filter_shape[1] = input->shape()[0];
-    filter_shape[2] = kernels_[0];
-    filter_shape[3] = kernels_[1];
-
-    kernels::CalcPaddingAndOutputSize(
-        input->shape().data(), filter_shape.data(), this->dilations_.data(),
-        this->strides_.data(), this->padding_, output_shape.data(),
-        paddings.data());
-    output->Resize(output_shape);
-
    functor_(input, output);
    return true;
  };

--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) {

  // Add input data
  net.AddInputFromArray<DeviceType::CPU, float>(
-      "Input", {1, 2, 4, 4},
-      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
+      "Input", {1, 4, 4, 2},
+      {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
+       8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});

  // Run
  net.RunOp();

  // Check
  auto expected =
-      CreateTensor<float>({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31});
+      CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

-TEST_F(PoolingOpTest, AVG_VALID) {
-  // Construct graph
-  auto &net = test_net();
-  OpDefBuilder("Pooling", "PoolingTest")
-      .Input("Input")
-      .Output("Output")
-      .AddIntsArg("kernels", {2, 2})
-      .AddIntsArg("strides", {2, 2})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("pooling_type", PoolingType::AVG)
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>(
-      "Input", {1, 2, 4, 4},
-      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
-
-  // Run
-  net.RunOp();
-
-  // Check
-  auto expected = CreateTensor<float>(
-      {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
-}

 TEST_F(PoolingOpTest, MAX_SAME) {
  // Construct graph
@@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 1, 3, 3},
-                               {0, 1, 2, 3, 4, 5, 6, 7, 8});
+  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
+                                                {0, 1, 2, 3, 4, 5, 6, 7, 8});

  // Run
  net.RunOp();

  // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 2}, {4, 5, 7, 8});
+  auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {

  // Add input data
  net.AddInputFromArray<DeviceType::CPU, float>(
-      "Input", {1, 1, 4, 4},
+      "Input", {1, 4, 4, 1},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

  // Run
  net.RunOp();

  // Check
-  auto expected = CreateTensor<float>({1, 1, 2, 2}, {10, 11, 14, 15});
+  auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {

  // Add input data
  net.AddInputFromArray<DeviceType::CPU, float>(
-      "Input", {1, 1, 2, 9},
+      "Input", {1, 2, 9, 1},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
  // Run
-  net.RunOp(DeviceType::NEON);
+  net.RunOp();

  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 5}, {10, 12, 14, 16, 17});
+  auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

-
-template <DeviceType D>
+template<DeviceType D>
 static void SimpleMaxPooling3S2() {
  // Construct graph
  OpsTestNet net;
-  OpDefBuilder("Pooling", "PoolingTest")
-      .Input("Input")
-      .Output("Output")
-      .AddIntArg("pooling_type", PoolingType::MAX)
-      .AddIntsArg("kernels", {3, 3})
-      .AddIntsArg("strides", {2, 2})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

  // Add input data
  net.AddInputFromArray<D, float>(
-      "Input", {1, 1, 3, 9},
-      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+      "Input", {1, 3, 9, 1},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
       14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
-  // Run
-  net.RunOp(D);
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Pooling", "PoolingTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntArg("pooling_type", PoolingType::MAX)
+        .AddIntsArg("kernels", {3, 3})
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    // Run
+    OpDefBuilder("Pooling", "PoolingTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntArg("pooling_type", PoolingType::MAX)
+        .AddIntsArg("kernels", {3, 3})
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+  }

  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 4}, {20, 22, 24, 26});
+  auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() {
 TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) {
  SimpleMaxPooling3S2<CPU>();
 }
-TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) {
-  SimpleMaxPooling3S2<NEON>();
-}
+
 TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
  SimpleMaxPooling3S2<OPENCL>();
 }

-template <DeviceType D>
-static void AlignedMaxPooling3S2(Padding padding) {
+template<DeviceType D>
+static void MaxPooling3S2(const std::vector<index_t> &input_shape,
+                          const std::vector<int> strides,
+                          Padding padding) {
  // Construct graph
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
@@ -198,22 +185,33 @@ static void AlignedMaxPooling3S2(Padding padding) {
      .Output("Output")
      .AddIntArg("pooling_type", PoolingType::MAX)
      .AddIntsArg("kernels", {3, 3})
-      .AddIntsArg("strides", {2, 2})
+      .AddIntsArg("strides", strides)
      .AddIntArg("padding", padding)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddRandomInput<D, float>("Input", {3, 128, 64, 64});
-  // Run
-  net.RunOp(D);
+  net.AddRandomInput<D, float>("Input", input_shape);
+
+  // run on cpu
+  net.RunOp();
  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

-  // Run on cpu
-  net.RunOp();
+  BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+  OpDefBuilder("Pooling", "PoolingTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntArg("pooling_type", PoolingType::MAX)
+      .AddIntsArg("kernels", {3, 3})
+      .AddIntsArg("strides", strides)
+      .AddIntArg("padding", padding)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(D);
+  ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);

-  ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
 }

 // TODO(chenghui) : there is a bug.
@@ -223,152 +221,140 @@ static void AlignedMaxPooling3S2(Padding padding) {
 //}

 TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
-  AlignedMaxPooling3S2<OPENCL>(Padding::VALID);
-  AlignedMaxPooling3S2<OPENCL>(Padding::SAME);
+  MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
+  MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
+  MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
+  MaxPooling3S2<OPENCL>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
 }

-template <DeviceType D>
-static void UnalignedMaxPooling3S2(Padding padding) {
+TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
+  MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
+  MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
+  MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
+  MaxPooling3S2<OPENCL>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
+}
+
+TEST_F(PoolingOpTest, AVG_VALID) {
  // Construct graph
-  OpsTestNet net;
+  auto &net = test_net();
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("Input")
      .Output("Output")
-      .AddIntArg("pooling_type", PoolingType::MAX)
-      .AddIntsArg("kernels", {3, 3})
+      .AddIntsArg("kernels", {2, 2})
      .AddIntsArg("strides", {2, 2})
-      .AddIntArg("padding", padding)
+      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
+      .AddIntArg("pooling_type", PoolingType::AVG)
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddRandomInput<D, float>("Input", {3, 113, 43, 47});
-  // Run
-  net.RunOp(D);
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
+  net.AddInputFromArray<DeviceType::CPU, float>(
+      "Input", {1, 4, 4, 2},
+      {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
+       8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});

-  // Run on cpu
+  // Run
  net.RunOp();

-  ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 0.001);
-}
-
-// TODO(chenghui) : there is a bug.
-//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) {
-//  UnalignedMaxPooling3S2<NEON>();
-//}
+  // Check
+  auto expected = CreateTensor<float>(
+      {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});

-TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
-  UnalignedMaxPooling3S2<OPENCL>(Padding::VALID);
-  UnalignedMaxPooling3S2<OPENCL>(Padding::SAME);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

-template <DeviceType D>
+template<DeviceType D>
 static void SimpleAvgPoolingTest() {
  // Construct graph
  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 2, 8, 1},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+
+  BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
  OpDefBuilder("Pooling", "PoolingTest")
-      .Input("Input")
-      .Output("Output")
+      .Input("InputImage")
+      .Output("OutputImage")
      .AddIntArg("pooling_type", PoolingType::AVG)
      .AddIntsArg("kernels", {2, 2})
      .AddIntsArg("strides", {2, 2})
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 1, 2, 8},
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
  // Run
  net.RunOp(D);
+  ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);

  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5});
+  auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

-TEST_F(PoolingOpTest, NEONSimpleAvgPooling) {
-  SimpleAvgPoolingTest<NEON>();
-}
-
 TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
  SimpleAvgPoolingTest<OPENCL>();
 }

-template <DeviceType D>
-static void AlignedAvgPoolingTest(Padding padding) {
+template<DeviceType D>
+static void AvgPoolingTest(const std::vector<index_t> &shape,
+                                  const std::vector<int> &kernels,
+                                  const std::vector<int> &strides,
+                                  Padding padding) {
  // Construct graph
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("Input")
      .Output("Output")
      .AddIntArg("pooling_type", PoolingType::AVG)
-      .AddIntsArg("kernels", {4, 4})
-      .AddIntsArg("strides", {4, 4})
+      .AddIntsArg("kernels", kernels)
+      .AddIntsArg("strides", strides)
      .AddIntArg("padding", padding)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddRandomInput<D, float>("Input", {3, 128, 15, 15});
-  // Run
-  net.RunOp(D);
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
+  net.AddRandomInput<D, float>("Input", shape);

-  // Run on cpu
+  // run on cpu
  net.RunOp();
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));

-  ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
-}
-
-TEST_F(PoolingOpTest, NEONAlignedAvgPooling) {
-  AlignedAvgPoolingTest<NEON>(Padding::VALID);
-  AlignedAvgPoolingTest<NEON>(Padding::SAME);
-}
-
-TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
-  AlignedAvgPoolingTest<OPENCL>(Padding::VALID);
-  AlignedAvgPoolingTest<OPENCL>(Padding::SAME);
-}
-
-template <DeviceType D>
-static void UnAlignedAvgPoolingTest(Padding padding) {
-  // Construct graph
-  OpsTestNet net;
+  BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
  OpDefBuilder("Pooling", "PoolingTest")
-      .Input("Input")
-      .Output("Output")
+      .Input("InputImage")
+      .Output("OutputImage")
      .AddIntArg("pooling_type", PoolingType::AVG)
-      .AddIntsArg("kernels", {7, 7})
-      .AddIntsArg("strides", {7, 7})
+      .AddIntsArg("kernels", kernels)
+      .AddIntsArg("strides", strides)
      .AddIntArg("padding", padding)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddRandomInput<D, float>("Input", {3, 128, 31, 37});
-  // Run
  net.RunOp(D);
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
+  ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);

-  // Run on cpu
-  net.RunOp();
+  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
+}

-  ExpectTensorNear<float>(*net.GetOutput("Output"), expected, 1e-5);
+TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
+  AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID);
+  AvgPoolingTest<OPENCL>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
 }

-TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) {
-  UnAlignedAvgPoolingTest<NEON>(Padding::VALID);
-  UnAlignedAvgPoolingTest<NEON>(Padding::SAME);
+TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
+  AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID);
+  AvgPoolingTest<OPENCL>({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
-  UnAlignedAvgPoolingTest<OPENCL>(Padding::VALID);
-  UnAlignedAvgPoolingTest<OPENCL>(Padding::SAME);
+  AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID);
+  AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME);
 }
+
+TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
+  AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID);
+  AvgPoolingTest<OPENCL>({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME);
+}
+