Commit 98c8dab1 authored by 刘琦

Merge branch 'remove-d2s-b2s-b2i' into 'master'

remove d2s b2s for depth_to_space and batch_to_space

See merge request !774
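A quick sketch of what this change means for callers (illustrative only; `context`, `block_size`, `crops`, `paddings` and `block_shape` stand for values a call site already has, as in the ops further below):

// Before this commit, one functor served both directions and took a bool flag:
//   kernels::DepthToSpaceOpFunctor<D, T> f(context, block_size, /*d2s=*/true);
//   kernels::SpaceToBatchFunctor<D, T>   g(context, paddings, block_shape, /*b2s=*/true);
// After this commit, each direction has its own functor and the flag is removed:
kernels::DepthToSpaceOpFunctor<DeviceType::CPU, float> d2s(context, block_size);
kernels::SpaceToDepthOpFunctor<DeviceType::CPU, float> s2d(context, block_size);
kernels::BatchToSpaceFunctor<DeviceType::CPU, float> b2s(context, crops, block_shape);
kernels::SpaceToBatchFunctor<DeviceType::CPU, float> s2b(context, paddings, block_shape);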
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_
#define MACE_KERNELS_BATCH_TO_SPACE_H_
#include <memory>
#include <vector>
#include <algorithm>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct BatchToSpaceFunctorBase : OpKernel {
BatchToSpaceFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
std::vector<int> paddings_;
std::vector<int> block_shape_;
protected:
void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
const DataFormat data_format,
index_t *output_shape) {
MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
index_t batch = input_tensor->dim(0);
index_t channels = 0;
index_t height = 0;
index_t width = 0;
if (data_format == DataFormat::NHWC) {
height = input_tensor->dim(1);
width = input_tensor->dim(2);
channels = input_tensor->dim(3);
} else if (data_format == DataFormat::NCHW) {
height = input_tensor->dim(2);
width = input_tensor->dim(3);
channels = input_tensor->dim(1);
} else {
MACE_NOT_IMPLEMENTED;
}
index_t new_batch = batch / block_shape_[0] / block_shape_[1];
index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1];
index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3];
if (data_format == DataFormat::NHWC) {
output_shape[0] = new_batch;
output_shape[1] = new_height;
output_shape[2] = new_width;
output_shape[3] = channels;
} else {
output_shape[0] = new_batch;
output_shape[1] = channels;
output_shape[2] = new_height;
output_shape[3] = new_width;
}
}
};
template<DeviceType D, typename T>
struct BatchToSpaceFunctor;
template<>
struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NCHW,
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(space_tensor);
Tensor::MappingGuard output_guard(batch_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
const float *input_data = batch_tensor->data<float>();
float *output_data = space_tensor->mutable_data<float>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(2);
index_t in_width = batch_tensor->dim(3);
index_t out_batches = space_tensor->dim(0);
index_t channels = space_tensor->dim(1);
index_t out_height = space_tensor->dim(2);
index_t out_width = space_tensor->dim(3);
// Tile the rows so one tile of output stays within roughly 32KB of cache:
// block_h_size = 32k / sizeof(float) / out_width / block_shape_w.
index_t block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel the outer loop so we can make the best use of the cache
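// For example (illustrative sizes, not taken from this file): with
// out_width = 256 and block_shape_w = 2, block_h_size = 8 * 1024 / 2 / 256
// = 16 rows per tile (8 * 1024 floats is 32KB).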
#pragma omp parallel for collapse(3)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < in_height;
block_h += block_h_size) {
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
std::min(
block_h + block_h_size,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
w += block_shape_w;
} // w
h += block_shape_h;
} // h
} // b
} // block_h
} // c
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> space_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_TO_SPACE_H_
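For orientation, a minimal and hypothetical call site for the CPU functor declared above; the OpKernelContext, the NCHW batch_tensor input and the space_tensor output are assumed to be created elsewhere:

// Paddings (crops) are {height_begin, height_end, width_begin, width_end} and
// block_shape is {block_h, block_w}; both are validated in the constructor above.
kernels::BatchToSpaceFunctor<DeviceType::CPU, float> b2s(
    context, /*paddings=*/{0, 0, 0, 0}, /*block_shape=*/{2, 2});
// Reads batch_tensor (NCHW), resizes and fills space_tensor; future is unused on CPU.
MaceStatus status = b2s(space_tensor, batch_tensor, /*future=*/nullptr);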
@@ -32,9 +32,8 @@ namespace kernels {
 template<DeviceType D, typename T>
 struct DepthToSpaceOpFunctor : OpKernel {
   DepthToSpaceOpFunctor(OpKernelContext *context,
-                        const int block_size,
-                        bool d2s)
-      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
+                        const int block_size)
+      : OpKernel(context), block_size_(block_size) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -44,17 +43,13 @@ struct DepthToSpaceOpFunctor : OpKernel {
     const index_t input_height = input->dim(2);
     const index_t input_width = input->dim(3);
-    index_t output_depth, output_width, output_height;
-    if (d2s_) {
-      output_depth = input_depth / (block_size_ * block_size_);
-      output_width = input_width * block_size_;
-      output_height = input_height * block_size_;
-    } else {
-      output_depth = input_depth * block_size_ * block_size_;
-      output_width = input_width / block_size_;
-      output_height = input_height / block_size_;
-    }
+    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+               "input depth should be dividable by block_size * block_size",
+               input_depth);
+    const index_t output_depth = input_depth / (block_size_ * block_size_);
+    const index_t output_width = input_width * block_size_;
+    const index_t output_height = input_height * block_size_;
     std::vector<index_t> output_shape = {batch_size, output_depth,
                                          output_height, output_width};
@@ -65,78 +60,49 @@ struct DepthToSpaceOpFunctor : OpKernel {
     const T *input_ptr = input->data<T>();
     T *output_ptr = output->mutable_data<T>();
-    if (d2s_) {
 #pragma omp parallel for
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < output_depth; ++d) {
         for (index_t h = 0; h < output_height; ++h) {
           const index_t in_h = h / block_size_;
           const index_t offset_h = (h % block_size_);
           for (int w = 0; w < output_width; ++w) {
             const index_t in_w = w / block_size_;
             const index_t offset_w = w % block_size_;
             const index_t offset_d =
                 (offset_h * block_size_ + offset_w) * output_depth;
             const index_t in_d = d + offset_d;
             const index_t o_index =
                 ((b * output_depth + d) * output_height + h) * output_width
                     + w;
             const index_t i_index =
                 ((b * input_depth + in_d) * input_height + in_h) * input_width
                     + in_w;
             output_ptr[o_index] = input_ptr[i_index];
           }
         }
       }
     }
-    } else {
-#pragma omp parallel for
-      for (index_t b = 0; b < batch_size; ++b) {
-        for (index_t d = 0; d < input_depth; ++d) {
-          for (index_t h = 0; h < input_height; ++h) {
-            const index_t out_h = h / block_size_;
-            const index_t offset_h = (h % block_size_);
-            for (index_t w = 0; w < input_width; ++w) {
-              const index_t out_w = w / block_size_;
-              const index_t offset_w = (w % block_size_);
-              const index_t offset_d =
-                  (offset_h * block_size_ + offset_w) * input_depth;
-              const index_t out_d = d + offset_d;
-              const index_t o_index =
-                  ((b * output_depth + out_d) * output_height + out_h)
-                      * output_width + out_w;
-              const index_t i_index =
-                  ((b * input_depth + d) * input_height + h) * input_width
-                      + w;
-              output_ptr[o_index] = input_ptr[i_index];
-            }
-          }
-        }
-      }
-    }
     return MACE_SUCCESS;
   }
   const int block_size_;
-  bool d2s_;
 };
 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
 struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
   DepthToSpaceOpFunctor(OpKernelContext *context,
-                        const int block_size,
-                        bool d2s)
-      : OpKernel(context), block_size_(block_size), d2s_(d2s) {}
+                        const int block_size)
+      : OpKernel(context), block_size_(block_size) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
   const int block_size_;
-  bool d2s_;
   cl::Kernel kernel_;
   uint32_t kwg_size_;
   std::unique_ptr<BufferBase> kernel_error_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data());
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape_[0]);
kernel_.setArg(idx++, block_shape_[1]);
kernel_.setArg(idx++, paddings_[0]);
kernel_.setArg(idx++, paddings_[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
template struct BatchToSpaceFunctor<DeviceType::GPU, float>;
template struct BatchToSpaceFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include <common.h>
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
__private const int block_width,
__private const int padding_height,
__private const int padding_width,
__private const int batch_size,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
__private const int batch_width) {
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height;
const int block_size = mul24(block_height, block_width);
const int space_b_idx = batch_b_idx % batch_size;
const int remaining_batch_idx = batch_b_idx / batch_size;
const int space_h_idx = (remaining_batch_idx / block_width) +
mul24(batch_h_idx, block_height) - padding_height;
const int space_w_idx = (remaining_batch_idx % block_width) +
mul24(batch_w_idx, block_width) - padding_width;
if (0 <= space_w_idx && space_w_idx < space_width &&
0 <= space_h_idx && space_h_idx < space_height) {
int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
space_b_idx * space_height + space_h_idx);
WRITE_IMAGET(space_data, space_coord, value);
}
}
@@ -39,43 +39,3 @@ __kernel void depth_to_space(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
 }
-
-__kernel void space_to_depth(KERNEL_ERROR_PARAMS
-                             GLOBAL_WORK_GROUP_SIZE_DIM3
-                             __read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int input_width,
-                             __private const int input_depth_blocks,
-                             __private const int output_hb,
-                             __private const int output_width,
-                             __private const int output_depth_blocks,
-                             __write_only image2d_t output) {
-  const int d = get_global_id(0);
-  const int w = get_global_id(1);
-  const int hb = get_global_id(2);
-
-#ifndef NON_UNIFORM_WORK_GROUP
-  if (d >= global_size_dim0 || w >= global_size_dim1
-      || hb >= global_size_dim2) {
-    return;
-  }
-#endif
-
-  const int in_pos = mad24(d, input_width, w);
-
-  const int out_hb = hb / block_size;
-  const int offset_h = hb % block_size;
-  const int out_w = w / block_size;
-  const int offset_w = w % block_size;
-  const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
-  const int out_d = d + offset_d;
-
-  if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
-    return;
-  }
-
-  const int out_pos = mad24(out_d, output_width, out_w);
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
-  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
-}
@@ -49,50 +49,3 @@ __kernel void space_to_batch(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
-
-__kernel void batch_to_space(KERNEL_ERROR_PARAMS
-                             GLOBAL_WORK_GROUP_SIZE_DIM3
-                             __read_only image2d_t batch_data,
-                             __write_only image2d_t space_data,
-                             __private const int block_height,
-                             __private const int block_width,
-                             __private const int padding_height,
-                             __private const int padding_width,
-                             __private const int batch_size,
-                             __private const int space_height,
-                             __private const int space_width,
-                             __private const int batch_height,
-                             __private const int batch_width) {
-  const int chan_idx = get_global_id(0);
-  const int batch_w_idx = get_global_id(1);
-  const int batch_hb_idx = get_global_id(2);
-
-#ifndef NON_UNIFORM_WORK_GROUP
-  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
-      || batch_hb_idx >= global_size_dim2) {
-    return;
-  }
-#endif
-
-  const int batch_b_idx = batch_hb_idx / batch_height;
-  const int batch_h_idx = batch_hb_idx % batch_height;
-
-  const int block_size = mul24(block_height, block_width);
-  const int space_b_idx = batch_b_idx % batch_size;
-  const int remaining_batch_idx = batch_b_idx / batch_size;
-  const int space_h_idx = (remaining_batch_idx / block_width) +
-      mul24(batch_h_idx, block_height) - padding_height;
-  const int space_w_idx = (remaining_batch_idx % block_width) +
-      mul24(batch_w_idx, block_width) - padding_width;
-
-  if (0 <= space_w_idx && space_w_idx < space_width &&
-      0 <= space_h_idx && space_h_idx < space_height) {
-    int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
-    DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
-    int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
-                              space_b_idx * space_height + space_h_idx);
-    WRITE_IMAGET(space_data, space_coord, value);
-  }
-}
#include <common.h>
__kernel void space_to_depth(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
__private const int input_width,
__private const int input_depth_blocks,
__private const int output_hb,
__private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_hb = hb / block_size;
const int offset_h = hb % block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
return;
}
const int out_pos = mad24(out_d, output_width, out_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
}
@@ -30,54 +30,41 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   const index_t input_width = input->dim(2);
   const index_t input_depth = input->dim(3);
-  const char *kernel_name = nullptr;
-  uint32_t gws[3];
-  std::string tuning_key;
-  index_t output_height, output_width, output_depth;
-  if (d2s_) {
-    output_height = input_height * block_size_;
-    output_width = input_width * block_size_;
-    output_depth = input_depth / (block_size_ * block_size_);
-    MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
-        << output_depth;
-    kernel_name = "depth_to_space";
-    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
-    gws[1] = static_cast<uint32_t>(output_width);
-    gws[2] = static_cast<uint32_t>(output_height * batch);
-    tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height,
-                        output_width, output_depth);
-  } else {
-    output_height = input_height / block_size_;
-    output_width = input_width / block_size_;
-    output_depth = input_depth * block_size_ * block_size_;
-    MACE_CHECK(input_depth % 4 == 0, "input channel not support:")
-        << input_depth;
-    kernel_name = "space_to_depth";
-    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
-    gws[1] = static_cast<uint32_t>(input_width);
-    gws[2] = static_cast<uint32_t>(input_height * batch);
-    tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
-                        input->dim(1), input->dim(2), input->dim(3));
-  }
+  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+             "input depth should be dividable by block_size * block_size",
+             input_depth);
+  MACE_CHECK((input_depth % 4) == 0,
+             "input channel should be dividable by 4");
+  const index_t output_height = input_height * block_size_;
+  const index_t output_width = input_width * block_size_;
+  const index_t output_depth = input_depth / (block_size_ * block_size_);
+  MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
+      << output_depth;
   const index_t input_depth_blocks = RoundUpDiv4(input_depth);
   const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch, output_height, output_width,
+  std::vector<index_t> output_shape = {batch,
+                                       output_height,
+                                       output_width,
                                        output_depth};
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(RoundUpDiv4(output_depth)),
+      static_cast<uint32_t>(output_width),
+      static_cast<uint32_t>(output_height * batch)
+  };
   auto runtime = context_->device()->opencl_runtime();
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
+    const char *kernel_name = "depth_to_space";
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::stringstream kernel_name_ss;
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -89,7 +76,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
                                               obfuscated_kernel_name,
                                               built_options,
                                               &kernel_));
-
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
@@ -99,26 +85,20 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     OUT_OF_RANGE_SET_ARG;
     SET_3D_GWS_ARGS(kernel_);
     kernel_.setArg(idx++, *(input->opencl_image()));
-    if (d2s_) {
-      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    } else {
-      kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-      kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    }
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = input->shape();
   }
+  std::string tuning_key = Concat("depth_to_space_opencl_kernel",
+                                  batch, output_height,
+                                  output_width, output_depth);
   const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));
......
@@ -28,27 +28,14 @@ template <typename T>
 MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
   std::vector<index_t> output_shape(4, 0);
-  if (b2s_) {
-    CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
-                                     output_shape.data());
-  } else {
-    CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
-                                     output_shape.data());
-  }
+  CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
+                                   output_shape.data());
-  const char *kernel_name = nullptr;
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                   &output_image_shape);
-  if (b2s_) {
-    MACE_RETURN_IF_ERROR(
-        space_tensor->ResizeImage(output_shape, output_image_shape));
-    kernel_name = "batch_to_space";
-  } else {
-    MACE_RETURN_IF_ERROR(
-        batch_tensor->ResizeImage(output_shape, output_image_shape));
-    kernel_name = "space_to_batch";
-  }
+  MACE_RETURN_IF_ERROR(
+      batch_tensor->ResizeImage(output_shape, output_image_shape));
+  const char *kernel_name = "space_to_batch";
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
   const uint32_t gws[3] = {
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
@@ -79,13 +66,9 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     uint32_t idx = 0;
     OUT_OF_RANGE_SET_ARG;
     SET_3D_GWS_ARGS(kernel_);
-    if (b2s_) {
-      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-    } else {
-      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-    }
+    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
+    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
     kernel_.setArg(idx++, block_shape_[0]);
     kernel_.setArg(idx++, block_shape_[1]);
     kernel_.setArg(idx++, paddings_[0]);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/space_to_depth.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T>
MaceStatus SpaceToDepthOpFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4");
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
}
template struct SpaceToDepthOpFunctor<DeviceType::GPU, float>;
template struct SpaceToDepthOpFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
@@ -33,12 +33,10 @@ namespace kernels {
 struct SpaceToBatchFunctorBase : OpKernel {
   SpaceToBatchFunctorBase(OpKernelContext *context,
                           const std::vector<int> &paddings,
-                          const std::vector<int> &block_shape,
-                          bool b2s)
+                          const std::vector<int> &block_shape)
       : OpKernel(context),
         paddings_(paddings.begin(), paddings.end()),
-        block_shape_(block_shape.begin(), block_shape.end()),
-        b2s_(b2s) {
+        block_shape_(block_shape.begin(), block_shape.end()) {
     MACE_CHECK(
         block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
         "Block's shape should be 1D, and greater than 1");
@@ -47,7 +45,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
   std::vector<int> paddings_;
   std::vector<int> block_shape_;
-  bool b2s_;
 
  protected:
   void CalculateSpaceToBatchOutputShape(const Tensor *input_tensor,
@@ -93,43 +90,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
       output_shape[3] = new_width;
     }
   }
-
-  void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
-                                        const DataFormat data_format,
-                                        index_t *output_shape) {
-    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
-    index_t batch = input_tensor->dim(0);
-    index_t channels = 0;
-    index_t height = 0;
-    index_t width = 0;
-    if (data_format == DataFormat::NHWC) {
-      height = input_tensor->dim(1);
-      width = input_tensor->dim(2);
-      channels = input_tensor->dim(3);
-    } else if (data_format == DataFormat::NCHW) {
-      height = input_tensor->dim(2);
-      width = input_tensor->dim(3);
-      channels = input_tensor->dim(1);
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-    index_t new_batch = batch / block_shape_[0] / block_shape_[1];
-    index_t new_height = height * block_shape_[0] - paddings_[0] - paddings_[1];
-    index_t new_width = width * block_shape_[1] - paddings_[2] - paddings_[3];
-    if (data_format == DataFormat::NHWC) {
-      output_shape[0] = new_batch;
-      output_shape[1] = new_height;
-      output_shape[2] = new_width;
-      output_shape[3] = channels;
-    } else {
-      output_shape[0] = new_batch;
-      output_shape[1] = channels;
-      output_shape[2] = new_height;
-      output_shape[3] = new_width;
-    }
-  }
 };
 
 template<DeviceType D, typename T>
@@ -139,9 +99,8 @@ template<>
 struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape,
-                      bool b2s)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
+                      const std::vector<int> &block_shape)
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
                         Tensor *batch_tensor,
@@ -149,17 +108,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape(4, 0);
-    if (b2s_) {
-      CalculateBatchToSpaceOutputShape(batch_tensor,
-                                       DataFormat::NCHW,
-                                       output_shape.data());
-      MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
-    } else {
-      CalculateSpaceToBatchOutputShape(space_tensor,
-                                       DataFormat::NCHW,
-                                       output_shape.data());
-      MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
-    }
+    CalculateSpaceToBatchOutputShape(space_tensor,
+                                     DataFormat::NCHW,
+                                     output_shape.data());
+    MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
 
     Tensor::MappingGuard input_guard(space_tensor);
     Tensor::MappingGuard output_guard(batch_tensor);
@@ -169,152 +122,85 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     int block_shape_h = block_shape_[0];
     int block_shape_w = block_shape_[1];
-    if (b2s_) {
-      const float *input_data = batch_tensor->data<float>();
-      float *output_data = space_tensor->mutable_data<float>();
-      index_t in_batches = batch_tensor->dim(0);
-      index_t in_height = batch_tensor->dim(2);
-      index_t in_width = batch_tensor->dim(3);
-      index_t out_batches = space_tensor->dim(0);
-      index_t channels = space_tensor->dim(1);
-      index_t out_height = space_tensor->dim(2);
-      index_t out_width = space_tensor->dim(3);
-      // 32k/sizeof(float)/out_width/block_shape
-      index_t
-          block_h_size =
-          std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
-      // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
-      for (index_t c = 0; c < channels; ++c) {
-        for (index_t block_h = 0; block_h < in_height;
-             block_h += block_h_size) {
-          for (index_t in_b = 0; in_b < in_batches; ++in_b) {
-            const index_t b = in_b % out_batches;
-            const index_t tile_index = in_b / out_batches;
-            const index_t tile_h = tile_index / block_shape_w;
-            const index_t tile_w = tile_index % block_shape_w;
-            const index_t valid_h_start = std::max(block_h,
-                                                   (pad_top - tile_h
-                                                       + block_shape_h - 1)
-                                                       / block_shape_h);
-            const index_t valid_h_end = std::min(in_height,
-                                                 std::min(
-                                                     block_h + block_h_size,
-                                                     (out_height + pad_top
-                                                         - tile_h
-                                                         + block_shape_h - 1)
-                                                         / block_shape_h));
-            const index_t valid_w_start = std::max(static_cast<index_t>(0),
-                                                   (pad_left - tile_w
-                                                       + block_shape_w - 1)
-                                                       / block_shape_w);
-            const index_t valid_w_end = std::min(in_width,
-                                                 (out_width + pad_left - tile_w
-                                                     + block_shape_w - 1)
-                                                     / block_shape_w);
-            const float *input_base =
-                input_data + (in_b * channels + c) * in_height * in_width;
-            float *output_base =
-                output_data + (b * channels + c) * out_height * out_width;
-            index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
-            for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
-              index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
-              for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
-                output_base[h * out_width + w] =
-                    input_base[in_h * in_width + in_w];
-                w += block_shape_w;
-              }  // w
-              h += block_shape_h;
-            }  // h
-          }  // b
-        }  // block_h
-      }  // c
-    } else {
     const float *input_data = space_tensor->data<float>();
     float *output_data = batch_tensor->mutable_data<float>();
 
     index_t in_batches = space_tensor->dim(0);
     index_t in_height = space_tensor->dim(2);
     index_t in_width = space_tensor->dim(3);
 
     index_t out_batches = batch_tensor->dim(0);
     index_t channels = batch_tensor->dim(1);
     index_t out_height = batch_tensor->dim(2);
     index_t out_width = batch_tensor->dim(3);
 
     index_t block_h_size =
         std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 
     // make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
     for (index_t c = 0; c < channels; ++c) {
       for (index_t block_h = 0; block_h < out_height;
            block_h += block_h_size) {
         for (index_t b = 0; b < out_batches; ++b) {
           const index_t in_b = b % in_batches;
           const index_t tile_index = b / in_batches;
           const index_t tile_h = tile_index / block_shape_w;
           const index_t tile_w = tile_index % block_shape_w;
           const index_t valid_h_start = std::max(block_h,
                                                  (pad_top - tile_h
                                                      + block_shape_h - 1)
                                                      / block_shape_h);
           const index_t valid_h_end = std::min(out_height,
                                                std::min(
                                                    block_h + block_h_size,
                                                    (in_height + pad_top
                                                        - tile_h
                                                        + block_shape_h - 1)
                                                        / block_shape_h));
           const index_t valid_w_start = std::max(static_cast<index_t>(0),
                                                  (pad_left - tile_w
                                                      + block_shape_w - 1)
                                                      / block_shape_w);
           const index_t valid_w_end = std::min(out_width,
                                                (in_width + pad_left - tile_w
                                                    + block_shape_w - 1)
                                                    / block_shape_w);
           const float *input_base =
               input_data + (in_b * channels + c) * in_height * in_width;
           float *output_base =
               output_data + (b * channels + c) * out_height * out_width;
 
           memset(output_base + block_h * out_width,
                  0,
                  (valid_h_start - block_h) * out_width * sizeof(float));
 
           index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
           for (index_t h = valid_h_start; h < valid_h_end; ++h) {
             memset(output_base + h * out_width,
                    0,
                    valid_w_start * sizeof(float));
 
             index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
             for (index_t w = valid_w_start; w < valid_w_end; ++w) {
               output_base[h * out_width + w] =
                   input_base[in_h * in_width + in_w];
               in_w += block_shape_w;
             }  // w
             in_h += block_shape_h;
 
             memset(output_base + h * out_width + valid_w_end,
                    0,
                    (out_width - valid_w_end) * sizeof(float));
           }  // h
 
           memset(output_base + valid_h_end * out_width,
                  0,
                  (std::min(out_height, block_h + block_h_size) - valid_h_end)
                      * out_width * sizeof(float));
         }  // b
       }  // block_h
     }  // c
-    }
 
     return MACE_SUCCESS;
   }
 };
@@ -324,9 +210,8 @@ template <typename T>
 struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape,
-                      bool b2s)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
+                      const std::vector<int> &block_shape)
+      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
 
   MaceStatus operator()(Tensor *space_tensor,
                         Tensor *batch_tensor,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_
#define MACE_KERNELS_SPACE_TO_DEPTH_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct SpaceToDepthOpFunctor : OpKernel {
SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size)
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch_size = input->dim(0);
const index_t input_depth = input->dim(1);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
MACE_CHECK(
(input_width % block_size_ == 0) && (input_height % block_size_ == 0),
"input width and height should be dividable by block_size");
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_height = input_height / block_size_;
std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
for (index_t h = 0; h < input_height; ++h) {
const index_t out_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (index_t w = 0; w < input_width; ++w) {
const index_t out_w = w / block_size_;
const index_t offset_w = (w % block_size_);
const index_t offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
const index_t out_d = d + offset_d;
const index_t o_index =
((b * output_depth + out_d) * output_height + out_h)
* output_width + out_w;
const index_t i_index =
((b * input_depth + d) * input_height + h) * input_width + w;
output_ptr[o_index] = input_ptr[i_index];
}
}
}
}
return MACE_SUCCESS;
}
const int block_size_;
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel {
explicit SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size)
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_
@@ -19,7 +19,7 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/space_to_batch.h"
+#include "mace/kernels/batch_to_space.h"
 
 namespace mace {
 namespace ops {
@@ -31,8 +31,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
       : Operator<D, T>(op_def, context),
         functor_(context,
                  OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}),
-                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
-                 true) {}
+                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1})) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *batch_tensor = this->Input(INPUT);
@@ -41,7 +40,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
   }
 
  private:
-  kernels::SpaceToBatchFunctor<D, T> functor_;
+  kernels::BatchToSpaceFunctor<D, T> functor_;
 
  protected:
   MACE_OP_INPUT_TAGS(INPUT);
......
@@ -30,26 +30,13 @@ class DepthToSpaceOp : public Operator<D, T> {
   DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context)
       : Operator<D, T>(op_def, context),
         block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)),
-        functor_(context, this->block_size_, true) {}
+        functor_(context, this->block_size_) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
-
-    int input_depth;
-    if (D == CPU) {
-      input_depth = input->dim(1);
-    } else if (D == GPU) {
-      input_depth = input->dim(3);
-    } else {
-      MACE_NOT_IMPLEMENTED;
-    }
-    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
-               "input depth should be dividable by block_size * block_size",
-               input_depth);
-    MACE_CHECK((input_depth % 4) == 0,
-               "input channel should be dividable by 4");
 
     return functor_(input, output, future);
   }
......
...@@ -24,21 +24,18 @@ namespace test { ...@@ -24,21 +24,18 @@ namespace test {
namespace { namespace {
template <DeviceType D> template <DeviceType D>
void RunDepthToSpace(const bool d2s, void RunDepthToSpace(const std::vector<index_t> &input_shape,
const std::vector<index_t> &input_shape,
const std::vector<float> &input_data, const std::vector<float> &input_data,
const int block_size, const int block_size,
const std::vector<index_t> &expected_shape, const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data) { const std::vector<float> &expected_data) {
OpsTestNet net; OpsTestNet net;
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
const char *ops_name = (d2s) ? "DepthToSpace" : "SpaceToDepth";
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph // Construct graph
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW") .Input("InputNCHW")
.Output("OutputNCHW") .Output("OutputNCHW")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
...@@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s,
} else { } else {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder(ops_name, ops_test_name) OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage") .Input("InputImage")
.Output("OutputImage") .Output("OutputImage")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
...@@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s, ...@@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s,
} }
} // namespace } // namespace
class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunDepthToSpace<DeviceType::CPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::GPU>(
false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
class DepthToSpaceOpTest : public OpsTestBase {};
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
@@ -119,7 +80,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 2, 4, 4},
@@ -129,14 +90,14 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
  RunDepthToSpace<DeviceType::CPU>(
      {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
@@ -144,14 +105,13 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
  const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
  RunDepthToSpace<DeviceType::GPU>({1, 192, 192, 128}, in, 2,
                                   {1, 384, 384, 32}, in);
}
namespace {
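// Cross-checks DepthToSpace on random data: the op is run on the CPU (NCHW)
// as a reference and again on device D via OpenCL image buffers, and the two
// outputs are compared within a type-dependent tolerance.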
template <DeviceType D, typename T>
void RandomTest(const int block_size,
                const std::vector<index_t> &shape) {
  testing::internal::LogToStderr();
  srand(time(NULL));
@@ -159,14 +119,11 @@ void RandomTest(const bool d2s,
  // Construct graph
  OpsTestNet net;
  // Add input data
  net.AddRandomInput<D, float>("Input", shape);
  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
                                                  NCHW);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("InputNCHW")
      .AddIntArg("block_size", block_size)
      .Output("OutputNCHW")
@@ -181,7 +138,7 @@ void RandomTest(const bool d2s,
  BufferToImage<D, T>(&net, "Input", "InputImg",
                      kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("InputImg")
      .AddIntArg("block_size", block_size)
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
@@ -205,19 +162,11 @@ void RandomTest(const bool d2s,
} // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 128});
}
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
  RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 128});
}
} // namespace test
...
@@ -31,8 +31,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
      : Operator<D, T>(op_def, context),
        functor_(context,
                 OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
                 OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1})) {}
  MaceStatus Run(StatsFuture *future) override {
    const Tensor *space_tensor = this->Input(INPUT);
...
@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_depth.h"
namespace mace {
namespace ops {
@@ -30,34 +30,12 @@ class SpaceToDepthOp : public Operator<D, T> {
  SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context)
      : Operator<D, T>(op_def, context),
        functor_(context,
                 OperatorBase::GetOptionalArg<int>("block_size", 1)) {}
  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);
    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
    return functor_(input, output, future);
  }
@@ -66,7 +44,7 @@ class SpaceToDepthOp : public Operator<D, T> {
  MACE_OP_OUTPUT_TAGS(OUTPUT);
 private:
  kernels::SpaceToDepthOpFunctor<D, T> functor_;
};
} // namespace ops
...
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
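// Builds a SpaceToDepth graph for device D (CPU uses NCHW, GPU uses an OpenCL
// image), runs it on input_data, and checks the output against
// expected_shape/expected_data.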
template <DeviceType D>
void RunSpaceToDepth(const std::vector<index_t> &input_shape,
const std::vector<float> &input_data,
const int block_size,
const std::vector<index_t> &expected_shape,
const std::vector<float> &expected_data) {
OpsTestNet net;
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
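// SpaceToDepth is the inverse of DepthToSpace: each block_size x block_size
// spatial block is folded into the channel dimension, so an NHWC tensor
// [N, H, W, C] becomes [N, H/b, W/b, C*b*b], as the shapes in the tests below
// show ({1, 2, 4, 4} -> {1, 1, 2, 16} with b = 2).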
class SpaceToDepthOpTest : public OpsTestBase {};
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
}
TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
RunSpaceToDepth<DeviceType::CPU>(
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunSpaceToDepth<DeviceType::GPU>(
{1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
}
namespace {
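// Cross-checks SpaceToDepth on random data against the CPU reference, with a
// tolerance of 1e-5 for float outputs and 1e-3/1e-4 for half outputs.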
template <DeviceType D, typename T>
void RandomTest(const int block_size,
const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
srand(time(NULL));
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputImg")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4);
}
}
} // namespace
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 32});
}
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 32});
}
} // namespace test
} // namespace ops
} // namespace mace