Feature: S2D and D2S support depth < 4.

894096bd · liuqi · 52a379a2 · 894096bd · 894096bd · 894096bd
10 changed file
--- a/mace/core/net_def_adapter.cc
+++ b/mace/core/net_def_adapter.cc
@@ -164,6 +164,7 @@ MaceStatus NetDefAdapter::AdaptNetDef(
        input_info->set_dims(j, input_shape[j]);
      }
    }
+    tensor_shape_map.emplace(input_info->name(), input_shape);
    output_map.emplace(input_info->name(), InternalOutputInfo(
        mem_type, input_info->data_type(),
        input_data_format, input_shape, -1));

--- a/mace/ops/opencl/cl/depth_to_space.cl
+++ b/mace/ops/opencl/cl/depth_to_space.cl
 #include <common.h>
-__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
+__kernel void depth_to_space_d1_d2(OUT_OF_RANGE_PARAMS
                                   GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input,
-                             __private const int block_size,
+                                   __private const int input_height,
-                             __private const int input_hb,
                                   __private const int input_width,
-                             __private const int input_depth_blocks,
+                                   __private const int block_size,
+                                   __private const int output_height,
                                   __private const int output_width,
-                             __private const int output_depth_blocks,
+                                   __private const int output_depth,
                                   __write_only image2d_t output) {
-  const int out_d = get_global_id(0);
+  const int in_depth_blk_idx = get_global_id(0);
-  const int out_w = get_global_id(1);
+  const int in_width_idx = get_global_id(1);
-  const int out_hb = get_global_id(2);
+  const int in_hb_idx = get_global_id(2);
 #ifndef NON_UNIFORM_WORK_GROUP
-  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
+  if (in_depth_blk_idx >= global_size_dim0 || in_width_idx >= global_size_dim1
-      || out_hb >= global_size_dim2) {
+      || in_hb_idx >= global_size_dim2) {
    return;
  }
 #endif
+  const int batch_idx = in_hb_idx / input_height;
+  const int in_height_idx = in_hb_idx - mul24(batch_idx, input_height);
+  int in_depth_idx = in_depth_blk_idx << 2;
+  int hw_block_size = in_depth_idx / output_depth;
+  int out_depth_idx = in_depth_idx - mul24(hw_block_size, output_depth);
+  int bottom_width_idx = mul24(in_width_idx, block_size);
+  int out_width_idx = bottom_width_idx + (hw_block_size % block_size);
+  int out_height_idx = mad24(in_height_idx, block_size,
+      hw_block_size / block_size);
-  const int out_pos = mad24(out_d, output_width, out_w);
+  const int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx);
+  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_hb_idx));
-  const int in_hb = out_hb / block_size;
+  int out_x_base = mul24((out_depth_idx >> 2), output_width);
-  const int offset_h = out_hb - mul24(in_hb, block_size);
+  int out_x = out_x_base + out_width_idx;
-  const int in_w = out_w / block_size;
+  int out_y = mad24(batch_idx, output_height, out_height_idx);
-  const int offset_w = out_w - mul24(in_w, block_size);
+#if defined(DEPTH1)
-  const int offset_d = mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks);
+  int top_width = mul24(in_width_idx + 1, block_size);
-  const int in_d = out_d + offset_d;
+  int top_height = mul24(in_height_idx + 1, block_size);
+  DATA_TYPE4 t_out_data = 0;
+  int t_out_x = out_x;
+  int t_width_idx = out_width_idx;
+  int t_height_idx = out_height_idx;
+  DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data);
+  for (int i = 0; i < 4; ++i) {
+    t_out_data.x = in_data_ptr[i];
+    WRITE_IMAGET(output, (int2)(t_out_x, out_y), t_out_data);
+    if (t_width_idx + 1 >= top_width) {
+      if (t_height_idx + 1 >= top_height) {
+        break;
+      }
+      t_width_idx = bottom_width_idx;
+      t_out_x = out_x_base + t_width_idx;
+      t_height_idx += 1;
+      out_y += 1;
+    } else {
+      t_width_idx += 1;
+      t_out_x += 1;
+    }
+  }
+#elif defined(DEPTH2)
+  int top_width = mul24(in_width_idx + 1, block_size);
+  int top_height = mul24(in_height_idx + 1, block_size);
+  DATA_TYPE4 t_out_data = 0;
+  t_out_data.x = in_data.x;
+  t_out_data.y = in_data.y;
+  WRITE_IMAGET(output, (int2)(out_x, out_y), t_out_data);
+  t_out_data.x = in_data.z;
+  t_out_data.y = in_data.w;
+  if (out_width_idx + 1 >= top_width) {
+    if (out_height_idx + 1 < top_height) {
+      int t_out_x = out_x_base + bottom_width_idx;
+      WRITE_IMAGET(output, (int2)(t_out_x, out_y + 1), t_out_data);
+    }
+  } else {
+    WRITE_IMAGET(output, (int2)(out_x + 1, out_y), t_out_data);
+  }
+#endif
+}
-  if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {
+__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
+                             __read_only image2d_t input,
+                             __private const int input_height,
+                             __private const int input_width,
+                             __private const int block_size,
+                             __private const int output_height,
+                             __private const int output_width,
+                             __private const int output_depth,
+                             __write_only image2d_t output) {
+  const int out_depth_blk_idx = get_global_id(0);
+  const int out_width_idx = get_global_id(1);
+  const int out_hb_idx = get_global_id(2);
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
+      || out_hb_idx >= global_size_dim2) {
    return;
  }
+#endif
+  const int batch_idx = out_hb_idx / output_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height);
-  const int in_pos = mad24(in_d, input_width, in_w);
+  int in_height_idx = out_height_idx / block_size;
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb));
+  int height_idx_in_blk = out_height_idx - mul24(in_height_idx, block_size);
+  int in_width_idx = out_width_idx / block_size;
+  int width_idx_in_blk = out_width_idx - mul24(in_width_idx, block_size);
+  int in_depth_idx = mad24(
+      mad24(height_idx_in_blk, block_size, width_idx_in_blk),
+      output_depth, out_depth_blk_idx << 2);
+  int in_depth_blk_idx = in_depth_idx >> 2;
+  int in_x = mad24(in_depth_blk_idx, input_width, in_width_idx);
+  int in_y = mad24(batch_idx, input_height, in_height_idx);
+  DATA_TYPE4 out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
+#ifdef DEPTH3
+  DATA_TYPE4 t_out_data = out_data;
+  int left_part_size = 4 - (in_depth_idx & 0x3);
+  switch(left_part_size) {
+    case 1:
+      out_data.x = t_out_data.w;
+      break;
+    case 2:
+      out_data.x = t_out_data.z;
+      out_data.y = t_out_data.w;
+      break;
+    case 3:
+      out_data.x = t_out_data.y;
+      out_data.y = t_out_data.z;
+      out_data.z = t_out_data.w;
+      break;
+    case 4:
+      out_data.x = t_out_data.x;
+      out_data.y = t_out_data.y;
+      out_data.z = t_out_data.z;
+      break;
+    default:
+      out_data = 0;
+  }
+  int right_part_size = 3 - left_part_size;
+  if (right_part_size > 0) {
+    int in_depth_blks = mul24(mul24(block_size, block_size), 3) >> 2;
+    in_x = select(-1, in_x + input_width, in_depth_blk_idx + 1 < in_depth_blks);
+    t_out_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
+    switch (right_part_size) {
+      case 2:
+        out_data.y = t_out_data.x;
+        out_data.z = t_out_data.y;
+        break;
+      case 1:
+        out_data.z = t_out_data.x;
+        break;
+      default:
+        out_data = 0;
+    }
+  }
+  out_data.w = 0;
+#endif
-  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
+  int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx);
+  WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), out_data);
 }
--- a/mace/ops/opencl/cl/space_to_depth.cl
+++ b/mace/ops/opencl/cl/space_to_depth.cl
@@ -3,39 +3,130 @@
 __kernel void space_to_depth(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input,
-                             __private const int block_size,
+                             __private const int input_height,
                             __private const int input_width,
-                             __private const int input_depth_blocks,
+                             __private const int input_depth,
-                             __private const int output_hb,
+                             __private const int block_size,
+                             __private const int output_height,
                             __private const int output_width,
-                             __private const int output_depth_blocks,
                             __write_only image2d_t output) {
-  const int d = get_global_id(0);
+  const int out_depth_blk_idx = get_global_id(0);
-  const int w = get_global_id(1);
+  const int out_width_idx = get_global_id(1);
-  const int hb = get_global_id(2);
+  const int out_hb_idx = get_global_id(2);
 #ifndef NON_UNIFORM_WORK_GROUP
-  if (d >= global_size_dim0 || w >= global_size_dim1
+  if (out_depth_blk_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
-      || hb >= global_size_dim2) {
+      || out_hb_idx >= global_size_dim2) {
    return;
  }
 #endif
-  const int in_pos = mad24(d, input_width, w);
+  const int batch_idx = out_hb_idx / output_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, output_height);
-  const int out_hb = hb / block_size;
+  int out_depth_idx = out_depth_blk_idx << 2;
-  const int offset_h = hb - mul24(out_hb, block_size);
+  int in_depth_idx = out_depth_idx % input_depth;
-  const int out_w = w / block_size;
+  int hw_block_size = out_depth_idx / input_depth;
-  const int offset_w = w - mul24(out_w, block_size);
+  int bottom_width_idx = mul24(out_width_idx, block_size);
-  const int offset_d = mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w));
+  int in_width_idx = bottom_width_idx + (hw_block_size % block_size);
-  const int out_d = d + offset_d;
+  int in_height_idx = mad24(out_height_idx, block_size,
+      hw_block_size / block_size);
-  if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {
+  DATA_TYPE4 in_data = 0;
-    return;
+  int in_x = mad24((in_depth_idx >> 2), input_width, in_width_idx);
+  int in_y = mad24(batch_idx, input_height, in_height_idx);
+#if defined(DEPTH1)
+  int top_width = mul24(out_width_idx + 1, block_size);
+  int top_height = mul24(out_height_idx + 1, block_size);
+  DATA_TYPE4 t_in_data = 0;
+  int t_in_x = in_x;
+  int t_width_idx = in_width_idx;
+  int t_height_idx = in_height_idx;
+  DATA_TYPE *in_data_ptr = (DATA_TYPE*)(&in_data);
+  for (int i = 0; i < 4; ++i) {
+    t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y));
+    in_data_ptr[i] = t_in_data.x;
+    if (t_width_idx + 1 >= top_width) {
+      if (t_height_idx + 1 >= top_height) {
+        break;
      }
+      t_width_idx = bottom_width_idx;
+      t_in_x = mad24((in_depth_idx >> 2), input_width, t_width_idx);
+      t_height_idx += 1;
+      in_y += 1;
+    } else {
+      t_width_idx += 1;
+      t_in_x += 1;
+    }
+  }
+#elif defined(DEPTH2)
+  int top_width = mul24(out_width_idx + 1, block_size);
+  int top_height = mul24(out_height_idx + 1, block_size);
+  DATA_TYPE4 t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
+  in_data.x = t_in_data.x;
+  in_data.y = t_in_data.y;
+  t_in_data = 0;
+  if (in_width_idx + 1 >= top_width) {
+    if (in_height_idx + 1 < top_height) {
+      int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx);
+      t_in_data = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1));
+    }
+  } else {
+    t_in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y));
+  }
+  in_data.z = t_in_data.x;
+  in_data.w = t_in_data.y;
+#elif defined(DEPTH3)
+  int top_width = mul24(out_width_idx + 1, block_size);
+  int top_height = mul24(out_height_idx + 1, block_size);
+  DATA_TYPE4 in_data0 = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
+  DATA_TYPE4 in_data1 = 0;
+  if (in_width_idx + 1 >= top_width) {
+    if (in_height_idx + 1 < top_height) {
+      int t_in_x = mad24((in_depth_idx >> 2), input_width, bottom_width_idx);
+      in_data1 = READ_IMAGET(input, SAMPLER, (int2)(t_in_x, in_y + 1));
+    }
+  } else {
+    in_data1 = READ_IMAGET(input, SAMPLER, (int2)(in_x + 1, in_y));
+  }
+  int left_part_size = 3 - in_depth_idx;
+  int right_part_size = 4 - left_part_size;
+  switch(left_part_size) {
+    case 3:
+      in_data.z = in_data0.z;
+      in_data.y = in_data0.y;
+      in_data.x = in_data0.x;
+      break;
+    case 2:
+      in_data.y = in_data0.z;
+      in_data.x = in_data0.y;
+      break;
+    case 1:
+      in_data.x = in_data0.z;
+      break;
+    default:
+      in_data = 0;
+  }
+  switch(right_part_size) {
+    case 3:
+      in_data.y = in_data1.x;
+      in_data.z = in_data1.y;
+      in_data.w = in_data1.z;
+      break;
+    case 2:
+      in_data.z = in_data1.x;
+      in_data.w = in_data1.y;
+      break;
+    case 1:
+      in_data.w = in_data1.x;
+      break;
+    default:
+      in_data = 0;
+  }
+#else
+  in_data = READ_IMAGET(input, SAMPLER, (int2)(in_x, in_y));
+#endif
-  const int out_pos = mad24(out_d, output_width, out_w);
+  const int out_x = mad24(out_depth_blk_idx, output_width, out_width_idx);
-  DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, hb));
+  WRITE_IMAGET(output, (int2)(out_x, out_hb_idx), in_data);
-  WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
 }
--- a/mace/ops/opencl/image/depth_to_space.h
+++ b/mace/ops/opencl/image/depth_to_space.h
@@ -58,19 +58,14 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
  const index_t input_depth = input->dim(3);
  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
-             "input depth should be dividable by block_size * block_size",
+             "input depth should be dividable by block_size * block_size ",
             input_depth);
-  MACE_CHECK((input_depth % 4) == 0,
-             "input channel should be dividable by 4");
  const index_t output_height = input_height * block_size_;
  const index_t output_width = input_width * block_size_;
  const index_t output_depth = input_depth / (block_size_ * block_size_);
-  MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
+  MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
-    << output_depth;
+             "output channel not support:") << output_depth;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
-  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch,
                                       output_height,
@@ -82,11 +77,16 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const uint32_t gws[3] = {
+  uint32_t gws[3];
-      static_cast<uint32_t>(RoundUpDiv4(output_depth)),
+  if (output_depth < 3) {
-      static_cast<uint32_t>(output_width),
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
-      static_cast<uint32_t>(output_height * batch)
+    gws[1] = static_cast<uint32_t>(input_width);
-  };
+    gws[2] = static_cast<uint32_t>(input_height * batch);
+  } else {
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
+    gws[1] = static_cast<uint32_t>(output_width);
+    gws[2] = static_cast<uint32_t>(output_height * batch);
+  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
@@ -95,6 +95,10 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "depth_to_space";
+    if (output_depth < 4) {
+      built_options.emplace(MakeString("-DDEPTH", output_depth));
+      if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
+    }
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -116,18 +120,18 @@ MaceStatus DepthToSpaceKernel<T>::Compute(
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
-  std::string  tuning_key = Concat("depth_to_space_opencl_kernel",
+  std::string tuning_key = Concat("depth_to_space",
                                  batch, output_height,
                                  output_width, output_depth);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);

--- a/mace/ops/opencl/image/space_to_depth.h
+++ b/mace/ops/opencl/image/space_to_depth.h
@@ -57,7 +57,7 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
-  MACE_CHECK((input_depth % 4) == 0,
+  MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
             "input channel should be dividable by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
@@ -67,7 +67,6 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
  const index_t output_width = input_width / block_size_;
  const index_t output_depth = input_depth * block_size_ * block_size_;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch, output_height, output_width,
@@ -90,6 +89,9 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    if (input_depth < 4) {
+      built_options.emplace(MakeString("-DDEPTH", input_depth));
+    }
    built_options.emplace(kernel_name_ss.str());
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
@@ -102,28 +104,28 @@ MaceStatus SpaceToDepthKernel<T>::Compute(
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
+  const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
-                           static_cast<uint32_t>(input_width),
+                           static_cast<uint32_t>(output_width),
-                           static_cast<uint32_t>(input_height * batch)};
+                           static_cast<uint32_t>(output_height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
+  std::string tuning_key = Concat("space_to_depth", input->dim(0),
                                  input->dim(1), input->dim(2), input->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));

--- a/mace/tools/benchmark/BUILD.bazel
+++ b/mace/tools/benchmark/BUILD.bazel
@@ -28,10 +28,7 @@ cc_binary(
        "//mace/codegen:generated_models",
        "//mace/libmace",
        "//mace/utils",
-    ] + if_opencl_enabled([
+    ],
-        "//mace/codegen:generated_opencl_binary",
-        "//mace/codegen:generated_opencl_parameter",
-    ]),
 )
 cc_binary(
@@ -60,10 +57,7 @@ cc_binary(
        "//mace/codegen:generated_mace_engine_factory",
        "//mace/codegen:generated_models",
        "//mace/libmace:libmace_dynamic",
-    ] + if_opencl_enabled([
+    ],
-        "//mace/codegen:generated_opencl_binary",
-        "//mace/codegen:generated_opencl_parameter",
-    ]),
 )
 cc_library(

--- a/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc
+++ b/test/ccbenchmark/mace/ops/depth_to_space_benchmark.cc
@@ -78,9 +78,15 @@ void DepthToSpace(
  MACE_BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU)
 #endif
+MACE_BM_DEPTH_TO_SPACE(1, 4, 512, 512, 2);
+MACE_BM_DEPTH_TO_SPACE(1, 8, 256, 256, 2);
+MACE_BM_DEPTH_TO_SPACE(1, 12, 512, 512, 2);
+MACE_BM_DEPTH_TO_SPACE(1, 9, 512, 512, 3);
+MACE_BM_DEPTH_TO_SPACE(1, 18, 256, 256, 3);
+MACE_BM_DEPTH_TO_SPACE(1, 27, 512, 512, 3);
 MACE_BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
-MACE_BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 128, 128, 128, 4);
-MACE_BM_DEPTH_TO_SPACE(1, 64, 256, 256, 4);
+MACE_BM_DEPTH_TO_SPACE(1, 256, 256, 256, 4);
 }  // namespace test
 }  // namespace ops

--- a/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc
+++ b/test/ccbenchmark/mace/ops/space_to_depth_benchmark.cc
@@ -29,7 +29,7 @@ void SpaceToDepth(
  // Add input data
  if (D == DeviceType::CPU) {
-    net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
@@ -78,6 +78,10 @@ void SpaceToDepth(
  MACE_BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU)
 #endif
+MACE_BM_SPACE_TO_DEPTH(1, 1, 513, 513, 3);
+MACE_BM_SPACE_TO_DEPTH(1, 2, 256, 256, 2);
+MACE_BM_SPACE_TO_DEPTH(1, 3, 512, 512, 2);
+MACE_BM_SPACE_TO_DEPTH(1, 3, 513, 513, 3);
 MACE_BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
 MACE_BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);
 MACE_BM_SPACE_TO_DEPTH(1, 64, 256, 256, 4);

--- a/test/ccunit/mace/ops/depth_to_space_test.cc
+++ b/test/ccunit/mace/ops/depth_to_space_test.cc
@@ -61,7 +61,45 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
 class DepthToSpaceOpTest : public OpsTestBase {};
-TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
+TEST_F(DepthToSpaceOpTest, CPUInputDepthLess4) {
+  RunDepthToSpace<DeviceType::CPU>(
+      {1, 1, 2, 9},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17},
+      3,
+      {1, 3, 6, 1},
+      {0, 1, 2, 9, 10, 11,
+       3, 4, 5, 12, 13, 14,
+       6, 7, 8, 15, 16, 17});
+  RunDepthToSpace<DeviceType::CPU>(
+      {1, 1, 2, 18},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+      3,
+      {1, 3, 6, 2},
+      {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
+       6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
+       12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35});
+  RunDepthToSpace<DeviceType::CPU>(
+      {1, 1, 2, 12},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+      2,
+      {1, 2, 4, 3},
+      {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
+       6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23});
+  RunDepthToSpace<DeviceType::CPU>(
+      {1, 1, 1, 27},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26},
+      3,
+      {1, 3, 3, 3},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26});
+}
+TEST_F(DepthToSpaceOpTest, CPUInputDepth4) {
  RunDepthToSpace<DeviceType::CPU>(
      {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -69,9 +107,57 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
      2, {1, 2, 4, 4},
      {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
+  RunDepthToSpace<DeviceType::CPU>(
+      {1, 1, 1, 16},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+}
+TEST_F(DepthToSpaceOpTest, OPENCLInputDepth1) {
+  RunDepthToSpace<DeviceType::GPU>(
+      {1, 1, 2, 9},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17},
+      3,
+      {1, 3, 6, 1},
+      {0, 1, 2, 9, 10, 11,
+       3, 4, 5, 12, 13, 14,
+       6, 7, 8, 15, 16, 17});
+}
+TEST_F(DepthToSpaceOpTest, OPENCLInputDepth2) {
+  RunDepthToSpace<DeviceType::GPU>(
+      {1, 1, 2, 18},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35},
+      3,
+      {1, 3, 6, 2},
+      {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
+       6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
+       12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35});
+}
+TEST_F(DepthToSpaceOpTest, OPENCLInputDepth3) {
+  RunDepthToSpace<DeviceType::GPU>(
+      {1, 1, 2, 12},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23},
+      2,
+      {1, 2, 4, 3},
+      {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
+       6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23});
+  RunDepthToSpace<DeviceType::GPU>(
+      {1, 1, 1, 27},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26},
+      3,
+      {1, 3, 3, 3},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26});
 }
-TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
+TEST_F(DepthToSpaceOpTest, OPENCLInputDepth4) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 2, 16},
      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
@@ -79,29 +165,12 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
      2, {1, 2, 4, 4},
      {0, 1, 2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31});
-}
-TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
-  RunDepthToSpace<DeviceType::CPU>(
-      {1, 1, 1, 16},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-}
-TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
  RunDepthToSpace<DeviceType::GPU>(
      {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
 }
-TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
-  const std::vector<float> in = std::vector<float>(192 * 192 * 128, 1.0);
-  RunDepthToSpace<DeviceType::GPU>({1, 192, 192, 128}, in, 2,
-                                   {1, 384, 384, 32}, in);
-}
 namespace {
 template <DeviceType D, typename T>
 void RandomTest(const int block_size,
@@ -149,14 +218,51 @@ void RandomTest(const int block_size,
 }
 }  // namespace
+TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth1) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 4});
+  RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 9});
+  RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 25});
+  RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 49});
+}
+TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth2) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 8});
+  RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 18});
+  RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 50});
+  RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 98});
+}
+TEST_F(DepthToSpaceOpTest, OPENCLRandomFloatDepth3) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 12});
+  RandomTest<DeviceType::GPU, float>(3, {1, 111, 111, 27});
+  RandomTest<DeviceType::GPU, float>(5, {1, 20, 20, 75});
+  RandomTest<DeviceType::GPU, float>(7, {1, 14, 14, 147});
+}
 TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
-  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 128});
+  RandomTest<DeviceType::GPU, float>(2, {1, 192, 192, 16});
+  RandomTest<DeviceType::GPU, float>(3, {1, 222, 222, 144});
+  RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 200});
+  RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 196});
 }
 TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
-  RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 128});
+  RandomTest<DeviceType::GPU, half>(2, {1, 192, 192, 4});
+  RandomTest<DeviceType::GPU, half>(3, {1, 111, 111, 18});
+  RandomTest<DeviceType::GPU, half>(5, {1, 20, 20, 75});
+  RandomTest<DeviceType::GPU, half>(7, {1, 14, 14, 147});
+  RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 8});
 }
+TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) {
+  RandomTest<DeviceType::GPU, half>(2, {2, 192, 192, 4});
+  RandomTest<DeviceType::GPU, half>(3, {3, 111, 111, 18});
+  RandomTest<DeviceType::GPU, half>(5, {2, 20, 20, 75});
+  RandomTest<DeviceType::GPU, half>(7, {3, 14, 14, 147});
+  RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 8});
+}
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
--- a/test/ccunit/mace/ops/space_to_depth_test.cc
+++ b/test/ccunit/mace/ops/space_to_depth_test.cc
@@ -61,38 +61,106 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
 class SpaceToDepthOpTest : public OpsTestBase {};
-TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
+TEST_F(SpaceToDepthOpTest, CPUInputDepthLess4) {
  RunSpaceToDepth<DeviceType::CPU>(
-      {1, 2, 4, 4},
+      {1, 3, 6, 1},
-      {0, 1, 2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23,
+      {0, 1, 2, 9, 10, 11,
-       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
+       3, 4, 5, 12, 13, 14,
-      2, {1, 1, 2, 16},
+       6, 7, 8, 15, 16, 17},
-      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+      3,
-       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
+      {1, 1, 2, 9},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
+  RunSpaceToDepth<DeviceType::CPU>(
+      {1, 3, 6, 2},
+      {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
+       6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
+       12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35},
+      3,
+      {1, 1, 2, 18},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35});
+  RunSpaceToDepth<DeviceType::CPU>(
+      {1, 2, 4, 3},
+      {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
+       6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23},
+      2,
+      {1, 1, 2, 12},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  RunSpaceToDepth<DeviceType::CPU>(
+      {1, 3, 3, 3},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26},
+      3,
+      {1, 1, 1, 27},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26});
 }
-TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
+TEST_F(SpaceToDepthOpTest, CPUInputDepth4) {
-  RunSpaceToDepth<DeviceType::GPU>(
+  RunSpaceToDepth<DeviceType::CPU>(
      {1, 2, 4, 4},
      {0, 1, 2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
      2, {1, 1, 2, 16},
      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
-}
-TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
  RunSpaceToDepth<DeviceType::CPU>(
      {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
 }
-TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
+TEST_F(SpaceToDepthOpTest, OPENCLInputDepth1) {
  RunSpaceToDepth<DeviceType::GPU>(
-      {1, 2, 2, 4},
+      {1, 3, 6, 1},
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 2, {1, 1, 1, 16},
+      {0, 1, 2, 9, 10, 11,
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+       3, 4, 5, 12, 13, 14,
+       6, 7, 8, 15, 16, 17},
+      3, {1, 1, 2, 9},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
+}
+TEST_F(SpaceToDepthOpTest, OPENCLInputDepth2) {
+  RunSpaceToDepth<DeviceType::GPU>(
+      {1, 3, 6, 2},
+      {0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23,
+       6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29,
+      12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35},
+      3, {1, 1, 2, 18},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35});
+}
+TEST_F(SpaceToDepthOpTest, OPENCLInputDepth3) {
+  RunSpaceToDepth<DeviceType::GPU>(
+      {1, 2, 4, 3},
+      {0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17,
+       6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23},
+      2, {1, 1, 2, 12},
+      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+       12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  RunSpaceToDepth<DeviceType::GPU>(
+      {1, 3, 3, 3},
+      {0, 1, 2, 3, 4, 5, 6, 7, 8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26},
+      3, {1, 1, 1, 27},
+      {0,  1,  2,  3,  4,  5,  6,  7,  8,
+       9, 10, 11, 12, 13, 14, 15, 16, 17,
+       18, 19, 20, 21, 22, 23, 24, 25, 26});
+}
+TEST_F(SpaceToDepthOpTest, OPENCLInputDepth4) {
+  RunSpaceToDepth<DeviceType::GPU>(
+      {1, 2, 4, 4},
+      {0, 1, 2,  3,  4,  5,  6,  7,  16, 17, 18, 19, 20, 21, 22, 23,
+       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
+      2, {1, 1, 2, 16},
+      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
 }
 namespace {
@@ -141,14 +209,50 @@ void RandomTest(const int block_size,
 }
 }  // namespace
+TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth1) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 1});
+  RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 1});
+  RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 1});
+  RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 1});
+}
+TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth2) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 2});
+  RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 2});
+  RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 2});
+  RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 2});
+}
+TEST_F(SpaceToDepthOpTest, OPENCLRandomFloatDepth3) {
+  RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 3});
+  RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 3});
+  RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 3});
+  RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 3});
+}
 TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
-  RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 32});
+  RandomTest<DeviceType::GPU, float>(2, {1, 384, 384, 4});
+  RandomTest<DeviceType::GPU, float>(3, {1, 333, 333, 16});
+  RandomTest<DeviceType::GPU, float>(5, {1, 100, 100, 32});
+  RandomTest<DeviceType::GPU, float>(7, {1, 98, 98, 64});
 }
 TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
+  RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 1});
+  RandomTest<DeviceType::GPU, half>(3, {1, 333, 333, 2});
+  RandomTest<DeviceType::GPU, half>(5, {1, 100, 100, 3});
+  RandomTest<DeviceType::GPU, half>(7, {1, 98, 98, 4});
  RandomTest<DeviceType::GPU, half>(2, {1, 384, 384, 32});
 }
+TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) {
+  RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 1});
+  RandomTest<DeviceType::GPU, half>(3, {3, 333, 333, 2});
+  RandomTest<DeviceType::GPU, half>(5, {2, 100, 100, 3});
+  RandomTest<DeviceType::GPU, half>(7, {3, 98, 98, 4});
+  RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 32});
+}
 }  // namespace test
 }  // namespace ops
 }  // namespace mace