diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 90103b4fc1ccd9203005d9a8fe61915a69b5f652..46a0e0318b26c24cb230d32b2eea41751a55a012 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <vector>
+#include <algorithm>
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
@@ -160,10 +161,16 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
     Tensor::MappingGuard input_guard(space_tensor);
     Tensor::MappingGuard output_guard(batch_tensor);
 
+    int pad_top = paddings_[0];
+    int pad_left = paddings_[2];
+    int block_shape_h = block_shape_[0];
+    int block_shape_w = block_shape_[1];
+
     if (b2s_) {
       const float *input_data = batch_tensor->data<float>();
       float *output_data = space_tensor->mutable_data<float>();
+      index_t in_batches = batch_tensor->dim(0);
       index_t in_height = batch_tensor->dim(2);
       index_t in_width = batch_tensor->dim(3);
@@ -172,26 +179,58 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = space_tensor->dim(2);
       index_t out_width = space_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = (h + paddings_[0]) / block_shape_[0];
-            const index_t tile_h = (h + paddings_[0]) % block_shape_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = (w + paddings_[2]) / block_shape_[1];
-              const index_t tile_w = (w + paddings_[2]) % block_shape_[1];
-              const index_t
-                  in_b = (tile_h * block_shape_[1] + tile_w) * out_batches + b;
-              output_data[((b * channels + c) * out_height + h) * out_width
-                  + w] =
-                  input_data[
-                      ((in_b * channels + c) * in_height + in_h) * in_width
-                          + in_w];
-            }
-          }
-        }
-      }
+      // 32k / sizeof(float) / out_width / block_shape_w
+      index_t block_h_size =
+          std::max(static_cast<index_t>(1),
+                   8 * 1024 / block_shape_w / out_width);
+
+      // make channel the outer loop so we can make best use of the cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < in_height;
+             block_h += block_h_size) {
+          for (index_t in_b = 0; in_b < in_batches; ++in_b) {
+            const index_t b = in_b % out_batches;
+            const index_t tile_index = in_b / out_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
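+            // clip the copy (ceiling division) to input coordinates whose
+            // mapped output positions land inside this height block and
+            // inside the un-padded output region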
+            const index_t valid_h_start =
+                std::max(block_h,
+                         (pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h);
+            const index_t valid_h_end =
+                std::min(in_height,
+                         std::min(block_h + block_h_size,
+                                  (out_height + pad_top - tile_h
+                                      + block_shape_h - 1) / block_shape_h));
+            const index_t valid_w_start =
+                std::max(static_cast<index_t>(0),
+                         (pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const index_t valid_w_end =
+                std::min(in_width,
+                         (out_width + pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
+            index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
+              index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                w += block_shape_w;
+              } // w
+              h += block_shape_h;
+            } // h
+          } // b
+        } // block_h
+      } // c
     } else {
       const float *input_data = space_tensor->data<float>();
       float *output_data = batch_tensor->mutable_data<float>();
@@ -205,31 +244,73 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
       index_t out_height = batch_tensor->dim(2);
       index_t out_width = batch_tensor->dim(3);
 
-#pragma omp parallel for collapse(2)
-      for (index_t b = 0; b < out_batches; ++b) {
-        for (index_t c = 0; c < channels; ++c) {
-          const index_t in_b = b % in_batches;
-          const index_t tile_h = b / in_batches / block_shape_[1];
-          const index_t tile_w = b / in_batches % block_shape_[1];
-          for (index_t h = 0; h < out_height; ++h) {
-            const index_t in_h = h * block_shape_[0] + tile_h - paddings_[0];
-            for (index_t w = 0; w < out_width; ++w) {
-              const index_t in_w = w * block_shape_[1] + tile_w - paddings_[2];
-              if (in_h >= 0 && in_w >= 0 && in_h < in_height
-                  && in_w < in_width) {
-                output_data[((b * channels + c) * out_height + h) * out_width
-                    + w] =
-                    input_data[
-                        ((in_b * channels + c) * in_height + in_h) * in_width
-                            + in_w];
-              } else {
-                output_data[((b * channels + c) * out_height + h) * out_width
-                    + w] = 0;
-              }
-            }
-          }
-        }
-      }
+      index_t block_h_size =
+          std::max(static_cast<index_t>(1),
+                   8 * 1024 / block_shape_w / in_width);
+
+      // make channel the outer loop so we can make best use of the cache
+#pragma omp parallel for collapse(3)
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t block_h = 0; block_h < out_height;
+             block_h += block_h_size) {
+          for (index_t b = 0; b < out_batches; ++b) {
+            const index_t in_b = b % in_batches;
+            const index_t tile_index = b / in_batches;
+            const index_t tile_h = tile_index / block_shape_w;
+            const index_t tile_w = tile_index % block_shape_w;
+            const index_t valid_h_start =
+                std::max(block_h,
+                         (pad_top - tile_h + block_shape_h - 1)
+                             / block_shape_h);
+            const index_t valid_h_end =
+                std::min(out_height,
+                         std::min(block_h + block_h_size,
+                                  (in_height + pad_top - tile_h
+                                      + block_shape_h - 1) / block_shape_h));
+            const index_t valid_w_start =
+                std::max(static_cast<index_t>(0),
+                         (pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const index_t valid_w_end =
+                std::min(out_width,
+                         (in_width + pad_left - tile_w + block_shape_w - 1)
+                             / block_shape_w);
+            const float *input_base =
+                input_data + (in_b * channels + c) * in_height * in_width;
+            float *output_base =
+                output_data + (b * channels + c) * out_height * out_width;
+
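+            // zero-fill output rows/columns that map outside the padded
+            // input; copy only the valid interior, striding by block_shape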
+            memset(output_base + block_h * out_width, 0,
+                   (valid_h_start - block_h) * out_width * sizeof(float));
+
+            index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
+            for (index_t h = valid_h_start; h < valid_h_end; ++h) {
+              memset(output_base + h * out_width, 0,
+                     valid_w_start * sizeof(float));
+
+              index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
+              for (index_t w = valid_w_start; w < valid_w_end; ++w) {
+                output_base[h * out_width + w] =
+                    input_base[in_h * in_width + in_w];
+                in_w += block_shape_w;
+              } // w
+              in_h += block_shape_h;
+
+              memset(output_base + h * out_width + valid_w_end, 0,
+                     (out_width - valid_w_end) * sizeof(float));
+            } // h
+
+            memset(output_base + valid_h_end * out_width, 0,
+                   (std::min(out_height, block_h + block_h_size) - valid_h_end)
+                       * out_width * sizeof(float));
+          } // b
+        } // block_h
+      } // c
     }
   }
 };
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index 5cfe7015cf531d73c22af4f0ac6ccc3b3292d9bc..fdb7331706afc7635f103bf02e29b30ca29c8cbb 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -27,16 +27,29 @@ void BMBatchToSpace(
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("crops", {0, 0, 0, 0})
-      .AddIntsArg("block_shape", {arg, arg})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("crops", {0, 0, 0, 0})
+        .AddIntsArg("block_shape", {arg, arg})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -63,11 +76,13 @@ void BMBatchToSpace(
   BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
 
 #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
 
 BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
 BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
 BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
+BM_BATCH_TO_SPACE(64, 32, 32, 32, 8);
 
 } // namespace test
 } // namespace ops
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index ff3ee74925bcbafe037c7af4e732aeb861e0a1e9..524a32cec2849657c74d37453047e33909cfc6f5 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -27,17 +27,29 @@ void BMSpaceToBatch(
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-
-  BufferToImage<D, float>(&net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .AddIntsArg("paddings", {shape, shape, shape, shape})
-      .AddIntsArg("block_shape", {shape, shape})
-      .Finalize(net.NewOperatorDef());
+  if (D == DeviceType::CPU) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+  } else if (D == DeviceType::GPU) {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  }
+  if (D == DeviceType::CPU) {
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("Input")
+        .Output("Output")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, float>(&net, "Input", "InputImage",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddIntsArg("paddings", {shape, shape, shape, shape})
+        .AddIntsArg("block_shape", {shape, shape})
+        .Finalize(net.NewOperatorDef());
+  }
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
@@ -65,11 +77,14 @@ void BMSpaceToBatch(
       BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
 
 #define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
 
 BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
+BM_SPACE_TO_BATCH(1, 256, 256, 16, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
+BM_SPACE_TO_BATCH(1, 256, 256, 32, 8);
 
 } // namespace test
 } // namespace ops
diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc
index b464c41537e825e1cead227b29ed663582686dee..cca39bafc75a082eab41db042501795e2f3de282 100644
--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -213,6 +213,96 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
                      9, 10, 13, 14, 25, 26, 29, 30,
                      11, 12, 15, 16, 27, 28, 31, 32});
 }
 
+void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
+                                const std::vector<int> &block_shape_data,
+                                const std::vector<int> &padding_data) {
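+  // run SpaceToBatchND on GPU (NHWC image) and CPU (NCHW buffer) with the
+  // same random input, then check that the two outputs match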
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OutputGPU",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+                                                  "InputNCHW", NCHW);
+  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("paddings", padding_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+void TestBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
+                                const std::vector<int> &block_shape_data,
+                                const std::vector<int> &crops_data) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
+
+  // run gpu
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputImage")
+      .Output("OutputImage")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(GPU);
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OutputGPU",
+                                        kernels::BufferType::IN_OUT_CHANNEL);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+                                                  "InputNCHW", NCHW);
+  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
+      .Input("InputNCHW")
+      .Output("OutputNCHW")
+      .AddIntsArg("crops", crops_data)
+      .AddIntsArg("block_shape", block_shape_data)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+                                                  "OutputCPU", NHWC);
+
+  // Check
+  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
+                          *net.GetOutput("OutputGPU"));
+}
+
+TEST(SpaceToBatchTest, LargeData) {
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
+  TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
+  TestBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
+  TestBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
+}
+
 } // namespace test
 } // namespace ops
 } // namespace mace