Commit a5c9db3a authored by 李寅

Implement quantized resize_bilinear, b2s, s2b

Parent df23f428
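The quantized resize path hinges on the new uint8_t specialization of ComputeLerp further down: because bilinear interpolation is a convex combination (the weights sum to 1), the affine quantization parameters cancel out, so the lerp can run directly on the raw uint8 values with float weights and be rounded back, while the output tensor simply reuses the input's scale and zero point. A minimal standalone sketch of that idea (illustrative names, not part of this patch):

#include <cmath>
#include <cstdint>

// Bilinear lerp of four quantized neighbors that share one scale/zero_point.
// The integer subtractions promote to int, so negative differences are fine.
inline uint8_t LerpU8(uint8_t top_left, uint8_t top_right,
                      uint8_t bottom_left, uint8_t bottom_right,
                      float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
}
// Example: LerpU8(10, 20, 30, 40, 0.5f, 0.5f) == 25.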
@@ -35,12 +35,12 @@ struct BatchToSpaceFunctorBase : OpKernel {
BatchToSpaceFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
@@ -94,11 +94,11 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
@@ -107,8 +107,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(batch_tensor);
Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
@@ -129,8 +129,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
@@ -144,34 +144,34 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
std::min(
block_h + block_h_size,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
w += block_shape_w;
} // w
h += block_shape_h;
@@ -184,6 +184,93 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
}
};
template<>
struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
BatchToSpaceFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(space_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(batch_tensor);
Tensor::MappingGuard output_guard(space_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
space_tensor->SetScale(batch_tensor->scale());
space_tensor->SetZeroPoint(batch_tensor->zero_point());
const uint8_t *input_data = batch_tensor->data<uint8_t>();
uint8_t *output_data = space_tensor->mutable_data<uint8_t>();
index_t in_batches = batch_tensor->dim(0);
index_t in_height = batch_tensor->dim(1);
index_t in_width = batch_tensor->dim(2);
index_t out_batches = space_tensor->dim(0);
index_t out_height = space_tensor->dim(1);
index_t out_width = space_tensor->dim(2);
index_t channels = space_tensor->dim(3);
#pragma omp parallel for
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(in_height,
(out_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(in_width,
(out_width + pad_left
- tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * in_height * in_width * channels;
uint8_t
*output_base = output_data + b * out_height * out_width * channels;
index_t h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t in_h = valid_h_start; in_h < valid_h_end; ++in_h) {
index_t w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t in_w = valid_w_start; in_w < valid_w_end; ++in_w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
channels * sizeof(uint8_t));
w += block_shape_w;
} // w
h += block_shape_h;
} // h
} // b
return MACE_SUCCESS;
}
};
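Unlike the float kernel above, which walks NCHW data and copies one element per (h, w), this quantized kernel works on NHWC data, so all channels of a pixel are contiguous and each pixel can be moved with a single memcpy of `channels` bytes. The batch-to-space index mapping is the same in both kernels; a rough sketch of that mapping (hypothetical helper, not in the patch):

#include <cstdint>

// Each input batch in_b corresponds to output batch in_b % out_batches and a
// block offset tile = in_b / out_batches; input pixel (in_h, in_w) then lands
// at output pixel (in_h * block_h + tile_h - pad_top,
//                  in_w * block_w + tile_w - pad_left).
struct B2SCoord { int64_t b, h, w; };

inline B2SCoord MapBatchToSpace(int64_t in_b, int64_t in_h, int64_t in_w,
                                int64_t out_batches,
                                int64_t block_h, int64_t block_w,
                                int64_t pad_top, int64_t pad_left) {
  const int64_t tile_index = in_b / out_batches;
  const int64_t tile_h = tile_index / block_w;
  const int64_t tile_w = tile_index % block_w;
  return {in_b % out_batches,
          in_h * block_h + tile_h - pad_top,
          in_w * block_w + tile_w - pad_left};
}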
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
...
@@ -61,50 +61,72 @@ inline void ComputeInterpolationWeights(
}
}
template <typename T>
inline T ComputeLerp(const T top_left,
const T top_right,
const T bottom_left,
const T bottom_right,
const float x_lerp,
const float y_lerp);
template <>
inline float ComputeLerp<float>(const float top_left,
const float top_right,
const float bottom_left,
const float bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return top + (bottom - top) * y_lerp;
}
template <>
inline uint8_t ComputeLerp<uint8_t>(const uint8_t top_left,
const uint8_t top_right,
const uint8_t bottom_left,
const uint8_t bottom_right,
const float x_lerp,
const float y_lerp) {
const float top = top_left + (top_right - top_left) * x_lerp;
const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
return static_cast<uint8_t>(roundf(top + (bottom - top) * y_lerp));
}
template <typename T>
inline void ResizeImageNCHW(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T
*channel_input_ptr =
images + (b * channels + c) * in_height * in_width;
T *channel_output_ptr =
output + (b * channels + c) * out_height * out_width;
for (index_t y = 0; y < out_height; ++y) {
const T *y_lower_input_ptr =
channel_input_ptr + ys[y].lower * in_width;
const T *y_upper_input_ptr =
channel_input_ptr + ys[y].upper * in_width;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T top_left = y_lower_input_ptr[xs[x].lower];
const T top_right = y_lower_input_ptr[xs[x].upper];
const T bottom_left = y_upper_input_ptr[xs[x].lower];
const T bottom_right = y_upper_input_ptr[xs[x].upper];
channel_output_ptr[y * out_width + x] =
ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
@@ -114,6 +136,48 @@ inline void ResizeImage(const float *images,
}
}
template <typename T>
inline void ResizeImageNHWC(const T *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
const index_t out_height,
const index_t out_width,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
for (index_t b = 0; b < batch_size; ++b) {
const T *input_base = images + b * channels * in_height * in_width;
T *output_base = output + b * channels * out_height * out_width;
#pragma omp parallel for
for (index_t y = 0; y < out_height; ++y) {
const T
*y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
const T
*y_upper_input_ptr = input_base + ys[y].upper * in_width * channels;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T *top_left = y_lower_input_ptr + xs[x].lower * channels;
const T *top_right = y_lower_input_ptr + xs[x].upper * channels;
const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels;
const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels;
T *output_ptr = output_base + (y * out_width + x) * channels;
for (index_t c = 0; c < channels; ++c) {
output_ptr[c] =
ComputeLerp(top_left[c], top_right[c], bottom_left[c],
bottom_right[c], xs_lerp, ys_lerp);
}
}
}
}
}
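Both resize kernels consume precomputed CachedInterpolation entries (lower/upper source indices plus a float lerp weight) produced by ComputeInterpolationWeights, which this patch leaves unchanged. As a rough sketch of what one such entry holds, assuming the usual `in = out * scale` convention used with CalculateResizeScale (an illustration, not the function's actual body):

#include <algorithm>
#include <cstdint>

struct CachedWeightSketch { int64_t lower, upper; float lerp; };

// Weight for a single output coordinate i, with scale = in_size / out_size
// (align_corners handling omitted for brevity).
inline CachedWeightSketch WeightFor(int64_t i, float scale, int64_t in_size) {
  const float in = i * scale;
  const int64_t lower = static_cast<int64_t>(in);
  return {lower, std::min(lower + 1, in_size - 1), in - lower};
}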
struct ResizeBilinearFunctorBase : OpKernel {
ResizeBilinearFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
@@ -132,11 +196,7 @@ struct ResizeBilinearFunctorBase : OpKernel {
};
template<DeviceType D, typename T>
struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
@@ -159,8 +219,8 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
@@ -181,8 +241,77 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNCHW(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
};
template<DeviceType D>
struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
index_t out_height = out_height_;
index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, out_height, out_width, channels};
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const uint8_t *input_data = input->data<uint8_t>();
uint8_t *output_data = output->mutable_data<uint8_t>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data,
input_data + batch * in_height * in_width * channels ,
output_data);
return MACE_SUCCESS;
}
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
// Compute the cached interpolation weights on the x and y dimensions.
ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data());
ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data());
ResizeImageNHWC(input_data,
batch,
in_height,
in_width,
out_height,
out_width,
channels,
xs,
ys,
output_data);
return MACE_SUCCESS;
}
...
@@ -34,12 +34,12 @@ struct SpaceToBatchFunctorBase : OpKernel {
SpaceToBatchFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()) {
MACE_CHECK(
block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1,
"Block's shape should be 1D, and greater than 1");
MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
}
@@ -100,11 +100,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
@@ -135,7 +135,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t out_width = batch_tensor->dim(3);
index_t block_h_size =
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
@@ -149,27 +149,27 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(block_h,
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
std::min(
block_h + block_h_size,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h));
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const float *input_base =
input_data + (in_b * channels + c) * in_height * in_width;
float *output_base =
output_data + (b * channels + c) * out_height * out_width;
memset(output_base + block_h * out_width,
0,
@@ -184,7 +184,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
output_base[h * out_width + w] =
input_base[in_h * in_width + in_w];
in_w += block_shape_w;
} // w
in_h += block_shape_h;
@@ -197,7 +197,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
memset(output_base + valid_h_end * out_width,
0,
(std::min(out_height, block_h + block_h_size) - valid_h_end)
* out_width * sizeof(float));
} // b
} // block_h
} // c
@@ -205,6 +205,111 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
}
};
template<>
struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: SpaceToBatchFunctorBase(context, paddings, block_shape) {}
MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4, 0);
CalculateSpaceToBatchOutputShape(space_tensor,
DataFormat::NHWC,
output_shape.data());
MACE_RETURN_IF_ERROR(batch_tensor->Resize(output_shape));
int zero_point = space_tensor->zero_point();
Tensor::MappingGuard input_guard(space_tensor);
Tensor::MappingGuard output_guard(batch_tensor);
int pad_top = paddings_[0];
int pad_left = paddings_[2];
int block_shape_h = block_shape_[0];
int block_shape_w = block_shape_[1];
batch_tensor->SetScale(space_tensor->scale());
batch_tensor->SetZeroPoint(space_tensor->zero_point());
const uint8_t *input_data = space_tensor->data<uint8_t>();
uint8_t *output_data = batch_tensor->mutable_data<uint8_t>();
index_t in_batches = space_tensor->dim(0);
index_t in_height = space_tensor->dim(1);
index_t in_width = space_tensor->dim(2);
index_t out_batches = batch_tensor->dim(0);
index_t out_height = batch_tensor->dim(1);
index_t out_width = batch_tensor->dim(2);
index_t channels = batch_tensor->dim(3);
#pragma omp parallel for
for (index_t b = 0; b < out_batches; ++b) {
const index_t in_b = b % in_batches;
const index_t tile_index = b / in_batches;
const index_t tile_h = tile_index / block_shape_w;
const index_t tile_w = tile_index % block_shape_w;
const index_t valid_h_start = std::max(static_cast<index_t>(0),
(pad_top - tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_h_end = std::min(out_height,
(in_height + pad_top
- tile_h
+ block_shape_h - 1)
/ block_shape_h);
const index_t valid_w_start = std::max(static_cast<index_t>(0),
(pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const index_t valid_w_end = std::min(out_width,
(in_width + pad_left - tile_w
+ block_shape_w - 1)
/ block_shape_w);
const uint8_t *input_base =
input_data + in_b * channels * in_height * in_width;
uint8_t *output_base =
output_data + b * channels * out_height * out_width;
memset(output_base,
zero_point,
valid_h_start * out_width * channels * sizeof(uint8_t));
index_t in_h = valid_h_start * block_shape_h + tile_h - pad_top;
for (index_t h = valid_h_start; h < valid_h_end; ++h) {
memset(output_base + h * out_width * channels,
zero_point,
valid_w_start * channels * sizeof(uint8_t));
index_t
in_w = valid_w_start * block_shape_w + tile_w - pad_left;
for (index_t w = valid_w_start; w < valid_w_end; ++w) {
memcpy(output_base + (h * out_width + w) * channels,
input_base + (in_h * in_width + in_w) * channels,
sizeof(uint8_t) * channels);
in_w += block_shape_w;
} // w
in_h += block_shape_h;
memset(output_base + (h * out_width + valid_w_end) * channels,
zero_point,
(out_width - valid_w_end) * channels * sizeof(uint8_t));
} // h
memset(output_base + valid_h_end * out_width * channels,
zero_point,
(out_height - valid_h_end) * out_width * channels
* sizeof(uint8_t));
} // b
return MACE_SUCCESS;
}
};
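One detail worth noting: where the float space-to-batch kernel above memsets its padding to 0, this quantized version fills padding with the input's zero_point. Under the affine quantization scheme real = scale * (q - zero_point), the uint8 encoding of the real value 0.0 is exactly zero_point, so a literal 0 byte would decode to a non-zero (typically negative) real value. A small sketch of that relationship (helper name is illustrative):

#include <cstdint>

inline float Dequantize(uint8_t q, float scale, int32_t zero_point) {
  // real = scale * (q - zero_point); q == zero_point therefore decodes to 0.0f.
  return scale * (static_cast<int32_t>(q) - zero_point);
}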
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
...
@@ -23,6 +23,11 @@ void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::GPU)
...
@@ -28,6 +28,11 @@ void Register_Concat(OperatorRegistryBase *op_registry) {
.TypeConstraint<int32_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, int32_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ConcatOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::GPU)
...
@@ -162,11 +162,13 @@ class OpsTestNet {
std::fill(input_data, input_data + input->size(), data);
}
template<DeviceType D, typename T>
void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape,
bool positive = true,
bool truncate = false,
const float truncate_min = 0.001f,
const float truncate_max = 100.f) {
Tensor *input =
ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
@@ -180,21 +182,22 @@ class OpsTestNet {
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(
input_data, input_data + input->size(),
[&gen, &nd, positive, truncate, truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return half_float::half_cast<half>(positive ? std::abs(d) : d);
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive, truncate,
truncate_min, truncate_max] {
float d = nd(gen);
if (truncate) {
if (std::abs(d) > truncate_max) d = truncate_max;
if (std::abs(d) < truncate_min) d = truncate_min;
}
return (positive ? std::abs(d) : d);
});
...
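The two new trailing parameters of AddRandomInput default to the bounds that were previously hard-coded (0.001f and 100.f), so existing call sites keep their behavior; the quantization tests added below override them so the random values are clamped to a small magnitude, e.g.:

net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);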
@@ -24,6 +24,12 @@ void Register_ResizeBilinear(OperatorRegistryBase *op_registry) {
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
ResizeBilinearOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::GPU)
...
@@ -36,8 +36,15 @@ void ResizeBilinearBenchmark(int iters,
// Add input data
if (D == DeviceType::CPU) {
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height,
input_width});
} else {
net.AddRandomInput<D, uint8_t>("Input",
{batch, input_height, input_width,
channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
@@ -99,6 +106,7 @@ void ResizeBilinearBenchmark(int iters,
#define MACE_BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1) \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, uint8_t, CPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
MACE_BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);
...
@@ -140,12 +140,89 @@ void TestRandomResizeBilinear() {
1e-6);
}
}
void TestQuantizedResizeBilinear() {
testing::internal::LogToStderr();
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
int batch = 1 + rand_r(&seed) % 5;
int channels = 1 + rand_r(&seed) % 100;
int height = 1 + rand_r(&seed) % 100;
int width = 1 + rand_r(&seed) % 100;
int in_height = 1 + rand_r(&seed) % 100;
int in_width = 1 + rand_r(&seed) % 100;
int align_corners = rand_r(&seed) % 1;
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, in_height, in_width, channels},
false,
true,
-1.f,
1.f);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
}
} // namespace
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::GPU>();
}
TEST_F(ResizeBilinearTest, QuantizedResizeBilinear) {
TestQuantizedResizeBilinear();
}
} // namespace test
} // namespace ops
} // namespace mace
@@ -23,6 +23,11 @@ void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) {
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::GPU)
...
@@ -28,7 +28,13 @@ void BMSpaceToBatch(
OpsTestNet net;
if (D == DeviceType::CPU) {
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Input", {batch, channels, height, width});
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Input", {batch, height, width, channels});
}
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
@@ -39,6 +45,7 @@ void BMSpaceToBatch(
.Output("Output")
.AddIntsArg("paddings", {shape, shape, shape, shape})
.AddIntsArg("block_shape", {shape, shape})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
@@ -78,7 +85,8 @@ void BMSpaceToBatch(
#define MACE_BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU); \
MACE_BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, uint8_t, CPU);
MACE_BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
MACE_BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
...
@@ -147,62 +147,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
RunBatchToSpace<DeviceType::CPU>(batch_shape, batch_data, block_data,
padding_data, space_tensor.get());
}
} // namespace
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
{1, 0, 1, 0}, {9, 1, 1, 1},
{0, 0, 0, 0, 1, 2, 0, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionalTransform<float>(
{1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
{0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
}
TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
TestBidirectionalTransform<float>(
{1, 2, 10, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
{2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
}
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionalTransform<float>(
{1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionalTransform<float>(
{1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
@@ -278,6 +223,173 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
*net.GetOutput("OutputGPU"));
}
void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &padding_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("paddings", padding_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
const std::vector<int> &block_shape_data,
const std::vector<int> &crops_data) {
OpsTestNet net;
net.AddRandomInput<CPU, float>("Input", input_shape, false, true, -1.f, 1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("crops", crops_data)
.AddIntsArg("block_shape", block_shape_data)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
} // namespace
TEST(SpaceToBatchTest, SmallData) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 1}, {1, 2, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithOnePadding) {
TestBidirectionalTransform<float>({1, 2, 2, 1}, {1, 2, 3, 4}, {3, 3},
{1, 0, 1, 0}, {9, 1, 1, 1},
{0, 0, 0, 0, 1, 2, 0, 3, 4});
}
TEST(SpaceToBatchTest, SmallDataWithTwoPadding) {
TestBidirectionalTransform<float>(
{1, 2, 2, 1}, {1, 2, 3, 4}, {2, 2}, {1, 1, 1, 1}, {4, 2, 2, 1},
{0, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0, 0});
}
TEST(SpaceToBatchTest, SmallDataWithLargeImage) {
TestBidirectionalTransform<float>(
{1, 2, 10, 1},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
{2, 2}, {0, 0, 0, 0}, {4, 1, 5, 1},
{1, 3, 5, 7, 9, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 12, 14, 16, 18, 20});
}
TEST(SpaceToBatchTest, MultiChannelData) {
TestBidirectionalTransform<float>(
{1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 2},
{0, 0, 0, 0}, {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
}
TEST(SpaceToBatchTest, LargerMultiChannelData) {
TestBidirectionalTransform<float>(
{1, 4, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {4, 2, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 1},
{1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16});
}
TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
TestBidirectionalTransform<float>(
{2, 2, 4, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
{2, 2}, {0, 0, 0, 0}, {8, 1, 2, 2},
{1, 2, 5, 6, 17, 18, 21, 22, 3, 4, 7, 8, 19, 20, 23, 24,
9, 10, 13, 14, 25, 26, 29, 30, 11, 12, 15, 16, 27, 28, 31, 32});
}
TEST(SpaceToBatchTest, LargeData) {
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchLargeInput({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
@@ -285,6 +397,13 @@ TEST(SpaceToBatchTest, LargeData) {
TestoBatchToSpaceLargeInput({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
TEST(SpaceToBatchTest, Quantize) {
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {0, 0, 0, 0});
TestSpaceToBatchQuantize({1, 256, 256, 32}, {8, 8}, {4, 4, 4, 4});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {0, 0, 0, 0});
TestoBatchToSpaceQuantize({64, 32, 32, 32}, {8, 8}, {4, 4, 4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace