diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc
index ba87830a9038ac2c791787a148b114d0a5c0c8f6..6efa4d24566972164fd39d848d037f8c850e12e2 100644
--- a/mace/ops/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -25,7 +25,10 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class DepthToSpaceOp : public Operation {
+class DepthToSpaceOp;
+
+template<>
+class DepthToSpaceOp<DeviceType::CPU, float> : public Operation {
  public:
   explicit DepthToSpaceOp(OpConstructContext *context)
       : Operation(context),
@@ -55,8 +58,8 @@ class DepthToSpaceOp : public Operation {
 
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
-    const T *input_ptr = input->data<T>();
-    T *output_ptr = output->mutable_data<T>();
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
 
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < output_depth; ++d) {
@@ -89,6 +92,73 @@ class DepthToSpaceOp : public Operation {
   const int block_size_;
 };
 
+#ifdef MACE_ENABLE_QUANTIZE
+template<>
+class DepthToSpaceOp<DeviceType::CPU, uint8_t> : public Operation {
+ public:
+  explicit DepthToSpaceOp(OpConstructContext *context)
+      : Operation(context),
+        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(3);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+               "input depth should be dividable by block_size * block_size",
+               input_depth);
+
+    const index_t output_depth = input_depth / (block_size_ * block_size_);
+    const index_t output_width = input_width * block_size_;
+    const index_t output_height = input_height * block_size_;
+    std::vector<index_t>
+        output_shape = {batch_size, output_height, output_width, output_depth};
+
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard logits_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const uint8_t *input_ptr = input->data<uint8_t>();
+    uint8_t *output_ptr = output->mutable_data<uint8_t>();
+
+    for (index_t b = 0; b < batch_size; ++b) {
+      for (index_t h = 0; h < output_height; ++h) {
+        const index_t in_h = h / block_size_;
+        const index_t offset_h = (h % block_size_);
+        for (int w = 0; w < output_width; ++w) {
+          const index_t in_w = w / block_size_;
+          const index_t offset_w = w % block_size_;
+          const index_t offset_d =
+              (offset_h * block_size_ + offset_w) * output_depth;
+
+          for (index_t d = 0; d < output_depth; ++d) {
+            const index_t in_d = d + offset_d;
+            const index_t o_index =
+                ((b * output_height + h) * output_width + w) * output_depth
+                    + d;
+            const index_t i_index =
+                ((b * input_height + in_h) * input_width + in_w) * input_depth
+                    + in_d;
+            output_ptr[o_index] = input_ptr[i_index];
+          }
+        }
+      }
+    }
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  const int block_size_;
+};
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 template<>
 class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
@@ -118,6 +188,11 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp,
                    DeviceType::CPU, float);
 
+#ifdef MACE_ENABLE_QUANTIZE
+  MACE_REGISTER_OP(op_registry, "DepthToSpace",
+                   DepthToSpaceOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
   MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
 }
 
diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc
index d9b5473629da962985261bc955dc591ef4b3a0f7..59c1a342162d0637f8e2d30b33c9b1835fac61f5 100644
--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -25,7 +25,10 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class SpaceToDepthOp : public Operation {
+class SpaceToDepthOp;
+
+template<>
+class SpaceToDepthOp<DeviceType::CPU, float> : public Operation {
  public:
   explicit SpaceToDepthOp(OpConstructContext *context)
       : Operation(context),
@@ -55,8 +58,8 @@ class SpaceToDepthOp : public Operation {
 
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
-    const T *input_ptr = input->data<T>();
-    T *output_ptr = output->mutable_data<T>();
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
 
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < input_depth; ++d) {
@@ -87,6 +90,71 @@ class SpaceToDepthOp : public Operation {
   const int block_size_;
 };
 
+#ifdef MACE_ENABLE_QUANTIZE
+template<>
+class SpaceToDepthOp<DeviceType::CPU, uint8_t> : public Operation {
+ public:
+  explicit SpaceToDepthOp(OpConstructContext *context)
+      : Operation(context),
+        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(3);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    MACE_CHECK(
+        (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
+        "input width and height should be dividable by block_size");
+
+    const index_t output_depth = input_depth * block_size_ * block_size_;
+    const index_t output_width = input_width / block_size_;
+    const index_t output_height = input_height / block_size_;
+    std::vector<index_t>
+        output_shape = {batch_size, output_height, output_width, output_depth};
+
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard logits_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const uint8_t *input_ptr = input->data<uint8_t>();
+    uint8_t *output_ptr = output->mutable_data<uint8_t>();
+
+    for (index_t b = 0; b < batch_size; ++b) {
+      for (index_t h = 0; h < input_height; ++h) {
+        const index_t out_h = h / block_size_;
+        const index_t offset_h = (h % block_size_);
+        for (index_t w = 0; w < input_width; ++w) {
+          const index_t out_w = w / block_size_;
+          const index_t offset_w = (w % block_size_);
+          const index_t offset_d =
+              (offset_h * block_size_ + offset_w) * input_depth;
+
+          for (index_t d = 0; d < input_depth; ++d) {
+            const index_t out_d = d + offset_d;
+            const index_t o_index =
+                ((b * output_height + out_h) * output_width + out_w)
+                    * output_depth + out_d;
+            const index_t i_index =
+                ((b * input_height + h) * input_width + w) * input_depth + d;
+            output_ptr[o_index] = input_ptr[i_index];
+          }
+        }
+      }
+    }
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  const int block_size_;
+};
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 template<>
 class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
@@ -116,6 +184,11 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp,
                    DeviceType::CPU, float);
 
+#ifdef MACE_ENABLE_QUANTIZE
+  MACE_REGISTER_OP(op_registry, "SpaceToDepth",
+                   SpaceToDepthOp, DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
   MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
 }
 
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 1f2e986d5c5b04a64bc9b5a5395716ccee5b2e28..e9559861220df330ad55459577b6bbf8ce301e38 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1819,7 +1819,9 @@ class Transformer(base_converter.ConverterInterface):
                         MaceOp.Reshape.name,
                         MaceOp.ResizeBilinear.name,
                         MaceOp.BatchToSpaceND.name,
-                        MaceOp.SpaceToBatchND.name]:
+                        MaceOp.SpaceToBatchND.name,
+                        MaceOp.SpaceToDepth.name,
+                        MaceOp.DepthToSpace.name]:
             del op.quantize_info[:]
             producer_op = self._producer[op.input[0]]
             if producer_op.output[0] in self._option.input_nodes:
diff --git a/test/ccunit/mace/ops/depth_to_space_test.cc b/test/ccunit/mace/ops/depth_to_space_test.cc
index 3bf32efa3c849f3c25a872fe1c989c18c872d037..f093bae6e09a8cb64095bbdaff923dc41966f11d 100644
--- a/test/ccunit/mace/ops/depth_to_space_test.cc
+++ b/test/ccunit/mace/ops/depth_to_space_test.cc
@@ -262,6 +262,76 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) {
   RandomTest<half>(2, {2, 384, 384, 8});
 }
 
+namespace {
+
+void TestDepthToSpaceQuantize(const int block_size,
+                              const std::vector<index_t> &shape) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                             shape,
+                                             false,
+                                             false,
+                                             true,
+                                             -1.f,
+                                             1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+
+  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
+      .Input("InputNCHW")
+      .AddIntArg("block_size", block_size)
+      .Output("OutputNCHW")
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntArg("block_size", block_size)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+}  // namespace
+
+TEST_F(DepthToSpaceOpTest, Quantize) {
+  TestDepthToSpaceQuantize(2, {1, 192, 192, 4});
+  TestDepthToSpaceQuantize(3, {1, 111, 111, 9});
+  TestDepthToSpaceQuantize(5, {1, 20, 20, 25});
+  TestDepthToSpaceQuantize(7, {1, 14, 14, 49});
+}
+
 }  // namespace test
 }  // namespace ops
diff --git a/test/ccunit/mace/ops/space_to_depth_test.cc b/test/ccunit/mace/ops/space_to_depth_test.cc
index 226083b71344ffdbe22266b30e53f333cfc2d8fc..7a7091cad5a5918524f50cd0dc8c38b3446e33fa 100644
--- a/test/ccunit/mace/ops/space_to_depth_test.cc
+++ b/test/ccunit/mace/ops/space_to_depth_test.cc
@@ -253,6 +253,77 @@ TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) {
   RandomTest<half>(2, {2, 384, 384, 32});
 }
 
+namespace {
+
+void TestSpaceToDepthQuantize(int block_size,
+                              const std::vector<index_t> &shape) {
+  OpsTestNet net;
+  net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                             shape,
+                                             false,
+                                             false,
+                                             true,
+                                             -1.f,
+                                             1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+
+  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
+      .Input("InputNCHW")
+      .AddIntArg("block_size", block_size)
+      .Output("OutputNCHW")
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntArg("block_size", block_size)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+TEST_F(SpaceToDepthOpTest, Quantize) {
+  TestSpaceToDepthQuantize(2, {1, 384, 384, 1});
+  TestSpaceToDepthQuantize(3, {1, 333, 333, 1});
+  TestSpaceToDepthQuantize(5, {1, 100, 100, 1});
+  TestSpaceToDepthQuantize(7, {1, 98, 98, 1});
+}
+
+}  // namespace
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace