Commit 9dc60eb1
Authored May 29, 2019 by liyin
Add quantized space to depth and depth to space
Parent: 2b820d8b

Showing 5 changed files with 298 additions and 7 deletions (+298 -7)
mace/ops/depth_to_space.cc                         +78 -3
mace/ops/space_to_depth.cc                         +76 -3
mace/python/tools/converter_tool/transformer.py    +3  -1
test/ccunit/mace/ops/depth_to_space_test.cc        +70 -0
test/ccunit/mace/ops/space_to_depth_test.cc        +71 -0

mace/ops/depth_to_space.cc

@@ -25,7 +25,10 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class DepthToSpaceOp : public Operation {
+class DepthToSpaceOp;
+
+template<>
+class DepthToSpaceOp<CPU, float> : public Operation {
  public:
   explicit DepthToSpaceOp(OpConstructContext *context)
       : Operation(context),
@@ -55,8 +58,8 @@ class DepthToSpaceOp : public Operation {
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
-    const T *input_ptr = input->data<T>();
-    T *output_ptr = output->mutable_data<T>();
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
 
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < output_depth; ++d) {
@@ -89,6 +92,73 @@ class DepthToSpaceOp : public Operation {
   const int block_size_;
 };
 
+#ifdef MACE_ENABLE_QUANTIZE
+template<>
+class DepthToSpaceOp<CPU, uint8_t> : public Operation {
+ public:
+  explicit DepthToSpaceOp(OpConstructContext *context)
+      : Operation(context),
+        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
+
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(3);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+               "input depth should be dividable by block_size * block_size",
+               input_depth);
+
+    const index_t output_depth = input_depth / (block_size_ * block_size_);
+    const index_t output_width = input_width * block_size_;
+    const index_t output_height = input_height * block_size_;
+
+    std::vector<index_t> output_shape = {batch_size, output_height,
+                                         output_width, output_depth};
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard logits_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const uint8_t *input_ptr = input->data<uint8_t>();
+    uint8_t *output_ptr = output->mutable_data<uint8_t>();
+
+    for (index_t b = 0; b < batch_size; ++b) {
+      for (index_t h = 0; h < output_height; ++h) {
+        const index_t in_h = h / block_size_;
+        const index_t offset_h = (h % block_size_);
+        for (int w = 0; w < output_width; ++w) {
+          const index_t in_w = w / block_size_;
+          const index_t offset_w = w % block_size_;
+          const index_t offset_d =
+              (offset_h * block_size_ + offset_w) * output_depth;
+          for (index_t d = 0; d < output_depth; ++d) {
+            const index_t in_d = d + offset_d;
+            const index_t o_index =
+                ((b * output_height + h) * output_width + w) * output_depth + d;
+            const index_t i_index =
+                ((b * input_height + in_h) * input_width + in_w) * input_depth
+                    + in_d;
+            output_ptr[o_index] = input_ptr[i_index];
+          }
+        }
+      }
+    }
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  const int block_size_;
+};
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 template<>
 class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
@@ -118,6 +188,11 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "DepthToSpace",
                    DepthToSpaceOp, DeviceType::CPU, float);
 
+#ifdef MACE_ENABLE_QUANTIZE
+  MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp,
+                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
   MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
 }
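
Note: the new uint8_t specialization performs exactly the same index permutation as the float path; because it only moves values, no requantization is involved and the output can keep the input's quantization parameters. For reference, a minimal NumPy sketch of the NHWC depth-to-space mapping implemented by the nested loops above (the function name and the NumPy dependency are illustrative, not part of the commit):

import numpy as np

def depth_to_space_nhwc(x, block_size):
    # Same mapping as the loops in DepthToSpaceOp<CPU, uint8_t>:
    # out[b, h, w, d] = in[b, h / bs, w / bs, ((h % bs) * bs + w % bs) * out_c + d]
    n, h, w, c = x.shape
    assert c % (block_size * block_size) == 0
    out_c = c // (block_size * block_size)
    x = x.reshape(n, h, w, block_size, block_size, out_c)
    x = x.transpose(0, 1, 3, 2, 4, 5)      # n, h, block_h, w, block_w, out_c
    return x.reshape(n, h * block_size, w * block_size, out_c)

q = np.random.randint(0, 256, size=(1, 4, 4, 8), dtype=np.uint8)
print(depth_to_space_nhwc(q, 2).shape)     # (1, 8, 8, 2)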

mace/ops/space_to_depth.cc

@@ -25,7 +25,10 @@ namespace mace {
 namespace ops {
 
 template <DeviceType D, class T>
-class SpaceToDepthOp : public Operation {
+class SpaceToDepthOp;
+
+template<>
+class SpaceToDepthOp<CPU, float> : public Operation {
  public:
   explicit SpaceToDepthOp(OpConstructContext *context)
       : Operation(context),
@@ -55,8 +58,8 @@ class SpaceToDepthOp : public Operation {
     Tensor::MappingGuard logits_guard(input);
     Tensor::MappingGuard output_guard(output);
-    const T *input_ptr = input->data<T>();
-    T *output_ptr = output->mutable_data<T>();
+    const float *input_ptr = input->data<float>();
+    float *output_ptr = output->mutable_data<float>();
 
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < input_depth; ++d) {
@@ -87,6 +90,71 @@ class SpaceToDepthOp : public Operation {
   const int block_size_;
 };
 
+#ifdef MACE_ENABLE_QUANTIZE
+template<>
+class SpaceToDepthOp<CPU, uint8_t> : public Operation {
+ public:
+  explicit SpaceToDepthOp(OpConstructContext *context)
+      : Operation(context),
+        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}
+
+  MaceStatus Run(OpContext *context) override {
+    MACE_UNUSED(context);
+    const Tensor *input = this->Input(0);
+    Tensor *output = this->Output(0);
+    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
+
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(3);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    MACE_CHECK((input_width % block_size_ == 0) &&
+                   (input_height % block_size_ == 0),
+               "input width and height should be dividable by block_size");
+
+    const index_t output_depth = input_depth * block_size_ * block_size_;
+    const index_t output_width = input_width / block_size_;
+    const index_t output_height = input_height / block_size_;
+
+    std::vector<index_t> output_shape = {batch_size, output_height,
+                                         output_width, output_depth};
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+    Tensor::MappingGuard logits_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const uint8_t *input_ptr = input->data<uint8_t>();
+    uint8_t *output_ptr = output->mutable_data<uint8_t>();
+
+    for (index_t b = 0; b < batch_size; ++b) {
+      for (index_t h = 0; h < input_height; ++h) {
+        const index_t out_h = h / block_size_;
+        const index_t offset_h = (h % block_size_);
+        for (index_t w = 0; w < input_width; ++w) {
+          const index_t out_w = w / block_size_;
+          const index_t offset_w = (w % block_size_);
+          const index_t offset_d =
+              (offset_h * block_size_ + offset_w) * input_depth;
+          for (index_t d = 0; d < input_depth; ++d) {
+            const index_t out_d = d + offset_d;
+            const index_t o_index =
+                ((b * output_height + out_h) * output_width + out_w)
+                    * output_depth + out_d;
+            const index_t i_index =
+                ((b * input_height + h) * input_width + w) * input_depth + d;
+            output_ptr[o_index] = input_ptr[i_index];
+          }
+        }
+      }
+    }
+
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+ private:
+  const int block_size_;
+};
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 template<>
 class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
@@ -116,6 +184,11 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "SpaceToDepth",
                    SpaceToDepthOp, DeviceType::CPU, float);
 
+#ifdef MACE_ENABLE_QUANTIZE
+  MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp,
+                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
+
   MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
 }
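
Note: the uint8_t SpaceToDepthOp is the inverse permutation of the DepthToSpace kernel above, with a width/height divisibility check in place of the depth check. A companion NumPy sketch under the same assumptions as the earlier one:

import numpy as np

def space_to_depth_nhwc(x, block_size):
    # Same mapping as the loops in SpaceToDepthOp<CPU, uint8_t>:
    # out[b, h / bs, w / bs, ((h % bs) * bs + w % bs) * c + d] = in[b, h, w, d]
    n, h, w, c = x.shape
    assert h % block_size == 0 and w % block_size == 0
    x = x.reshape(n, h // block_size, block_size, w // block_size, block_size, c)
    x = x.transpose(0, 1, 3, 2, 4, 5)      # n, out_h, out_w, block_h, block_w, c
    return x.reshape(n, h // block_size, w // block_size,
                     c * block_size * block_size)

x = np.random.randint(0, 256, size=(1, 6, 6, 3), dtype=np.uint8)
print(space_to_depth_nhwc(x, 3).shape)     # (1, 2, 2, 27)
# Applying the depth_to_space sketch above with the same block size restores x.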

mace/python/tools/converter_tool/transformer.py

@@ -1819,7 +1819,9 @@ class Transformer(base_converter.ConverterInterface):
                             MaceOp.Reshape.name,
                             MaceOp.ResizeBilinear.name,
                             MaceOp.BatchToSpaceND.name,
-                            MaceOp.SpaceToBatchND.name]:
+                            MaceOp.SpaceToBatchND.name,
+                            MaceOp.SpaceToDepth.name,
+                            MaceOp.DepthToSpace.name]:
                 del op.quantize_info[:]
                 producer_op = self._producer[op.input[0]]
                 if producer_op.output[0] in self._option.input_nodes:
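
This change puts SpaceToDepth and DepthToSpace in the same group as Reshape and the batch/space ops: the converter discards any quantize_info of their own, so the output simply inherits its producer's quantization parameters. For SpaceToDepth and DepthToSpace this is safe because they only permute values, so dequantization commutes with the op. A small self-contained check of that property (the helper names and the affine uint8 scheme real = scale * (q - zero_point) are assumptions for illustration):

import numpy as np

scale, zero_point = 0.02, 128
q = np.random.randint(0, 256, size=(1, 4, 4, 8), dtype=np.uint8)

def dequantize(t):
    return scale * (t.astype(np.float32) - zero_point)

def permute(t):
    # Stand-in for any pure data-movement op (DepthToSpace with block_size 2 here).
    n, h, w, c = t.shape
    t = t.reshape(n, h, w, 2, 2, c // 4).transpose(0, 1, 3, 2, 4, 5)
    return t.reshape(n, 2 * h, 2 * w, c // 4)

# Dequantizing after the op, using the producer's scale/zero_point,
# gives the same floats as running the op on dequantized data.
assert np.allclose(dequantize(permute(q)), permute(dequantize(q)))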

test/ccunit/mace/ops/depth_to_space_test.cc

@@ -262,6 +262,76 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) {
   RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 8});
 }
 
+namespace {
+
+void TestDepthToSpaceQuantize(const int block_size,
+                              const std::vector<index_t> &shape) {
+  OpsTestNet net;
+  net.AddRandomInput<CPU, float>("Input", shape, false, false, true,
+                                 -1.f, 1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
+      .Input("InputNCHW")
+      .AddIntArg("block_size", block_size)
+      .Output("OutputNCHW")
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntArg("block_size", block_size)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+}  // namespace
+
+TEST_F(DepthToSpaceOpTest, Quantize) {
+  TestDepthToSpaceQuantize(2, {1, 192, 192, 4});
+  TestDepthToSpaceQuantize(3, {1, 111, 111, 9});
+  TestDepthToSpaceQuantize(5, {1, 20, 20, 25});
+  TestDepthToSpaceQuantize(7, {1, 14, 14, 49});
+}
+
 }  // namespace test
 }  // namespace ops
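
Both new Quantize tests follow the same flow: run the float CPU op as the reference, quantize the input to uint8, run the uint8 op, copy the input tensor's scale and zero_point onto the quantized output (the permutation does not change them), dequantize, and require the result to match the float reference within a 0.01 similarity tolerance. A rough NumPy model of the quantize/dequantize round trip behind that check (the min/max-based scheme below is an assumption; MACE's exact rounding may differ):

import numpy as np

def quantize_uint8(x):
    # Asymmetric uint8 quantization over the tensor's value range (assumed).
    lo, hi = float(x.min()), float(x.max())
    scale = (hi - lo) / 255.0
    zero_point = int(round(-lo / scale))
    q = np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)
    return q, scale, zero_point

def dequantize_uint8(q, scale, zero_point):
    return scale * (q.astype(np.float32) - zero_point)

x = np.random.uniform(-1.0, 1.0, size=(1, 20, 20, 25)).astype(np.float32)
q, scale, zp = quantize_uint8(x)

# Since the uint8 op only rearranges q, the end-to-end error of
# quantize -> op -> dequantize equals this plain round-trip error.
roundtrip = dequantize_uint8(q, scale, zp)
assert np.abs(roundtrip - x).max() <= 2 * scale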

test/ccunit/mace/ops/space_to_depth_test.cc

@@ -253,6 +253,77 @@ TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) {
   RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 32});
 }
 
+namespace {
+
+void TestSpaceToDepthQuantize(int block_size,
+                              const std::vector<index_t> &shape) {
+  OpsTestNet net;
+  net.AddRandomInput<CPU, float>("Input", shape, false, false, true,
+                                 -1.f, 1.f);
+
+  // run cpu
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
+      .Input("InputNCHW")
+      .AddIntArg("block_size", block_size)
+      .Output("OutputNCHW")
+      .Finalize(net.NewOperatorDef());
+  net.RunOp(CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
+
+  // run quantize
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
+      .Input("QuantizedInput")
+      .Output("QuantizedOutput")
+      .AddIntArg("block_size", block_size)
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *eq_output = net.GetTensor("QuantizedInput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+
+TEST_F(SpaceToDepthOpTest, Quantize) {
+  TestSpaceToDepthQuantize(2, {1, 384, 384, 1});
+  TestSpaceToDepthQuantize(3, {1, 333, 333, 1});
+  TestSpaceToDepthQuantize(5, {1, 100, 100, 1});
+  TestSpaceToDepthQuantize(7, {1, 98, 98, 1});
+}
+
+}  // namespace
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace