Commit 86b23aa0 authored by 刘托 (Liu Tuo)

Merge branch 's2d' into 'master'

Add quantized space to depth and depth to space

See merge request !1121
......@@ -25,7 +25,10 @@ namespace mace {
namespace ops {
template<DeviceType D, class T>
class DepthToSpaceOp : public Operation {
class DepthToSpaceOp;
template<>
class DepthToSpaceOp<CPU, float> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context),
......@@ -55,8 +58,8 @@ class DepthToSpaceOp : public Operation {
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
......@@ -89,6 +92,73 @@ class DepthToSpaceOp : public Operation {
const int block_size_;
};
#ifdef MACE_ENABLE_QUANTIZE
// Quantized (uint8) CPU specialization of DepthToSpace.
//
// Rearranges channel data into spatial blocks (NHWC layout): each group of
// block_size * block_size input channels becomes a block_size x block_size
// spatial patch in the output. The op is a pure element permutation, so the
// quantization scale/zero_point of the input apply unchanged to the output.
template<>
class DepthToSpaceOp<CPU, uint8_t> : public Operation {
 public:
  explicit DepthToSpaceOp(OpConstructContext *context)
      : Operation(context),
        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
    const index_t batch_size = input->dim(0);
    const index_t input_depth = input->dim(3);
    const index_t input_height = input->dim(1);
    const index_t input_width = input->dim(2);
    // Channels must split evenly into block_size^2 spatial positions.
    MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
               "input depth should be dividable by block_size * block_size",
               input_depth);
    const index_t output_depth = input_depth / (block_size_ * block_size_);
    const index_t output_width = input_width * block_size_;
    const index_t output_height = input_height * block_size_;
    std::vector<index_t>
        output_shape = {batch_size, output_height, output_width, output_depth};
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard logits_guard(input);
    Tensor::MappingGuard output_guard(output);
    const uint8_t *input_ptr = input->data<uint8_t>();
    uint8_t *output_ptr = output->mutable_data<uint8_t>();

    // Iterate the output linearly (NHWC) and gather from the input.
    for (index_t b = 0; b < batch_size; ++b) {
      for (index_t h = 0; h < output_height; ++h) {
        const index_t in_h = h / block_size_;
        const index_t offset_h = (h % block_size_);
        // Fixed: loop index was declared `int`, inconsistent with the
        // index_t used for all other spatial extents and at risk of
        // overflow for very wide tensors.
        for (index_t w = 0; w < output_width; ++w) {
          const index_t in_w = w / block_size_;
          const index_t offset_w = w % block_size_;
          // Channel offset selecting which block_size^2 slice of the input
          // depth feeds this spatial position.
          const index_t offset_d =
              (offset_h * block_size_ + offset_w) * output_depth;
          for (index_t d = 0; d < output_depth; ++d) {
            const index_t in_d = d + offset_d;
            const index_t o_index =
                ((b * output_height + h) * output_width + w) * output_depth
                    + d;
            const index_t i_index =
                ((b * input_height + in_h) * input_width + in_w) * input_depth
                    + in_d;
            output_ptr[o_index] = input_ptr[i_index];
          }
        }
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }

 private:
  const int block_size_;  // spatial block edge length (>= 1)
};
#endif  // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template<>
class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
......@@ -118,6 +188,11 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
}
......
......@@ -25,7 +25,10 @@ namespace mace {
namespace ops {
template<DeviceType D, class T>
class SpaceToDepthOp : public Operation {
class SpaceToDepthOp;
template<>
class SpaceToDepthOp<CPU, float> : public Operation {
public:
explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context),
......@@ -55,8 +58,8 @@ class SpaceToDepthOp : public Operation {
Tensor::MappingGuard logits_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
......@@ -87,6 +90,71 @@ class SpaceToDepthOp : public Operation {
const int block_size_;
};
#ifdef MACE_ENABLE_QUANTIZE
// Quantized (uint8) CPU specialization of SpaceToDepth.
//
// Moves each block_size x block_size spatial patch of the input (NHWC) into
// the channel dimension of a single output pixel. Elements are only
// permuted, so the input's quantization parameters remain valid for the
// output.
template<>
class SpaceToDepthOp<CPU, uint8_t> : public Operation {
 public:
  explicit SpaceToDepthOp(OpConstructContext *context)
      : Operation(context),
        block_size_(Operation::GetOptionalArg<int>("block_size", 1)) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    MACE_CHECK(input->dim_size() == 4, "input dim should be 4");

    const index_t batch_size = input->dim(0);
    const index_t input_height = input->dim(1);
    const index_t input_width = input->dim(2);
    const index_t input_depth = input->dim(3);
    // Spatial dims must tile exactly into block_size x block_size patches.
    MACE_CHECK(
        (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
        "input width and height should be dividable by block_size");

    const index_t output_depth = input_depth * block_size_ * block_size_;
    const index_t output_width = input_width / block_size_;
    const index_t output_height = input_height / block_size_;
    std::vector<index_t> output_shape =
        {batch_size, output_height, output_width, output_depth};
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const uint8_t *src = input->data<uint8_t>();
    uint8_t *dst = output->mutable_data<uint8_t>();

    // Walk the input linearly (NHWC order); for every input pixel locate the
    // destination pixel plus its channel base, then copy the channel vector.
    const uint8_t *in = src;
    for (index_t b = 0; b < batch_size; ++b) {
      for (index_t h = 0; h < input_height; ++h) {
        const index_t out_h = h / block_size_;
        const index_t block_row = h % block_size_;
        for (index_t w = 0; w < input_width; ++w) {
          const index_t out_w = w / block_size_;
          const index_t block_col = w % block_size_;
          // Where this patch position lands inside the expanded channels.
          const index_t channel_base =
              (block_row * block_size_ + block_col) * input_depth;
          uint8_t *out = dst
              + ((b * output_height + out_h) * output_width + out_w)
                  * output_depth
              + channel_base;
          for (index_t d = 0; d < input_depth; ++d) {
            out[d] = *in++;
          }
        }
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }

 private:
  const int block_size_;  // spatial block edge length (>= 1)
};
#endif  // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template<>
class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
......@@ -116,6 +184,11 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
}
......
......@@ -1819,7 +1819,9 @@ class Transformer(base_converter.ConverterInterface):
MaceOp.Reshape.name,
MaceOp.ResizeBilinear.name,
MaceOp.BatchToSpaceND.name,
MaceOp.SpaceToBatchND.name]:
MaceOp.SpaceToBatchND.name,
MaceOp.SpaceToDepth.name,
MaceOp.DepthToSpace.name]:
del op.quantize_info[:]
producer_op = self._producer[op.input[0]]
if producer_op.output[0] in self._option.input_nodes:
......
......@@ -262,6 +262,76 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomBatchHalf) {
RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 8});
}
namespace {
// End-to-end check of the quantized DepthToSpace kernel:
//   1. run the float CPU op as the reference,
//   2. quantize the input, run the uint8 op, dequantize its output,
//   3. require the dequantized result to be close to the float reference.
void TestDepthToSpaceQuantize(const int block_size,
const std::vector<index_t> &shape) {
OpsTestNet net;
// Random NHWC input; the -1.f/1.f args presumably bound the value range so
// quantization error stays small — TODO confirm AddRandomInput signature.
net.AddRandomInput<CPU, float>("Input",
shape,
false,
false,
true,
-1.f,
1.f);
// Float reference: the CPU float op works in NCHW, so convert in and out.
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Quantized path: quantize the NHWC input to uint8 ...
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// ... then run the uint8 DepthToSpace kernel under test.
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntArg("block_size", block_size)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// DepthToSpace only permutes elements, so the output shares the input's
// quantization parameters; copy them over so Dequantize can use them.
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check: dequantized result vs. float reference, 1% similarity tolerance.
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
}  // namespace
// Each shape's channel count equals block_size^2 — the minimum depth the op
// accepts (output depth 1) — exercising several non-power-of-two blocks.
TEST_F(DepthToSpaceOpTest, Quantize) {
TestDepthToSpaceQuantize(2, {1, 192, 192, 4});
TestDepthToSpaceQuantize(3, {1, 111, 111, 9});
TestDepthToSpaceQuantize(5, {1, 20, 20, 25});
TestDepthToSpaceQuantize(7, {1, 14, 14, 49});
}
} // namespace test
} // namespace ops
......
......@@ -253,6 +253,77 @@ TEST_F(SpaceToDepthOpTest, OPENCLBatchRandomHalf) {
RandomTest<DeviceType::GPU, half>(2, {2, 384, 384, 32});
}
namespace {
// End-to-end check of the quantized SpaceToDepth kernel: run the float CPU
// op as the reference, then quantize -> uint8 SpaceToDepth -> dequantize,
// and require the two results to be similar.
void TestSpaceToDepthQuantize(int block_size,
const std::vector<index_t> &shape) {
OpsTestNet net;
// Random NHWC input; the -1.f/1.f args presumably bound the value range so
// quantization error stays small — TODO confirm AddRandomInput signature.
net.AddRandomInput<CPU, float>("Input",
shape,
false,
false,
true,
-1.f,
1.f);
// Float reference: the CPU float op works in NCHW, so convert in and out.
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Quantized path: quantize the NHWC input to uint8 ...
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// ... then run the uint8 SpaceToDepth kernel under test.
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntArg("block_size", block_size)
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// SpaceToDepth only permutes elements, so the output shares the input's
// quantization parameters; copy them over so Dequantize can use them.
Tensor *eq_output = net.GetTensor("QuantizedInput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check: dequantized result vs. float reference, 1% similarity tolerance.
ExpectTensorSimilar<float>(*net.GetOutput("OutputCPU"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
// Single-channel inputs with heights/widths divisible by each block size;
// covers several non-power-of-two blocks.
TEST_F(SpaceToDepthOpTest, Quantize) {
TestSpaceToDepthQuantize(2, {1, 384, 384, 1});
TestSpaceToDepthQuantize(3, {1, 333, 333, 1});
TestSpaceToDepthQuantize(5, {1, 100, 100, 1});
TestSpaceToDepthQuantize(7, {1, 98, 98, 1});
}
}  // namespace
} // namespace test
} // namespace ops
} // namespace mace
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册