From 1bb11f64aeeaebc64f489deb1a31274844b00e1e Mon Sep 17 00:00:00 2001
From: luxuhui <luxuhui@xiaomi.com>
Date: Tue, 3 Nov 2020 12:06:40 +0800
Subject: [PATCH] fix: fix compute error in `mvn` and `reduce` ops

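Image mvnorm: compute the 2D image shape of the {batch, 1, 1, channels}
mean tensor with OpenCLBufferType::IN_OUT_CHANNEL instead of
IN_OUT_HEIGHT.

CPU reduce: pass tensors rather than raw pointers to the Reduce*Dims
helpers. The quantized (uint8) SUM path now subtracts the input zero
point from the accumulated sum, rescales by input scale / output scale,
adds the output zero point and saturates, instead of adding half the
element count and truncating to uint8. For SUM the output no longer
inherits the input's scale and zero point.

Test: run the quantized reduce test for SUM as well, setting the output
tensor's scale and zero point from the quantized float reference result.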

Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
---
 mace/ops/opencl/image/mvnorm.cc     |   2 +-
 mace/ops/reduce.cc                  | 141 ++++++++++++++++++----------
 test/ccunit/mace/ops/reduce_test.cc |  18 +++-
 3 files changed, 110 insertions(+), 51 deletions(-)
diff --git a/mace/ops/opencl/image/mvnorm.cc b/mace/ops/opencl/image/mvnorm.cc
index de409310..799e0ce4 100644
--- a/mace/ops/opencl/image/mvnorm.cc
+++ b/mace/ops/opencl/image/mvnorm.cc
@@ -85,7 +85,7 @@ MaceStatus MVNormKernel::DoCompute(
 
   const std::vector<index_t > mean_shape = {batch, 1, 1, channels};
   std::vector<size_t> mean_image_shape;
-  OpenCLUtil::CalImage2DShape(mean_shape, OpenCLBufferType::IN_OUT_HEIGHT,
+  OpenCLUtil::CalImage2DShape(mean_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                               &mean_image_shape);
   ScratchImageManager *scratch_manager =
       context->device()->gpu_runtime()->scratch_image_manager();
diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc
index 0141b05c..8f67de18 100644
--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -17,12 +17,13 @@
 #include <set>
 #include <vector>
 
-#include "mace/ops/common/reduce_type.h"
 #include "mace/core/future.h"
 #include "mace/core/ops/operator.h"
+#include "mace/core/quantize.h"
 #include "mace/core/registry/ops_registry.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 #include "mace/core/tensor.h"
+#include "mace/ops/common/reduce_type.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/reduce.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -75,9 +76,11 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
     Simplify(input);
-    // Use the same scale and zero point with input and output.
-    output->SetScale(input->scale());
-    output->SetZeroPoint(input->zero_point());
+    if (reduce_type_ != SUM) {
+      // Non-SUM reductions reuse the input's scale and zero point.
+      output->SetScale(input->scale());
+      output->SetZeroPoint(input->zero_point());
+    }
     output->Resize(out_shape_);
     Compute(context, input, output);
     return MaceStatus::MACE_SUCCESS;
@@ -139,10 +142,12 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
   }
 
   void Reduce1Dims(const OpContext *context,
-                   const T *input,
+                   const Tensor *input_tensor,
                    ReduceType type,
-                   T *output) {
+                   Tensor *output_tensor) {
     MACE_UNUSED(context);
+    const T *input = input_tensor->data<T>();
+    T *output = output_tensor->mutable_data<T>();
     if (reduce_first_axis_) {
       if (type == ReduceType::MEAN) {
         T tmp = 0.f;
@@ -183,12 +188,14 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
   }
 
   void Reduce2Dims(const OpContext *context,
-                   const T *input,
+                   const Tensor *input_tensor,
                    ReduceType type,
-                   T *output) {
+                   Tensor *output_tensor) {
     utils::ThreadPool
         &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+    const T *input = input_tensor->data<T>();
+    T *output = output_tensor->mutable_data<T>();
     if (reduce_first_axis_) {
       thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
         if (type == ReduceType::MEAN) {
@@ -285,12 +292,14 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
   }
 
   void Reduce3Dims(const OpContext *context,
-                   const T *input,
+                   const Tensor *input_tensor,
                    ReduceType type,
-                   T *output) {
+                   Tensor *output_tensor) {
     utils::ThreadPool
         &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+    const T *input = input_tensor->data<T>();
+    T *output = output_tensor->mutable_data<T>();
     if (reduce_first_axis_) {
       thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
         if (type == ReduceType::MEAN) {
@@ -407,8 +416,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
             for (int j = 0; j < data_reshape_[2]; ++j) {
               for (int k = 0; k < data_reshape_[1]; ++k) {
                 output[i * data_reshape_[2] + j] +=
-                    input[(i * data_reshape_[1] + k) * data_reshape_[2]
-                        + j];
+                    input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
               }
             }
           }
@@ -420,12 +428,14 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
   }
 
   void Reduce4Dims(const OpContext *context,
-                   const T *input,
+                   const Tensor *input_tensor,
                    ReduceType type,
-                   T *output) {
+                   Tensor *output_tensor) {
     utils::ThreadPool
         &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+    const T *input = input_tensor->data<T>();
+    T *output = output_tensor->mutable_data<T>();
     if (reduce_first_axis_) {
       thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                                 index_t start1, index_t end1, index_t step1) {
@@ -587,18 +597,17 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
 
   void Compute(const OpContext *context, const Tensor *input, Tensor *output) {
     Tensor::MappingGuard input_mapper(input);
-    const T *input_ptr = input->data<T>();
     Tensor::MappingGuard output_map(output);
     T *output_ptr = output->mutable_data<T>();
     memset(static_cast<void *>(output_ptr), 0, output->size() * sizeof(T));
     switch (data_reshape_.size()) {
-      case 1:Reduce1Dims(context, input_ptr, reduce_type_, output_ptr);
+      case 1:Reduce1Dims(context, input, reduce_type_, output);
         break;
-      case 2:Reduce2Dims(context, input_ptr, reduce_type_, output_ptr);
+      case 2:Reduce2Dims(context, input, reduce_type_, output);
         break;
-      case 3:Reduce3Dims(context, input_ptr, reduce_type_, output_ptr);
+      case 3:Reduce3Dims(context, input, reduce_type_, output);
         break;
-      case 4:Reduce4Dims(context, input_ptr, reduce_type_, output_ptr);
+      case 4:Reduce4Dims(context, input, reduce_type_, output);
         break;
       default:MACE_CHECK(false, "not implemented in mace")
           << "data reshape size" << data_reshape_.size()
@@ -617,8 +626,10 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
 template<>
 void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
     const OpContext *context,
-    const uint8_t *input, ReduceType type, uint8_t *output) {
+    const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) {
   MACE_UNUSED(context);
+  const uint8_t *input = input_tensor->data<uint8_t>();
+  uint8_t *output = output_tensor->mutable_data<uint8_t>();
   if (reduce_first_axis_) {
     if (type == ReduceType::MEAN) {
       uint32_t tmp = 0;
@@ -640,11 +651,15 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
       }
       output[0] = tmp;
     } else if (type == ReduceType::SUM) {
-      uint32_t tmp = 0;
+      int32_t sum = 0;
+      const auto in_zero_point = input_tensor->zero_point();
+      const auto out_zero_point = output_tensor->zero_point();
+      const auto scale = input_tensor->scale() / output_tensor->scale();
       for (int i = 0; i < data_reshape_[0]; ++i) {
-        tmp = tmp + input[i];
+        sum = sum + input[i];
       }
-      output[0] = static_cast<uint8_t>(tmp + data_reshape_[0] / 2);
+      const float f = (sum - in_zero_point * data_reshape_[0]) * scale;
+      output[0] = Saturate<uint8_t>(std::roundf(f + out_zero_point));
     } else {
       MACE_NOT_IMPLEMENTED;
     }
@@ -656,10 +671,12 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
 template<>
 void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
     const OpContext *context,
-    const uint8_t *input, ReduceType type, uint8_t *output) {
+    const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) {
   utils::ThreadPool
       &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+  const uint8_t *input = input_tensor->data<uint8_t>();
+  uint8_t *output = output_tensor->mutable_data<uint8_t>();
   if (reduce_first_axis_) {
     thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
       if (type == ReduceType::MEAN) {
@@ -687,13 +704,17 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
           }
           output[i] = tmp;
         }
-      }  else if (type == ReduceType::SUM) {
+      } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start; i < end; i += step) {
-          uint32_t tmp = 0;
+          int32_t sum = 0;
           for (int j = 0; j < data_reshape_[0]; ++j) {
-            tmp += input[j * data_reshape_[1] + i];
+            sum += input[j * data_reshape_[1] + i];
           }
-          output[i] = static_cast<uint8_t>(tmp + data_reshape_[0] / 2);
+          const float f = (sum - in_zero_point * data_reshape_[0]) * scale;
+          output[i] = Saturate<uint8_t>(std::roundf(f + out_zero_point));
         }
       } else {
         MACE_NOT_IMPLEMENTED;
@@ -727,12 +748,16 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
           output[i] = tmp;
         }
       } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start; i < end; i += step) {
-          uint32_t tmp = 0;
+          int32_t sum = 0;
           for (int j = 0; j < data_reshape_[1]; ++j) {
-            tmp += input[i * data_reshape_[1] + j];
+            sum += input[i * data_reshape_[1] + j];
           }
-          output[i] = static_cast<uint8_t>(tmp + data_reshape_[1] / 2);
+          const float f = (sum - in_zero_point * data_reshape_[1]) * scale;
+          output[i] = Saturate<uint8_t>(std::roundf(f + out_zero_point));
         }
       } else {
         MACE_NOT_IMPLEMENTED;
@@ -744,10 +769,12 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
 template<>
 void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
     const OpContext *context,
-    const uint8_t *input, ReduceType type, uint8_t *output) {
+    const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) {
   utils::ThreadPool
       &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+  const uint8_t *input = input_tensor->data<uint8_t>();
+  uint8_t *output = output_tensor->mutable_data<uint8_t>();
   if (reduce_first_axis_) {
     thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
       if (type == ReduceType::MEAN) {
@@ -787,15 +814,19 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
           output[i] = tmp;
         }
       } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start; i < end; i += step) {
-          uint32_t tmp = 0;
+          int32_t sum = 0;
           for (int j = 0; j < data_reshape_[2]; ++j) {
             for (int k = 0; k < data_reshape_[0]; ++k) {
-              tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
+              sum += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
             }
           }
-          index_t dim = data_reshape_[0] * data_reshape_[2];
-          output[i] = static_cast<uint8_t>(tmp + dim / 2);
+          const auto count = data_reshape_[2] * data_reshape_[0];
+          const float f = (sum - in_zero_point * count) * scale;
+          output[i] = Saturate<uint8_t>(std::roundf(f + out_zero_point));
         }
       } else {
         MACE_NOT_IMPLEMENTED;
@@ -841,14 +872,18 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
           }
         }
       } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start0; i < end0; i += step0) {
           for (index_t j = start1; j < end1; j += step1) {
-            uint32_t tmp = 0;
+            int32_t sum = 0;
             for (int k = 0; k < data_reshape_[1]; ++k) {
-              tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
+              sum += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
             }
+            const float f = (sum - in_zero_point * data_reshape_[1]) * scale;
             output[i * data_reshape_[2] + j] =
-                static_cast<uint8_t>(tmp + data_reshape_[1] / 2);
+                Saturate<uint8_t>(std::roundf(f + out_zero_point));
           }
         }
       } else {
@@ -861,10 +896,12 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
 template<>
 void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
     const OpContext *context,
-    const uint8_t *input, ReduceType type, uint8_t *output) {
+    const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) {
   utils::ThreadPool
       &thread_pool = context->device()->cpu_runtime()->thread_pool();
 
+  const uint8_t *input = input_tensor->data<uint8_t>();
+  uint8_t *output = output_tensor->mutable_data<uint8_t>();
   if (reduce_first_axis_) {
     thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                               index_t start1, index_t end1, index_t step1) {
@@ -914,18 +951,22 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
           }
         }
       } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start0; i < end0; i += step0) {
           for (index_t j = start1; j < end1; j += step1) {
-            uint32_t tmp = 0;
+            int32_t sum = 0;
             for (int k = 0; k < data_reshape_[2]; ++k) {
               for (int t = 0; t < data_reshape_[0]; ++t) {
-                tmp += input[((t * data_reshape_[1] + i) *
+                sum += input[((t * data_reshape_[1] + i) *
                     data_reshape_[2] + k) * data_reshape_[3] + j];
               }
             }
-            index_t dim = data_reshape_[0] * data_reshape_[2];
+            const auto count = data_reshape_[2] * data_reshape_[0];
+            const float f = (sum - in_zero_point * count) * scale;
             output[i * data_reshape_[3] + j] =
-                static_cast<uint8_t>(tmp + dim / 2);
+                Saturate<uint8_t>(std::roundf(f + out_zero_point));
           }
         }
       } else {
@@ -983,18 +1024,22 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
           }
         }
       } else if (type == ReduceType::SUM) {
+        const auto in_zero_point = input_tensor->zero_point();
+        const auto out_zero_point = output_tensor->zero_point();
+        const auto scale = input_tensor->scale() / output_tensor->scale();
         for (index_t i = start0; i < end0; i += step0) {
           for (index_t j = start1; j < end1; j += step1) {
-            uint32_t tmp = 0;
+            int32_t sum = 0;
             for (int k = 0; k < data_reshape_[1]; ++k) {
               for (int t = 0; t < data_reshape_[3]; ++t) {
-                tmp += input[((i * data_reshape_[1] + k) *
+                sum += input[((i * data_reshape_[1] + k) *
                     data_reshape_[2] + j) * data_reshape_[3] + t];
               }
             }
-            index_t dim = data_reshape_[1] * data_reshape_[3];
+            const auto count = data_reshape_[1] * data_reshape_[3];
+            const float f = (sum - in_zero_point * count) * scale;
             output[i * data_reshape_[2] + j] =
-                static_cast<uint8_t>(tmp + dim / 2);
+                Saturate<uint8_t>(std::roundf(f + out_zero_point));
           }
         }
       } else {
diff --git a/test/ccunit/mace/ops/reduce_test.cc b/test/ccunit/mace/ops/reduce_test.cc
index 753bf419..fd394f4b 100644
--- a/test/ccunit/mace/ops/reduce_test.cc
+++ b/test/ccunit/mace/ops/reduce_test.cc
@@ -372,6 +372,15 @@ void TestQuant(const std::vector<index_t> &input_shape,
     net.TransformDataFormat<DeviceType::CPU, float>(
         "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
 
+    OpDefBuilder("Quantize", "DoQuantizeOutput")
+        .Input("Output")
+        .Output("ExpectedQuantizedOutput")
+        .OutputType({DT_UINT8})
+        .AddIntArg("T", DT_UINT8)
+        .AddIntArg("non_zero", true)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
     OpDefBuilder("Quantize", "QuantizeInput")
         .Input("Input")
         .Output("QuantizedInput")
@@ -392,7 +401,12 @@ void TestQuant(const std::vector<index_t> &input_shape,
         .AddIntArg("has_data_format", 1)
         .AddIntArg("T", DT_UINT8)
         .Finalize(net.NewOperatorDef());
-    net.RunOp();
+    net.Setup(DeviceType::CPU);
+    Tensor *expect_quantize_output = net.GetTensor("ExpectedQuantizedOutput");
+    Tensor *quantize_output = net.GetTensor("QuantizedOutput");
+    quantize_output->SetScale(expect_quantize_output->scale());
+    quantize_output->SetZeroPoint(expect_quantize_output->zero_point());
+    net.Run();
 
     OpDefBuilder("Dequantize", "DeQuantizeTest")
         .Input("QuantizedOutput")
@@ -406,7 +420,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
                                *net.GetTensor("DequantizedOutput"), 0.01);
   };
 
-  for (ReduceType type : {MEAN, MIN, MAX}) {
+  for (ReduceType type : {MEAN, MIN, MAX, SUM}) {
     func(type);
   }
 }
-- 
GitLab