From 1bb11f64aeeaebc64f489deb1a31274844b00e1e Mon Sep 17 00:00:00 2001 From: luxuhui Date: Tue, 3 Nov 2020 12:06:40 +0800 Subject: [PATCH] fix: fix compute error in `mvn` and `reduce` ops N/A Signed-off-by: Luxuhui --- mace/ops/opencl/image/mvnorm.cc | 2 +- mace/ops/reduce.cc | 141 ++++++++++++++++++---------- test/ccunit/mace/ops/reduce_test.cc | 18 +++- 3 files changed, 110 insertions(+), 51 deletions(-) diff --git a/mace/ops/opencl/image/mvnorm.cc b/mace/ops/opencl/image/mvnorm.cc index de409310..799e0ce4 100644 --- a/mace/ops/opencl/image/mvnorm.cc +++ b/mace/ops/opencl/image/mvnorm.cc @@ -85,7 +85,7 @@ MaceStatus MVNormKernel::DoCompute( const std::vector mean_shape = {batch, 1, 1, channels}; std::vector mean_image_shape; - OpenCLUtil::CalImage2DShape(mean_shape, OpenCLBufferType::IN_OUT_HEIGHT, + OpenCLUtil::CalImage2DShape(mean_shape, OpenCLBufferType::IN_OUT_CHANNEL, &mean_image_shape); ScratchImageManager *scratch_manager = context->device()->gpu_runtime()->scratch_image_manager(); diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 0141b05c..8f67de18 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -17,12 +17,13 @@ #include #include -#include "mace/ops/common/reduce_type.h" #include "mace/core/future.h" #include "mace/core/ops/operator.h" +#include "mace/core/quantize.h" #include "mace/core/registry/ops_registry.h" #include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/tensor.h" +#include "mace/ops/common/reduce_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/reduce.h" #endif // MACE_ENABLE_OPENCL @@ -75,9 +76,11 @@ class ReduceOp : public ReduceOpBase { const Tensor *input = this->Input(0); Tensor *output = this->Output(0); Simplify(input); - // Use the same scale and zero point with input and output. - output->SetScale(input->scale()); - output->SetZeroPoint(input->zero_point()); + if (reduce_type_ != SUM) { + // Use the same scale and zero point with input and output. + output->SetScale(input->scale()); + output->SetZeroPoint(input->zero_point()); + } output->Resize(out_shape_); Compute(context, input, output); return MaceStatus::MACE_SUCCESS; @@ -139,10 +142,12 @@ class ReduceOp : public ReduceOpBase { } void Reduce1Dims(const OpContext *context, - const T *input, + const Tensor *input_tensor, ReduceType type, - T *output) { + Tensor *output_tensor) { MACE_UNUSED(context); + const T *input = input_tensor->data(); + T *output = output_tensor->mutable_data(); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { T tmp = 0.f; @@ -183,12 +188,14 @@ class ReduceOp : public ReduceOpBase { } void Reduce2Dims(const OpContext *context, - const T *input, + const Tensor *input_tensor, ReduceType type, - T *output) { + Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const T *input = input_tensor->data(); + T *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { @@ -285,12 +292,14 @@ class ReduceOp : public ReduceOpBase { } void Reduce3Dims(const OpContext *context, - const T *input, + const Tensor *input_tensor, ReduceType type, - T *output) { + Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const T *input = input_tensor->data(); + T *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { @@ -407,8 +416,7 @@ class ReduceOp : public ReduceOpBase { for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[1]; ++k) { output[i * data_reshape_[2] + j] += - input[(i * data_reshape_[1] + k) * data_reshape_[2] - + j]; + input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; } } } @@ -420,12 +428,14 @@ class ReduceOp : public ReduceOpBase { } void Reduce4Dims(const OpContext *context, - const T *input, + const Tensor *input_tensor, ReduceType type, - T *output) { + Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const T *input = input_tensor->data(); + T *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, index_t start1, index_t end1, index_t step1) { @@ -587,18 +597,17 @@ class ReduceOp : public ReduceOpBase { void Compute(const OpContext *context, const Tensor *input, Tensor *output) { Tensor::MappingGuard input_mapper(input); - const T *input_ptr = input->data(); Tensor::MappingGuard output_map(output); T *output_ptr = output->mutable_data(); memset(static_cast(output_ptr), 0, output->size() * sizeof(T)); switch (data_reshape_.size()) { - case 1:Reduce1Dims(context, input_ptr, reduce_type_, output_ptr); + case 1:Reduce1Dims(context, input, reduce_type_, output); break; - case 2:Reduce2Dims(context, input_ptr, reduce_type_, output_ptr); + case 2:Reduce2Dims(context, input, reduce_type_, output); break; - case 3:Reduce3Dims(context, input_ptr, reduce_type_, output_ptr); + case 3:Reduce3Dims(context, input, reduce_type_, output); break; - case 4:Reduce4Dims(context, input_ptr, reduce_type_, output_ptr); + case 4:Reduce4Dims(context, input, reduce_type_, output); break; default:MACE_CHECK(false, "not implemented in mace") << "data reshape size" << data_reshape_.size() @@ -617,8 +626,10 @@ class ReduceOp : public ReduceOpBase { template<> void ReduceOp::Reduce1Dims( const OpContext *context, - const uint8_t *input, ReduceType type, uint8_t *output) { + const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) { MACE_UNUSED(context); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); if (reduce_first_axis_) { if (type == ReduceType::MEAN) { uint32_t tmp = 0; @@ -640,11 +651,15 @@ void ReduceOp::Reduce1Dims( } output[0] = tmp; } else if (type == ReduceType::SUM) { - uint32_t tmp = 0; + int32_t sum = 0; + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (int i = 0; i < data_reshape_[0]; ++i) { - tmp = tmp + input[i]; + sum = sum + input[i]; } - output[0] = static_cast(tmp + data_reshape_[0] / 2); + const float f = (sum - in_zero_point * data_reshape_[0]) * scale; + output[0] = Saturate(std::roundf(f + out_zero_point)); } else { MACE_NOT_IMPLEMENTED; } @@ -656,10 +671,12 @@ void ReduceOp::Reduce1Dims( template<> void ReduceOp::Reduce2Dims( const OpContext *context, - const uint8_t *input, ReduceType type, uint8_t *output) { + const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { @@ -687,13 +704,17 @@ void ReduceOp::Reduce2Dims( } output[i] = tmp; } - } else if (type == ReduceType::SUM) { + } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start; i < end; i += step) { - uint32_t tmp = 0; + int32_t sum = 0; for (int j = 0; j < data_reshape_[0]; ++j) { - tmp += input[j * data_reshape_[1] + i]; + sum += input[j * data_reshape_[1] + i]; } - output[i] = static_cast(tmp + data_reshape_[0] / 2); + const float f = (sum - in_zero_point * data_reshape_[0]) * scale; + output[i] = Saturate(std::roundf(f + out_zero_point)); } } else { MACE_NOT_IMPLEMENTED; @@ -727,12 +748,16 @@ void ReduceOp::Reduce2Dims( output[i] = tmp; } } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start; i < end; i += step) { - uint32_t tmp = 0; + int32_t sum = 0; for (int j = 0; j < data_reshape_[1]; ++j) { - tmp += input[i * data_reshape_[1] + j]; + sum += input[i * data_reshape_[1] + j]; } - output[i] = static_cast(tmp + data_reshape_[1] / 2); + const float f = (sum - in_zero_point * data_reshape_[1]) * scale; + output[i] = Saturate(std::roundf(f + out_zero_point)); } } else { MACE_NOT_IMPLEMENTED; @@ -744,10 +769,12 @@ void ReduceOp::Reduce2Dims( template<> void ReduceOp::Reduce3Dims( const OpContext *context, - const uint8_t *input, ReduceType type, uint8_t *output) { + const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute1D([=](index_t start, index_t end, index_t step) { if (type == ReduceType::MEAN) { @@ -787,15 +814,19 @@ void ReduceOp::Reduce3Dims( output[i] = tmp; } } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start; i < end; i += step) { - uint32_t tmp = 0; + int32_t sum = 0; for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[0]; ++k) { - tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; + sum += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j]; } } - index_t dim = data_reshape_[0] * data_reshape_[2]; - output[i] = static_cast(tmp + dim / 2); + const auto count = data_reshape_[2] * data_reshape_[0]; + const float f = (sum - in_zero_point * count) * scale; + output[i] = Saturate(std::roundf(f + out_zero_point)); } } else { MACE_NOT_IMPLEMENTED; @@ -841,14 +872,18 @@ void ReduceOp::Reduce3Dims( } } } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start0; i < end0; i += step0) { for (index_t j = start1; j < end1; j += step1) { - uint32_t tmp = 0; + int32_t sum = 0; for (int k = 0; k < data_reshape_[1]; ++k) { - tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; + sum += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j]; } + const float f = (sum - in_zero_point * data_reshape_[1]) * scale; output[i * data_reshape_[2] + j] = - static_cast(tmp + data_reshape_[1] / 2); + Saturate(std::roundf(f + out_zero_point)); } } } else { @@ -861,10 +896,12 @@ void ReduceOp::Reduce3Dims( template<> void ReduceOp::Reduce4Dims( const OpContext *context, - const uint8_t *input, ReduceType type, uint8_t *output) { + const Tensor *input_tensor, ReduceType type, Tensor *output_tensor) { utils::ThreadPool &thread_pool = context->device()->cpu_runtime()->thread_pool(); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); if (reduce_first_axis_) { thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, index_t start1, index_t end1, index_t step1) { @@ -914,18 +951,22 @@ void ReduceOp::Reduce4Dims( } } } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start0; i < end0; i += step0) { for (index_t j = start1; j < end1; j += step1) { - uint32_t tmp = 0; + int32_t sum = 0; for (int k = 0; k < data_reshape_[2]; ++k) { for (int t = 0; t < data_reshape_[0]; ++t) { - tmp += input[((t * data_reshape_[1] + i) * + sum += input[((t * data_reshape_[1] + i) * data_reshape_[2] + k) * data_reshape_[3] + j]; } } - index_t dim = data_reshape_[0] * data_reshape_[2]; + const auto count = data_reshape_[2] * data_reshape_[0]; + const float f = (sum - in_zero_point * count) * scale; output[i * data_reshape_[3] + j] = - static_cast(tmp + dim / 2); + Saturate(std::roundf(f + out_zero_point)); } } } else { @@ -983,18 +1024,22 @@ void ReduceOp::Reduce4Dims( } } } else if (type == ReduceType::SUM) { + const auto in_zero_point = input_tensor->zero_point(); + const auto out_zero_point = output_tensor->zero_point(); + const auto scale = input_tensor->scale() / output_tensor->scale(); for (index_t i = start0; i < end0; i += step0) { for (index_t j = start1; j < end1; j += step1) { - uint32_t tmp = 0; + int32_t sum = 0; for (int k = 0; k < data_reshape_[1]; ++k) { for (int t = 0; t < data_reshape_[3]; ++t) { - tmp += input[((i * data_reshape_[1] + k) * + sum += input[((i * data_reshape_[1] + k) * data_reshape_[2] + j) * data_reshape_[3] + t]; } } - index_t dim = data_reshape_[1] * data_reshape_[3]; + const auto count = data_reshape_[1] * data_reshape_[3]; + const float f = (sum - in_zero_point * count) * scale; output[i * data_reshape_[2] + j] = - static_cast(tmp + dim / 2); + Saturate(std::roundf(f + out_zero_point)); } } } else { diff --git a/test/ccunit/mace/ops/reduce_test.cc b/test/ccunit/mace/ops/reduce_test.cc index 753bf419..fd394f4b 100644 --- a/test/ccunit/mace/ops/reduce_test.cc +++ b/test/ccunit/mace/ops/reduce_test.cc @@ -372,6 +372,15 @@ void TestQuant(const std::vector &input_shape, net.TransformDataFormat( "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); + OpDefBuilder("Quantize", "DoQuantizeOutput") + .Input("Output") + .Output("ExpectedQuantizedOutput") + .OutputType({DT_UINT8}) + .AddIntArg("T", DT_UINT8) + .AddIntArg("non_zero", true) + .Finalize(net.NewOperatorDef()); + net.RunOp(); + OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") .Output("QuantizedInput") @@ -392,7 +401,12 @@ void TestQuant(const std::vector &input_shape, .AddIntArg("has_data_format", 1) .AddIntArg("T", DT_UINT8) .Finalize(net.NewOperatorDef()); - net.RunOp(); + net.Setup(DeviceType::CPU); + Tensor *expect_quantize_output = net.GetTensor("ExpectedQuantizedOutput"); + Tensor *quantize_output = net.GetTensor("QuantizedOutput"); + quantize_output->SetScale(expect_quantize_output->scale()); + quantize_output->SetZeroPoint(expect_quantize_output->zero_point()); + net.Run(); OpDefBuilder("Dequantize", "DeQuantizeTest") .Input("QuantizedOutput") @@ -406,7 +420,7 @@ void TestQuant(const std::vector &input_shape, *net.GetTensor("DequantizedOutput"), 0.01); }; - for (ReduceType type : {MEAN, MIN, MAX}) { + for (ReduceType type : {MEAN, MIN, MAX, SUM}) { func(type); } } -- GitLab