diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 7f03ce12221a7e074e59a34cdb38f918b86ff51a..aa434203db3da0eeb6be83c3e0538b3b76551fbe 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -84,7 +84,7 @@ cc_library( ]), deps = [ "//mace/core", - "@gtest//:gtest", + "@gtest", ], ) @@ -254,7 +254,7 @@ cc_library( ":arm_neon_kernels", ":ref_kernels", ":testing", - "@gtest//:gtest", + "@gtest", ], alwayslink = 1, ) @@ -289,7 +289,7 @@ cc_library( ":opencl_kernels", ":ref_kernels", ":testing", - "@gtest//:gtest", + "@gtest", ], alwayslink = 1, ) @@ -329,12 +329,12 @@ cc_library( "ops_registry.h", "ops_test_util.h", "fixpoint.h", - "gemmlowp_util.h", + "common/gemmlowp_util.h", "quantization_util.h", ], ) + if_quantize_enabled(glob([ "fixpoint.h", - "gemmlowp_util.h", + "common/gemmlowp_util.h", "quantization_util.h", ])), copts = [ diff --git a/mace/ops/arm/q8/eltwise.cc b/mace/ops/arm/q8/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..f987da81373282f769f660e5f10e7795413b3be4 --- /dev/null +++ b/mace/ops/arm/q8/eltwise.cc @@ -0,0 +1,157 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/arm/q8/eltwise.h" + +#include <arm_neon.h> +#include <algorithm> + +#include "mace/ops/common/gemmlowp_util.h" +#include "mace/utils/logging.h" + +namespace mace { +namespace ops { +namespace arm { +namespace q8 { + +MaceStatus Eltwise::Compute(const OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) { + MACE_UNUSED(context); + MACE_CHECK(type_ == SUM || type_ == SUB, + "Quantized Elementwise only support SUM and SUB now."); + + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data<uint8_t>(); + auto input1_ptr = input1->data<uint8_t>(); + auto output_ptr = output->mutable_data<uint8_t>(); + +#pragma omp parallel for schedule(runtime) + for (index_t i = 0; i <= output->size() - 8; i += 8) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, 
vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + int32x4_t res_low, res_high; + if (type_ == SUM) { + res_low = vaddq_s32(input0_low_s32, input1_low_s32); + res_high = vaddq_s32(input0_high_s32, input1_high_s32); + } else { + res_low = vsubq_s32(input0_low_s32, input1_low_s32); + res_high = vsubq_s32(input0_high_s32, input1_high_s32); + } + res_low = vqrdmulhq_n_s32(res_low, output_multiplier); + res_high = vqrdmulhq_n_s32(res_high, output_multiplier); + res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift); + res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift); + const auto res_low_s16 = vmovn_s32(res_low); + const auto res_high_s16 = vmovn_s32(res_high); + const auto output_val = vaddq_s16(vcombine_s16(res_low_s16, + res_high_s16), + 
vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + + index_t handled_output_size = output->size() - output->size() % 8; +#pragma omp parallel for schedule(runtime) + for (index_t i = handled_output_size; i < output->size(); ++i) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = multiplied_input0 - multiplied_input1; + } + + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(res, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace q8 +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/q8/eltwise.h b/mace/ops/arm/q8/eltwise.h new file mode 100644 index 0000000000000000000000000000000000000000..5223dc30c58c3b7a97059969133200c3465870c2 --- /dev/null +++ b/mace/ops/arm/q8/eltwise.h @@ -0,0 +1,48 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This implements a quantized (uint8) elementwise SUM/SUB kernel +// using NEON fixed-point arithmetic. + +#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_ +#define MACE_OPS_ARM_Q8_ELTWISE_H_ + +#include "mace/core/op_context.h" +#include "mace/core/types.h" +#include "mace/ops/common/eltwise_type.h" + +namespace mace { +namespace ops { +namespace arm { +namespace q8 { + +class Eltwise { + public: + explicit Eltwise(const EltwiseType type) : type_(type) {} + + MaceStatus Compute(const OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output); + + private: + EltwiseType type_; +}; + +} // namespace q8 +} // namespace arm +} // namespace ops +} // namespace mace + +#endif  // MACE_OPS_ARM_Q8_ELTWISE_H_ diff --git a/mace/ops/common/eltwise_type.h b/mace/ops/common/eltwise_type.h new file mode 100644 index 0000000000000000000000000000000000000000..634c4919c18f221b255939a01d8411428b8f3476 --- /dev/null +++ b/mace/ops/common/eltwise_type.h @@ -0,0 +1,40 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_ +#define MACE_OPS_COMMON_ELTWISE_TYPE_H_ + +namespace mace { +namespace ops { + +enum EltwiseType { + SUM = 0, + SUB = 1, + PROD = 2, + DIV = 3, + MIN = 4, + MAX = 5, + NEG = 6, + ABS = 7, + SQR_DIFF = 8, + POW = 9, + EQUAL = 10, + FLOOR_DIV = 11, + NONE = 12, +}; + +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_ diff --git a/mace/ops/gemmlowp_util.h b/mace/ops/common/gemmlowp_util.h similarity index 96% rename from mace/ops/gemmlowp_util.h rename to mace/ops/common/gemmlowp_util.h index c7091544ef5d90ef5fa11cbaacb052744dbe0ef0..c7eed2ad275c9b51cc5cf55cf2f88f90edf3d500 100644 --- a/mace/ops/gemmlowp_util.h +++ b/mace/ops/common/gemmlowp_util.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_GEMMLOWP_UTIL_H_ -#define MACE_OPS_GEMMLOWP_UTIL_H_ +#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ +#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ #include @@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline { }; } // namespace mace -#endif // MACE_OPS_GEMMLOWP_UTIL_H_ +#endif // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_ diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 19794b38be56fe3a99deb0583b0967575de571ae..653e3e33f535915eb52baad01e4e14f3b7a80bb7 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -41,7 +41,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #include "mace/ops/quantization_util.h" #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index f035eeee579907fea2ddb77d04ca5c982c903b67..768349fffd268d58aee8f05260f37c841021947e 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions 
and // limitations under the License. +#ifdef MACE_ENABLE_NEON +#ifdef MACE_ENABLE_QUANTIZE +#include "mace/ops/arm/q8/eltwise.h" +#endif  // MACE_ENABLE_QUANTIZE +#endif  // MACE_ENABLE_NEON + #include "mace/ops/eltwise.h" #include @@ -1035,19 +1041,30 @@ class EltwiseOp : public Operation { scalar_input_index_(Operation::GetOptionalArg<int>( "scalar_input_index", 1)), data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>( - "data_format", 0))) {} + "data_format", 0))) +#ifdef MACE_ENABLE_NEON + , eltwise_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>( + "type", static_cast<int>(ops::EltwiseType::NONE)))) +#endif + {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + MACE_CHECK(this->InputSize() == 2, + "Quantized Elementwise don't support broadcast now."); + const Tensor *input1 = this->Input(1); Tensor *output = this->Output(0); - MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); + MACE_CHECK(type_ == SUM || type_ == SUB, + "Quantized Elementwise only support SUM and SUB now."); MACE_CHECK(input0->size() == input1->size(), "input0 and input1 must have the same shape."); MACE_CHECK(output->scale() != 0); MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); +#ifdef MACE_ENABLE_NEON + eltwise_.Compute(context, input0, input1, output); +#else constexpr int left_shift = 20; const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); const double adjusted_input0_scale = input0->scale() / doubled_scale; @@ -1078,57 +1095,8 @@ class EltwiseOp : public Operation { auto input0_ptr = input0->data(); auto input1_ptr = input1->data(); auto output_ptr = output->mutable_data(); - - index_t handled_output_size = 0; -#ifdef MACE_ENABLE_NEON - #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - 
input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); - auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); - sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); - sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); - sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); - sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); - const auto sum_low_s16 = vmovn_s32(sum_low); - const auto sum_high_s16 = vmovn_s32(sum_high); - const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, - sum_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } - handled_output_size = output->size() - output->size() % 8; -#endif // NEON #pragma omp parallel for schedule(runtime) - for (index_t i = handled_output_size; i < output->size(); ++i) { + for (index_t i = 0; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); const int32_t shifted_input0 = offset_input0 * (1 << left_shift); @@ -1143,14 +1111,22 @@ class EltwiseOp : public Operation { gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, input1_multiplier), -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; + + int32_t res; + if (type_ == SUM) { + res = multiplied_input0 + multiplied_input1; + } else { + res = 
multiplied_input0 - multiplied_input1; + } + const int32_t output_val = gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, + gemmlowp::SaturatingRoundingDoublingHighMul(res, output_multiplier), -output_shift) + output->zero_point(); output_ptr[i] = Saturate(output_val); } +#endif // NEON return MaceStatus::MACE_SUCCESS; } @@ -1162,6 +1138,9 @@ class EltwiseOp : public Operation { int32_t scalar_input_index_; DataFormat data_format_; Tensor scalar_tensor_; +#ifdef MACE_ENABLE_NEON + arm::q8::Eltwise eltwise_; +#endif }; #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index c79c6c27abfb3cef4ed02abfacc3dea5384e1bd3..208d7f26549b6642502dcf6022983ad4f0f52622 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -15,25 +15,11 @@ #ifndef MACE_OPS_ELTWISE_H_ #define MACE_OPS_ELTWISE_H_ +#include "mace/ops/common/eltwise_type.h" + namespace mace { namespace ops { -enum EltwiseType { - SUM = 0, - SUB = 1, - PROD = 2, - DIV = 3, - MIN = 4, - MAX = 5, - NEG = 6, - ABS = 7, - SQR_DIFF = 8, - POW = 9, - EQUAL = 10, - FLOOR_DIV = 11, - NONE = 12, -}; - inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace ops diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 0bfb666f70d3fd606703e32bcd3a4baf3f788fa6..cb239f53e0c01d79dc718d5b3d4eca636b187863 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,12 +30,12 @@ void EltwiseBenchmark( OpsTestNet net; // Add input data - if (D == DeviceType::GPU) { - net.AddRandomInput("Input0", {n, h, w, c}); - net.AddRandomInput("Input1", {n, h, w, c}); - } else { + if (D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput("Input0", {n, c, h, w}); net.AddRandomInput("Input1", {n, c, h, w}); + } else { + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } OpDefBuilder("Eltwise", "EltwiseTest") @@ -47,15 +47,21 @@ void 
EltwiseBenchmark( .Output("Output") .Finalize(net.NewOperatorDef()); + net.Setup(D); + + if (D == DeviceType::CPU && DataTypeToEnum::value == DT_UINT8) { + net.GetTensor("Output")->SetScale(0.1); + } + // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); net.Sync(); } mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); net.Sync(); } } @@ -86,6 +92,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256); MACE_BM_ELTWISE(5, 1, 128, 128, 32); MACE_BM_ELTWISE(5, 1, 240, 240, 256); +MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU); +MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 7ca799e2e8701b8adb439218c17ce10d8fbd0f56..4f18810e73213c6113cca236d39e215731435983 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, } } -void QuantizedSum(const std::vector &shape) { +void Quantized(const std::vector &shape, + const ops::EltwiseType type) { // Construct graph OpsTestNet net; @@ -753,7 +754,7 @@ void QuantizedSum(const std::vector &shape) { OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("data_format", DataFormat::NCHW) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -794,7 +795,7 @@ void QuantizedSum(const std::vector &shape) { .Input("QuantizedInput0") .Input("QuantizedInput1") .Output("QuantizedOutput") - .AddIntArg("type", static_cast(ops::EltwiseType::SUM)) + .AddIntArg("type", static_cast(type)) .AddIntArg("T", static_cast(DT_UINT8)) .Finalize(net.NewOperatorDef()); net.Setup(DeviceType::CPU); @@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) { {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36}); } -TEST_F(EltwiseOpTest, 
QuantizedSum) { - QuantizedSum({1, 32, 32, 16}); - QuantizedSum({1, 31, 31, 17}); +TEST_F(EltwiseOpTest, Quantized) { + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM); + Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB); + Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB); } } // namespace test diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index a3aebcb49abe323a24bc792f857577481be19f35..07a65d79459166281a6f13cb4d58817a69d0f3ac 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -38,7 +38,7 @@ #endif // MACE_ENABLE_NEON #ifdef MACE_ENABLE_QUANTIZE -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 54f3e55bbaf07d04026ed28de0ed361bd9ff2061..b407ac34c357a6e81295007ee946ff61e0c18b7b 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -22,7 +22,7 @@ #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/fixpoint.h" -#include "mace/ops/gemmlowp_util.h" +#include "mace/ops/common/gemmlowp_util.h" #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 33d4633635528b94a3d8d0ed108398368572a36c..f93d1819fb0ef025e6f034032c16795d878b3591 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1423,8 +1423,9 @@ class Transformer(base_converter.ConverterInterface): else: mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " - "but get %s(%s)" - % (op.name, op.type)) + "but get %s(%s, %s)" + % (op.name, op.type, + mace_pb2.DataType.Name(data_type_arg.i))) for input_node in self._option.input_nodes.values(): new_input_name = self.input_name_map[input_node.name] @@ -1725,18 +1726,29 @@ class Transformer(base_converter.ConverterInterface): 
self.add_quantize_info(op, 0.0, 1.0) self._quantize_activation_info[op.output[0]] = quantize_info elif (op.type == MaceOp.Eltwise.name - and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value # noqa and not op.quantize_info and len(op.input) == 2 and len(op.input[0]) not in self._consts and len(op.input[1]) not in self._consts): - del op.quantize_info[:] producer_op0 = self._producer[op.input[0]] producer_op1 = self._producer[op.input[1]] - minval = producer_op0.quantize_info[0].minval \ - + producer_op1.quantize_info[0].minval - maxval = producer_op0.quantize_info[0].maxval \ - + producer_op1.quantize_info[0].maxval + if ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i \ + == EltwiseType.SUM.value: + minval = producer_op0.quantize_info[0].minval \ + + producer_op1.quantize_info[0].minval + maxval = producer_op0.quantize_info[0].maxval \ + + producer_op1.quantize_info[0].maxval + elif ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i \ + == EltwiseType.SUB.value: + minval = producer_op0.quantize_info[0].minval \ + - producer_op1.quantize_info[0].maxval + maxval = producer_op0.quantize_info[0].maxval \ + - producer_op1.quantize_info[0].minval + else: + mace_check(False, "Quantized Elementwise only support:" + " SUM and SUB now.") quantize_info = \ self.add_quantize_info(op, minval, maxval) self._quantize_activation_info[op.output[0]] = quantize_info diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h index 81d820cbfc39b2fe9edb729071d351c1993b1b01..ae8551c15f47b112c469085b868af2694ee7b452 100644 --- a/mace/utils/quantize.h +++ b/mace/utils/quantize.h @@ -19,6 +19,8 @@ #include #include +#include "mace/utils/logging.h" + namespace mace { template @@ -138,11 +140,6 @@ inline void Dequantize(const T *input, inline void QuantizeMultiplier(double multiplier, int32_t* output_multiplier, int32_t* shift) { - if (multiplier == 0.f) { - *output_multiplier = 0; - *shift = 0; - return; - } const double 
q = std::frexp(multiplier, shift); auto qint = static_cast<int64_t>(roundl(q * (1ll << 31))); if (qint == (1ll << 31)) {