Commit d4f1368d authored by Bin Li

Add quantized eltwise sub

Parent f2f05c0d
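For reference, the new kernels implement the gemmlowp-style requantization scheme sketched below. This is a minimal float-reference sketch, not code from the commit: QuantizedAddOrSub is a hypothetical helper name, and the shipped kernels perform the same computation in int32 fixed point via QuantizeMultiplier, SaturatingRoundingDoublingHighMul, and RoundingDivideByPOT.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize both inputs onto the common scale 2 * max(s0, s1), combine,
    // then requantize to the output scale. The factor of 2 keeps the sum or
    // difference of the two rescaled values inside the representable range.
    uint8_t QuantizedAddOrSub(uint8_t a, uint8_t b,
                              float s0, int32_t z0,  // input0 scale / zero point
                              float s1, int32_t z1,  // input1 scale / zero point
                              float so, int32_t zo,  // output scale / zero point
                              bool is_sub) {
      const float common_scale = 2 * std::max(s0, s1);
      const float v0 = (a - z0) * (s0 / common_scale);  // adjusted_input0_scale
      const float v1 = (b - z1) * (s1 / common_scale);  // adjusted_input1_scale
      const float res = is_sub ? v0 - v1 : v0 + v1;
      const int32_t q =
          static_cast<int32_t>(std::lround(res * common_scale / so)) + zo;
      return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }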
mace/ops/BUILD
...
@@ -84,7 +84,7 @@ cc_library(
     ]),
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )
...
@@ -254,7 +254,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )
...
@@ -289,7 +289,7 @@ cc_library(
        ":opencl_kernels",
        ":ref_kernels",
        ":testing",
-       "@gtest//:gtest",
+       "@gtest",
    ],
    alwayslink = 1,
 )
...
@@ -329,12 +329,12 @@ cc_library(
         "ops_registry.h",
         "ops_test_util.h",
         "fixpoint.h",
-        "gemmlowp_util.h",
+        "common/gemmlowp_util.h",
         "quantization_util.h",
     ],
 ) + if_quantize_enabled(glob([
     "fixpoint.h",
-    "gemmlowp_util.h",
+    "common/gemmlowp_util.h",
     "quantization_util.h",
 ])),
 copts = [
...
mace/ops/arm/q8/eltwise.cc (new file)
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/q8/eltwise.h"
#include <arm_neon.h>
#include <algorithm>
#include "mace/ops/common/gemmlowp_util.h"
#include "mace/utils/logging.h"
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
MaceStatus Eltwise::Compute(const OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
MACE_UNUSED(context);
MACE_CHECK(type_ == SUM || type_ == SUB,
"Quantized Elementwise only support SUM and SUB now.");
constexpr int left_shift = 20;
const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
const double adjusted_input0_scale = input0->scale() / doubled_scale;
const double adjusted_input1_scale = input1->scale() / doubled_scale;
const double adjusted_output_scale =
doubled_scale / ((1 << left_shift) * output->scale());
int32_t input0_multiplier;
int32_t input1_multiplier;
int32_t output_multiplier;
int32_t input0_shift;
int32_t input1_shift;
int32_t output_shift;
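  // QuantizeMultiplier (quantization_util.h) decomposes each adjusted scale
  // into a Q31 fixed-point multiplier plus a power-of-two shift.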
QuantizeMultiplier(adjusted_input0_scale,
&input0_multiplier,
&input0_shift);
QuantizeMultiplier(adjusted_input1_scale,
&input1_multiplier,
&input1_shift);
QuantizeMultiplier(adjusted_output_scale,
&output_multiplier,
&output_shift);
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard input1_guard(input1);
Tensor::MappingGuard output_guard(output);
auto input0_ptr = input0->data<uint8_t>();
auto input1_ptr = input1->data<uint8_t>();
auto output_ptr = output->mutable_data<uint8_t>();
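  // Vectorized main loop: 8 uint8 elements per iteration, widened
  // u8 -> s16 -> s32 before the fixed-point rescaling is applied.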
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i);
const auto input0_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input0_val));
const auto input1_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input1_val));
const auto offset_input0 =
vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
const auto offset_input1 =
vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
const auto left_shift_dup = vdupq_n_s32(left_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
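    // vqrdmulhq_n_s32 is the NEON form of gemmlowp's
    // SaturatingRoundingDoublingHighMul: multiply by the Q31 multiplier and
    // keep the rounded high 32 bits.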
input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
const auto input0_shift_dup = vdupq_n_s32(input0_shift);
const auto input1_shift_dup = vdupq_n_s32(input1_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
int32x4_t res_low, res_high;
if (type_ == SUM) {
res_low = vaddq_s32(input0_low_s32, input1_low_s32);
res_high = vaddq_s32(input0_high_s32, input1_high_s32);
} else {
res_low = vsubq_s32(input0_low_s32, input1_low_s32);
res_high = vsubq_s32(input0_high_s32, input1_high_s32);
}
res_low = vqrdmulhq_n_s32(res_low, output_multiplier);
res_high = vqrdmulhq_n_s32(res_high, output_multiplier);
res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift);
res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift);
const auto res_low_s16 = vmovn_s32(res_low);
const auto res_high_s16 = vmovn_s32(res_high);
const auto output_val = vaddq_s16(vcombine_s16(res_low_s16,
res_high_s16),
vdupq_n_s16(output->zero_point()));
vst1_u8(output_ptr + i, vqmovun_s16(output_val));
}
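  // Scalar tail: the remaining size % 8 elements go through the same
  // arithmetic using gemmlowp's scalar helpers.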
index_t handled_output_size = output->size() - output->size() % 8;
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
const int32_t multiplied_input0 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
input0_multiplier),
-input0_shift);
const int32_t multiplied_input1 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
input1_multiplier),
-input1_shift);
int32_t res;
if (type_ == SUM) {
res = multiplied_input0 + multiplied_input1;
} else {
res = multiplied_input0 - multiplied_input1;
}
const int32_t output_val =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(res,
output_multiplier),
-output_shift) + output->zero_point();
output_ptr[i] = Saturate<uint8_t>(output_val);
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
mace/ops/arm/q8/eltwise.h (new file)
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Quantized (uint8) elementwise kernels; the fixed-point arithmetic follows
// gemmlowp's quantized math helpers.
#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_
#define MACE_OPS_ARM_Q8_ELTWISE_H_
#include "mace/core/op_context.h"
#include "mace/core/types.h"
#include "mace/ops/common/eltwise_type.h"
namespace mace {
namespace ops {
namespace arm {
namespace q8 {
class Eltwise {
public:
explicit Eltwise(const EltwiseType type) : type_(type) {}
MaceStatus Compute(const OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output);
private:
EltwiseType type_;
};
} // namespace q8
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_Q8_ELTWISE_H_
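As wired into EltwiseOp<DeviceType::CPU, uint8_t> in the eltwise.cc diff below: the op constructs an arm::q8::Eltwise from its "type" argument, and when built with MACE_ENABLE_NEON, Run() delegates to eltwise_.Compute(context, input0, input1, output).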
mace/ops/common/eltwise_type.h (new file)
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_
#define MACE_OPS_COMMON_ELTWISE_TYPE_H_
namespace mace {
namespace ops {
enum EltwiseType {
SUM = 0,
SUB = 1,
PROD = 2,
DIV = 3,
MIN = 4,
MAX = 5,
NEG = 6,
ABS = 7,
SQR_DIFF = 8,
POW = 9,
EQUAL = 10,
FLOOR_DIV = 11,
NONE = 12,
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_COMMON_ELTWISE_TYPE_H_
mace/ops/common/gemmlowp_util.h (moved from mace/ops/gemmlowp_util.h)
...
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_GEMMLOWP_UTIL_H_
-#define MACE_OPS_GEMMLOWP_UTIL_H_
+#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
+#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_

 #include <tuple>
...
@@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline {
 };
 }  // namespace mace

-#endif  // MACE_OPS_GEMMLOWP_UTIL_H_
+#endif  // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
...
@@ -41,7 +41,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #include "mace/ops/quantization_util.h"
 #endif  // MACE_ENABLE_QUANTIZE
...
mace/ops/eltwise.cc
...
@@ -12,6 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifdef MACE_ENABLE_NEON
+#ifdef MACE_ENABLE_QUANTIZE
+#include "mace/ops/arm/q8/eltwise.h"
+#endif  // MACE_ENABLE_QUANTIZE
+#endif  // MACE_ENABLE_NEON
+
 #include "mace/ops/eltwise.h"

 #include <algorithm>
...
@@ -1035,19 +1041,30 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1)),
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
-            "data_format", 0))) {}
+            "data_format", 0)))
+#ifdef MACE_ENABLE_NEON
+        , eltwise_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
+              "type", static_cast<int>(ops::EltwiseType::NONE))))
+#endif
+  {}

   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
     const Tensor *input0 = this->Input(0);
-    const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
+    MACE_CHECK(this->InputSize() == 2,
+               "Quantized Elementwise don't support broadcast now.");
+    const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(0);
-    MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
+    MACE_CHECK(type_ == SUM || type_ == SUB,
+               "Quantized Elementwise only support SUM and SUB now.");
     MACE_CHECK(input0->size() == input1->size(),
                "input0 and input1 must have the same shape.");
     MACE_CHECK(output->scale() != 0);
     MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));

+#ifdef MACE_ENABLE_NEON
+    eltwise_.Compute(context, input0, input1, output);
+#else
     constexpr int left_shift = 20;
     const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
     const double adjusted_input0_scale = input0->scale() / doubled_scale;
...
@@ -1078,57 +1095,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
     auto input0_ptr = input0->data<uint8_t>();
     auto input1_ptr = input1->data<uint8_t>();
     auto output_ptr = output->mutable_data<uint8_t>();

-    index_t handled_output_size = 0;
-#ifdef MACE_ENABLE_NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
-      const auto input0_val = vld1_u8(input0_ptr + i);
-      const auto input1_val = vld1_u8(input1_ptr + i);
-      const auto input0_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input0_val));
-      const auto input1_val_s16 =
-          vreinterpretq_s16_u16(vmovl_u8(input1_val));
-      const auto offset_input0 =
-          vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
-      const auto offset_input1 =
-          vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
-      auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
-      auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
-      auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
-      auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
-      const auto left_shift_dup = vdupq_n_s32(left_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
-      input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
-      input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
-      input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
-      input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
-      const auto input0_shift_dup = vdupq_n_s32(input0_shift);
-      const auto input1_shift_dup = vdupq_n_s32(input1_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
-      auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
-      auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
-      sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
-      sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
-      sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
-      sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
-      const auto sum_low_s16 = vmovn_s32(sum_low);
-      const auto sum_high_s16 = vmovn_s32(sum_high);
-      const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16,
-                                                     sum_high_s16),
-                                        vdupq_n_s16(output->zero_point()));
-      vst1_u8(output_ptr + i, vqmovun_s16(output_val));
-    }
-    handled_output_size = output->size() - output->size() % 8;
-#endif  // NEON
 #pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i < output->size(); ++i) {
+    for (index_t i = 0; i < output->size(); ++i) {
       const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
       const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
       const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
...
@@ -1143,14 +1111,22 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
           gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
                                                       input1_multiplier),
           -input1_shift);
-      const int32_t sum = multiplied_input0 + multiplied_input1;
+      int32_t res;
+      if (type_ == SUM) {
+        res = multiplied_input0 + multiplied_input1;
+      } else {
+        res = multiplied_input0 - multiplied_input1;
+      }
       const int32_t output_val =
           gemmlowp::RoundingDivideByPOT(
               gemmlowp::SaturatingRoundingDoublingHighMul(res,
                                                           output_multiplier),
               -output_shift) + output->zero_point();
       output_ptr[i] = Saturate<uint8_t>(output_val);
     }
+#endif  // NEON

     return MaceStatus::MACE_SUCCESS;
   }
...
@@ -1162,6 +1138,9 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
   int32_t scalar_input_index_;
   DataFormat data_format_;
   Tensor scalar_tensor_;
+#ifdef MACE_ENABLE_NEON
+  arm::q8::Eltwise eltwise_;
+#endif
 };

 #endif  // MACE_ENABLE_QUANTIZE
...
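Note that after this change the scalar fixed-point loop in eltwise.cc is compiled only when MACE_ENABLE_NEON is off; with NEON enabled the op delegates entirely to arm::q8::Eltwise, which carries both the vector body and its own scalar tail.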
mace/ops/eltwise.h
...
@@ -15,25 +15,11 @@
 #ifndef MACE_OPS_ELTWISE_H_
 #define MACE_OPS_ELTWISE_H_

+#include "mace/ops/common/eltwise_type.h"
+
 namespace mace {
 namespace ops {

-enum EltwiseType {
-  SUM = 0,
-  SUB = 1,
-  PROD = 2,
-  DIV = 3,
-  MIN = 4,
-  MAX = 5,
-  NEG = 6,
-  ABS = 7,
-  SQR_DIFF = 8,
-  POW = 9,
-  EQUAL = 10,
-  FLOOR_DIV = 11,
-  NONE = 12,
-};
-
 inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; }

 }  // namespace ops
...
mace/ops/eltwise_benchmark.cc
...
@@ -30,12 +30,12 @@ void EltwiseBenchmark(
   OpsTestNet net;

   // Add input data
-  if (D == DeviceType::GPU) {
-    net.AddRandomInput<D, T>("Input0", {n, h, w, c});
-    net.AddRandomInput<D, T>("Input1", {n, h, w, c});
-  } else {
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value != DT_UINT8) {
     net.AddRandomInput<D, T>("Input0", {n, c, h, w});
     net.AddRandomInput<D, T>("Input1", {n, c, h, w});
+  } else {
+    net.AddRandomInput<D, T>("Input0", {n, h, w, c});
+    net.AddRandomInput<D, T>("Input1", {n, h, w, c});
   }

   OpDefBuilder("Eltwise", "EltwiseTest")
...
@@ -47,15 +47,21 @@
       .Output("Output")
       .Finalize(net.NewOperatorDef());

+  net.Setup(D);
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value == DT_UINT8) {
+    net.GetTensor("Output")->SetScale(0.1);
+  }
+
   // Warm-up
   for (int i = 0; i < 5; ++i) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }

   mace::testing::StartTiming();
   while (iters--) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }
 }
...
@@ -86,6 +92,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256);
 MACE_BM_ELTWISE(5, 1, 128, 128, 32);
 MACE_BM_ELTWISE(5, 1, 240, 240, 256);

+MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU);
+MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
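The added net.Setup(D) plus SetScale(0.1) gives the quantized output tensor a nonzero scale before the benchmark runs, which the uint8 kernel requires (it checks output->scale() != 0); the warm-up and timing loops switch from RunOp(D) to Run(), presumably so setup is not repeated on every iteration.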
mace/ops/eltwise_test.cc
...
@@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
   }
 }

-void QuantizedSum(const std::vector<index_t> &shape) {
+void Quantized(const std::vector<index_t> &shape,
+               const ops::EltwiseType type) {
   // Construct graph
   OpsTestNet net;
...
@@ -753,7 +754,7 @@
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("TInput0")
       .Input("TInput1")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());
...
@@ -794,7 +795,7 @@
       .Input("QuantizedInput0")
       .Input("QuantizedInput1")
       .Output("QuantizedOutput")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("T", static_cast<int>(DT_UINT8))
       .Finalize(net.NewOperatorDef());
   net.Setup(DeviceType::CPU);
...
@@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) {
       {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5}, {4, 1, 0, 1, 4, 4, 9, 16, 25, 36});
 }

-TEST_F(EltwiseOpTest, QuantizedSum) {
-  QuantizedSum({1, 32, 32, 16});
-  QuantizedSum({1, 31, 31, 17});
+TEST_F(EltwiseOpTest, Quantized) {
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM);
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB);
 }

 }  // namespace test
...
...
@@ -38,7 +38,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
...
...
@@ -22,7 +22,7 @@
 #ifdef MACE_ENABLE_QUANTIZE
 #include "mace/ops/fixpoint.h"
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
...
...
@@ -1423,8 +1423,9 @@ class Transformer(base_converter.ConverterInterface):
             else:
                 mace_check(op.type == MaceOp.Quantize.name,
                            "Quantization only support float ops, "
-                           "but get %s(%s)"
-                           % (op.name, op.type))
+                           "but get %s(%s, %s)"
+                           % (op.name, op.type,
+                              mace_pb2.DataType.Name(data_type_arg.i)))

         for input_node in self._option.input_nodes.values():
             new_input_name = self.input_name_map[input_node.name]
...
@@ -1725,18 +1726,29 @@ class Transformer(base_converter.ConverterInterface):
                 self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
             elif (op.type == MaceOp.Eltwise.name
-                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
                   and not op.quantize_info
                   and len(op.input) == 2
                   and len(op.input[0]) not in self._consts
                   and len(op.input[1]) not in self._consts):
+                del op.quantize_info[:]
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
-                minval = producer_op0.quantize_info[0].minval \
-                    + producer_op1.quantize_info[0].minval
-                maxval = producer_op0.quantize_info[0].maxval \
-                    + producer_op1.quantize_info[0].maxval
+                if ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUM.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        + producer_op1.quantize_info[0].minval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        + producer_op1.quantize_info[0].maxval
+                elif ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUB.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        - producer_op1.quantize_info[0].maxval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        - producer_op1.quantize_info[0].minval
+                else:
+                    mace_check(False, "Quantized Elementwise only support:"
+                               " SUM and SUB now.")
                 quantize_info = \
                     self.add_quantize_info(op, minval, maxval)
                 self._quantize_activation_info[op.output[0]] = quantize_info
...
mace/ops/quantization_util.h
...
@@ -19,6 +19,8 @@
 #include <cmath>
 #include <limits>

+#include "mace/utils/logging.h"
+
 namespace mace {

 template<typename T>
...
@@ -138,11 +140,6 @@
 inline void QuantizeMultiplier(double multiplier,
                                int32_t* output_multiplier,
                                int32_t* shift) {
-  if (multiplier == 0.f) {
-    *output_multiplier = 0;
-    *shift = 0;
-    return;
-  }
   const double q = std::frexp(multiplier, shift);
   auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
   if (qint == (1ll << 31)) {
...
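Dropping the multiplier == 0 special case is safe: std::frexp(0.0, shift) returns 0.0 and sets *shift to 0, so the general path already yields a zero multiplier and shift.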