Commit fc568de7 authored by Bin Li

Support quantized deeplabv3

Parent 91e9b208
@@ -47,7 +47,7 @@ class OpMap {
if (op_map_.find(op_type) != end(op_map_)) {
return op_map_[op_type];
} else {
LOG(ERROR) << "DSP unsupoorted op type: " << op_type;
LOG(ERROR) << "DSP unsupported op type: " << op_type;
return MACE_OP_INVALID;
}
}
@@ -22,7 +22,10 @@
#include <memory>
#include <vector>
// We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelize it
// with OpenMP for MACE's quantized depthwise_conv2d.
#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#include "mace/core/future.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
@@ -25,6 +25,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
@@ -959,6 +960,141 @@ struct EltwiseFunctor : EltwiseFunctorBase {
Tensor scalar_tensor_;
};
template <>
struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
MACE_CHECK(input0->size() == input1->size(),
"input0 and input1 must have the same shape.");
MACE_CHECK(output->scale() != 0);
MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));
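    // Quantized SUM following the gemmlowp/TFLite-style fixed-point scheme:
    // both zero-point-adjusted inputs are rescaled to a common scale
    // (2 * max of the input scales) with `left_shift` bits of headroom,
    // added, and the 32-bit sum is rescaled to the output scale/zero point.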
constexpr int left_shift = 20;
const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
const double adjusted_input0_scale = input0->scale() / doubled_scale;
const double adjusted_input1_scale = input1->scale() / doubled_scale;
const double adjusted_output_scale =
doubled_scale / ((1 << left_shift) * output->scale());
int32_t input0_multiplier;
int32_t input1_multiplier;
int32_t output_multiplier;
int32_t input0_shift;
int32_t input1_shift;
int32_t output_shift;
QuantizeMultiplier(adjusted_input0_scale,
&input0_multiplier,
&input0_shift);
QuantizeMultiplier(adjusted_input1_scale,
&input1_multiplier,
&input1_shift);
QuantizeMultiplier(adjusted_output_scale,
&output_multiplier,
&output_shift);
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard input1_guard(input1);
Tensor::MappingGuard output_guard(output);
auto input0_ptr = input0->data<uint8_t>();
auto input1_ptr = input1->data<uint8_t>();
auto output_ptr = output->mutable_data<uint8_t>();
index_t handled_output_size = 0;
#ifdef MACE_ENABLE_NEON
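    // NEON path: handle 8 uint8 elements per iteration; any remainder falls
    // through to the scalar loop below.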
#pragma omp parallel for
for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i);
const auto input0_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input0_val));
const auto input1_val_s16 =
vreinterpretq_s16_u16(vmovl_u8(input1_val));
const auto offset_input0 =
vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
const auto offset_input1 =
vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
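      // Shift the offset inputs left by `left_shift` bits and apply each
      // input's fixed-point multiplier and shift to bring both onto the
      // common scale.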
const auto left_shift_dup = vdupq_n_s32(left_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
const auto input0_shift_dup = vdupq_n_s32(input0_shift);
const auto input1_shift_dup = vdupq_n_s32(input1_shift);
input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
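      // Rescale the 32-bit sums to the output scale, then add the output
      // zero point and saturate back to uint8.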
sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
const auto sum_low_s16 = vmovn_s32(sum_low);
const auto sum_high_s16 = vmovn_s32(sum_high);
const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16,
sum_high_s16),
vdupq_n_s16(output->zero_point()));
vst1_u8(output_ptr + i, vqmovun_s16(output_val));
}
handled_output_size = output->size() - output->size() % 8;
#endif  // MACE_ENABLE_NEON
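    // Scalar fallback: processes the tail (and everything when NEON is
    // disabled) with the same fixed-point arithmetic as the vector path.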
#pragma omp parallel for
for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
const int32_t multiplied_input0 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
input0_multiplier),
-input0_shift);
const int32_t multiplied_input1 =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
input1_multiplier),
-input1_shift);
const int32_t sum = multiplied_input0 + multiplied_input1;
const int32_t output_val =
gemmlowp::RoundingDivideByPOT(
gemmlowp::SaturatingRoundingDoublingHighMul(sum,
output_multiplier),
-output_shift) + output->zero_point();
output_ptr[i] = Saturate<uint8_t>(output_val);
}
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
@@ -28,6 +28,11 @@ void Register_Eltwise(OperatorRegistryBase *op_registry) {
.TypeConstraint<int32_t>("T")
.Build(),
EltwiseOp<DeviceType::CPU, int32_t>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
EltwiseOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
@@ -559,6 +559,86 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
}
}
void QuantizedSum(const std::vector<index_t> &shape) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input0", shape, true, true);
net.AddRandomInput<DeviceType::CPU, float>("Input1", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
.AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
OpDefBuilder("Quantize", "QuantizeInput0")
.Input("Input0")
.Output("QuantizedInput0")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Quantize", "QuantizeInput1")
.Input("Input1")
.Output("QuantizedInput1")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Quantize", "QuantizeOutput")
.Input("Output")
.Output("ExpectedQuantizedOutput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Eltwise", "QuantizeEltwiseTest")
.Input("QuantizedInput0")
.Input("QuantizedInput1")
.Output("QuantizedOutput")
.AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
.AddIntArg("T", static_cast<int>(DT_UINT8))
.Finalize(net.NewOperatorDef());
net.Setup(DeviceType::CPU);
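  // Reuse the reference output's quantization parameters for the quantized
  // Eltwise output so the dequantized results are directly comparable.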
Tensor *eq_output = net.GetTensor("ExpectedQuantizedOutput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
net.Run();
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
} // namespace
TEST_F(EltwiseOpTest, RandomTensorScalarFloat) {
@@ -694,6 +774,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcast) {
{1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 0, 0, 0, 0, 0});
}
TEST_F(EltwiseOpTest, QuantizedSum) {
QuantizedSum({1, 32, 32, 16});
QuantizedSum({1, 31, 31, 17});
}
} // namespace test
} // namespace ops
} // namespace mace
@@ -207,6 +207,7 @@ class TransformerRule(Enum):
TRANSFORM_BASIC_LSTMCELL = 27
TRANSFORM_FAKE_QUANTIZE = 28
CHECK_QUANTIZE_INFO = 29
REARRANGE_BATCH_TO_SPACE = 30
class ConverterInterface(object):
@@ -364,6 +365,7 @@ class ConverterOption(object):
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
TransformerRule.TRANSFORM_GPU_WINOGRAD,
TransformerRule.TRANSFORM_ADD_TO_BIASADD,
TransformerRule.REARRANGE_BATCH_TO_SPACE,
TransformerRule.FOLD_BIASADD,
TransformerRule.FLATTEN_ATROUS_CONV,
TransformerRule.FOLD_ACTIVATION,
@@ -68,6 +68,8 @@ class Transformer(base_converter.ConverterInterface):
self.transform_gpu_winograd, # data_format related
TransformerRule.TRANSFORM_ADD_TO_BIASADD:
self.transform_add_to_biasadd,
TransformerRule.REARRANGE_BATCH_TO_SPACE:
self.rearrange_batch_to_space,
TransformerRule.FOLD_BIASADD: self.fold_biasadd,
TransformerRule.FLATTEN_ATROUS_CONV: self.flatten_atrous_conv,
TransformerRule.FOLD_ACTIVATION: self.fold_activation,
@@ -793,6 +795,9 @@ class Transformer(base_converter.ConverterInterface):
if len(replace_op.quantize_info) > 0:
del op.quantize_info[:]
op.quantize_info.extend(replace_op.quantize_info)
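            # Keep the activation quantize info lookup in sync with the
            # op's outputs after copying quantize_info from replace_op.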
for i in range(len(op.quantize_info)):
self._quantize_activation_info[op.output[i]] = \
op.quantize_info[i]
def fold_biasadd(self):
net = self._model
@@ -1741,6 +1746,56 @@ class Transformer(base_converter.ConverterInterface):
return False
def rearrange_batch_to_space(self):
if not self._option.quantize:
return False
# Put b2s after biasadd and relu
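        # Rewire conv -> b2s -> biasadd[/act] into conv -> biasadd[/act] -> b2s,
        # so the later fold_biasadd/fold_activation passes can fuse bias and
        # activation into the (quantized) conv op.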
for conv_op in self._model.op:
if conv_op.type in [MaceOp.Conv2D.name,
MaceOp.DepthwiseConv2d.name] \
and self.consumer_count(conv_op.output[0]) == 1:
b2s_op = self._consumers[conv_op.output[0]][0]
if b2s_op.type == MaceOp.BatchToSpaceND.name \
and self.consumer_count(b2s_op.output[0]) == 1:
biasadd_or_act_op = self._consumers[b2s_op.output[0]][0]
if biasadd_or_act_op.type == MaceOp.BiasAdd.name:
biasadd_op = biasadd_or_act_op
if self.consumer_count(biasadd_op.output[0]) == 1 \
and self._consumers[biasadd_op.output[0]][0].type == MaceOp.Activation.name: # noqa
act_op = self._consumers[biasadd_op.output[0]][0]
biasadd_op.input[0] = conv_op.output[0]
b2s_op.input[0] = act_op.output[0]
for op in self._consumers[act_op.output[0]]:
self.replace(op.input,
act_op.output[0],
b2s_op.output[0])
else:
biasadd_op.input[0] = conv_op.output[0]
b2s_op.input[0] = biasadd_op.output[0]
for op in self._consumers[biasadd_op.output[0]]:
self.replace(op.input,
biasadd_op.output[0],
b2s_op.output[0])
print("Rearrange batch to space: %s(%s)"
% (b2s_op.name, b2s_op.type))
return True
elif biasadd_or_act_op.type == MaceOp.Activation.name:
act_op = biasadd_or_act_op
act_op.input[0] = conv_op.output[0]
b2s_op.input[0] = act_op.output[0]
for op in self._consumers[act_op.output[0]]:
self.replace(op.input,
act_op.output[0],
b2s_op.output[0])
print("Rearrange batch to space: %s(%s)"
% (b2s_op.name, b2s_op.type))
return True
return False
def add_quantize_tensor_range(self):
if not self._option.quantize:
return False
@@ -1796,6 +1851,26 @@ class Transformer(base_converter.ConverterInterface):
quantize_info = \
self.add_quantize_info(op, 0.0, 1.0)
self._quantize_activation_info[op.output[0]] = quantize_info
elif (op.type == MaceOp.Eltwise.name
and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value # noqa
and not op.quantize_info
and len(op.input) == 2
                  and op.input[0] not in self._consts
                  and op.input[1] not in self._consts):
del op.quantize_info[:]
producer_op0 = self._producer[op.input[0]]
producer_op1 = self._producer[op.input[1]]
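                # Estimate the SUM output range as the elementwise sum of the
                # two producers' ranges, then derive scale/zero_point from it.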
quantize_info = op.quantize_info.add()
quantize_info.minval = producer_op0.quantize_info[0].minval \
+ producer_op1.quantize_info[0].minval
quantize_info.maxval = producer_op0.quantize_info[0].maxval \
+ producer_op1.quantize_info[0].maxval
scale, zero = quantize_util.adjust_range(quantize_info.minval,
quantize_info.maxval,
non_zero=False)
quantize_info.scale = scale
quantize_info.zero_point = zero
self._quantize_activation_info[op.output[0]] = quantize_info
print ("Add default quantize info for input")
for input_node in self._option.input_nodes.values():
@@ -1818,6 +1893,7 @@ class Transformer(base_converter.ConverterInterface):
if not self._option.quantize:
return False
print("Check quantize info")
for op in self._model.op:
if (op.name.find(MaceKeyword.mace_input_node_name) == -1
and op.name.find(MaceKeyword.mace_output_node_name) == -1