diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_nn_ops.h
index 7f800550a2d0e3c6409dbe7edc43c940488ea40d..928a735dd570a672b4dc6d4159c6c69baa74f1bc 100644
--- a/mace/core/runtime/hexagon/hexagon_nn_ops.h
+++ b/mace/core/runtime/hexagon/hexagon_nn_ops.h
@@ -47,7 +47,7 @@ class OpMap {
     if (op_map_.find(op_type) != end(op_map_)) {
       return op_map_[op_type];
     } else {
-      LOG(ERROR) << "DSP unsupoorted op type: " << op_type;
+      LOG(ERROR) << "DSP unsupported op type: " << op_type;
       return MACE_OP_INVALID;
     }
   }
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index 3b2eb70bc78d26e586fba777945e4a674250df10..bd98184362fe10286c23e8172cadf53f87270c84 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -22,7 +22,10 @@
 #include
 #include
 
+// We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelize it
+// using OpenMP for MACE's quantized depthwise_conv2d.
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+
 #include "mace/core/future.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/kernels/activation.h"
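
Note: the new comment in depthwise_conv2d.h states the approach for quantized depthwise convolution: keep TensorFlow Lite's single-threaded depthwiseconv_uint8 kernel and spread the work across cores with OpenMP. The snippet below is only a minimal, self-contained sketch of that parallelization pattern, written against an assumed naive float kernel over NHWC data; it is not the TFLite kernel and none of its names come from MACE or TFLite. The point is that every output row is independent, so a plain `#pragma omp parallel for` over rows needs no synchronization.

    // Sketch only: naive "valid"-padding depthwise conv, parallelized per
    // output row with OpenMP (compile with -fopenmp; without it the pragma
    // is simply ignored).
    #include <cstdio>
    #include <vector>

    int main() {
      const int H = 8, W = 8, C = 4, K = 3;      // input height/width/channels, kernel
      const int OH = H - K + 1, OW = W - K + 1;  // output size, no padding
      std::vector<float> input(H * W * C, 1.0f);
      std::vector<float> filter(K * K * C, 0.5f);
      std::vector<float> output(OH * OW * C, 0.0f);

    #pragma omp parallel for
      for (int oy = 0; oy < OH; ++oy) {          // output rows are independent
        for (int ox = 0; ox < OW; ++ox) {
          for (int c = 0; c < C; ++c) {
            float acc = 0.0f;
            for (int ky = 0; ky < K; ++ky) {
              for (int kx = 0; kx < K; ++kx) {
                acc += input[((oy + ky) * W + ox + kx) * C + c] *
                       filter[(ky * K + kx) * C + c];
              }
            }
            output[(oy * OW + ox) * C + c] = acc;
          }
        }
      }
      std::printf("output[0] = %.1f\n", output[0]);  // 9 taps * 1.0 * 0.5 = 4.5
      return 0;
    }
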
"); + MACE_CHECK(input0->size() == input1->size(), + "input0 and input1 must have the same shape."); + MACE_CHECK(output->scale() != 0); + MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); + + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data(); + auto input1_ptr = input1->data(); + auto output_ptr = output->mutable_data(); + + index_t handled_output_size = 0; +#ifdef MACE_ENABLE_NEON +#pragma omp parallel for + for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); + auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); + sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); + sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); + sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); + sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); + const auto sum_low_s16 = vmovn_s32(sum_low); + const auto sum_high_s16 = vmovn_s32(sum_high); + const auto output_val = 
diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc
index b3d46025133bb9617d436bca2d02e8653323635a..2e82fb70bd03c966d579cb6ce3fdeca25a93b755 100644
--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -28,6 +28,11 @@ void Register_Eltwise(OperatorRegistryBase *op_registry) {
                                           .TypeConstraint<float>("T")
                                           .Build(),
                          EltwiseOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
+                                          .Device(DeviceType::CPU)
+                                          .TypeConstraint<uint8_t>("T")
+                                          .Build(),
+                         EltwiseOp<DeviceType::CPU, uint8_t>);
 
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index 76b04f3423a31fd344edb4cadee02857dcc4a71a..0fd1fc8dd573495d34f53f1826b500c0c8f9c452 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -559,6 +559,86 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
     ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
   }
 }
+
+void QuantizedSum(const std::vector<index_t> &shape) {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input0", shape, true, true);
+  net.AddRandomInput<DeviceType::CPU, float>("Input1", shape, true, true);
+
+  net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
+                                                  NCHW);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
+                                                  NCHW);
+
+  OpDefBuilder("Eltwise", "EltwiseTest")
+      .Input("TInput0")
+      .Input("TInput1")
+      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("data_format", DataFormat::NCHW)
+      .Output("TOutput")
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(DeviceType::CPU);
+  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+                                                  NHWC);
+
+  OpDefBuilder("Quantize", "QuantizeInput0")
+      .Input("Input0")
+      .Output("QuantizedInput0")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeInput1")
+      .Input("Input1")
+      .Output("QuantizedInput1")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeOutput")
+      .Input("Output")
+      .Output("ExpectedQuantizedOutput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Eltwise", "QuantizeEltwiseTest")
+      .Input("QuantizedInput0")
+      .Input("QuantizedInput1")
+      .Output("QuantizedOutput")
+      .AddIntArg("type", static_cast<int>(kernels::EltwiseType::SUM))
+      .AddIntArg("T", static_cast<int>(DT_UINT8))
+      .Finalize(net.NewOperatorDef());
+  net.Setup(DeviceType::CPU);
+  Tensor *eq_output = net.GetTensor("ExpectedQuantizedOutput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  net.Run();
+
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
 }  // namespace
 
 TEST_F(EltwiseOpTest, RandomTensorScalarFloat) {
@@ -694,6 +774,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcast) {
                        {1, 1, 2, 1}, {1, 2}, {1, 1, 2, 3}, {1, 0, 0, 0, 0, 0});
 }
 
+TEST_F(EltwiseOpTest, QuantizedSum) {
+  QuantizedSum({1, 32, 32, 16});
+  QuantizedSum({1, 31, 31, 17});
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index aa704d0e58f07a017b7f35de6b1bdc128cc6332c..7587f64738fb26ed47d8b45e17218b241ec36a70 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -207,6 +207,7 @@ class TransformerRule(Enum):
     TRANSFORM_BASIC_LSTMCELL = 27
     TRANSFORM_FAKE_QUANTIZE = 28
     CHECK_QUANTIZE_INFO = 29
+    REARRANGE_BATCH_TO_SPACE = 30
 
 
 class ConverterInterface(object):
@@ -364,6 +365,7 @@ class ConverterOption(object):
             TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
             TransformerRule.TRANSFORM_GPU_WINOGRAD,
             TransformerRule.TRANSFORM_ADD_TO_BIASADD,
+            TransformerRule.REARRANGE_BATCH_TO_SPACE,
            TransformerRule.FOLD_BIASADD,
             TransformerRule.FLATTEN_ATROUS_CONV,
             TransformerRule.FOLD_ACTIVATION,
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 5fbab749ddcf0f163bad9f42ee3987c989712ce8..b591ab2bc0bfdcec6b98fda45205c25adb4c3e97 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -68,6 +68,8 @@ class Transformer(base_converter.ConverterInterface):
                 self.transform_gpu_winograd,  # data_format related
             TransformerRule.TRANSFORM_ADD_TO_BIASADD:
                 self.transform_add_to_biasadd,
+            TransformerRule.REARRANGE_BATCH_TO_SPACE:
+                self.rearrange_batch_to_space,
             TransformerRule.FOLD_BIASADD: self.fold_biasadd,
             TransformerRule.FLATTEN_ATROUS_CONV: self.flatten_atrous_conv,
             TransformerRule.FOLD_ACTIVATION: self.fold_activation,
@@ -793,6 +795,9 @@ class Transformer(base_converter.ConverterInterface):
             if len(replace_op.quantize_info) > 0:
                 del op.quantize_info[:]
                 op.quantize_info.extend(replace_op.quantize_info)
+                for i in range(len(op.quantize_info)):
+                    self._quantize_activation_info[op.output[i]] = \
+                        op.quantize_info[i]
 
     def fold_biasadd(self):
         net = self._model
@@ -1741,6 +1746,56 @@ class Transformer(base_converter.ConverterInterface):
 
         return False
 
+    def rearrange_batch_to_space(self):
+        if not self._option.quantize:
+            return False
+
+        # Put b2s after biasadd and relu
+        for conv_op in self._model.op:
+            if conv_op.type in [MaceOp.Conv2D.name,
+                                MaceOp.DepthwiseConv2d.name] \
+                    and self.consumer_count(conv_op.output[0]) == 1:
+                b2s_op = self._consumers[conv_op.output[0]][0]
+                if b2s_op.type == MaceOp.BatchToSpaceND.name \
+                        and self.consumer_count(b2s_op.output[0]) == 1:
+                    biasadd_or_act_op = self._consumers[b2s_op.output[0]][0]
+                    if biasadd_or_act_op.type == MaceOp.BiasAdd.name:
+                        biasadd_op = biasadd_or_act_op
+                        if self.consumer_count(biasadd_op.output[0]) == 1 \
+                                and self._consumers[biasadd_op.output[0]][0].type == MaceOp.Activation.name:  # noqa
+                            act_op = self._consumers[biasadd_op.output[0]][0]
+                            biasadd_op.input[0] = conv_op.output[0]
+                            b2s_op.input[0] = act_op.output[0]
+                            for op in self._consumers[act_op.output[0]]:
+                                self.replace(op.input,
+                                             act_op.output[0],
+                                             b2s_op.output[0])
+                        else:
+                            biasadd_op.input[0] = conv_op.output[0]
+                            b2s_op.input[0] = biasadd_op.output[0]
+                            for op in self._consumers[biasadd_op.output[0]]:
+                                self.replace(op.input,
+                                             biasadd_op.output[0],
+                                             b2s_op.output[0])
+
+                        print("Rearrange batch to space: %s(%s)"
+                              % (b2s_op.name, b2s_op.type))
+                        return True
+                    elif biasadd_or_act_op.type == MaceOp.Activation.name:
+                        act_op = biasadd_or_act_op
+                        act_op.input[0] = conv_op.output[0]
+                        b2s_op.input[0] = act_op.output[0]
+                        for op in self._consumers[act_op.output[0]]:
+                            self.replace(op.input,
+                                         act_op.output[0],
+                                         b2s_op.output[0])
+
+                        print("Rearrange batch to space: %s(%s)"
+                              % (b2s_op.name, b2s_op.type))
+                        return True
+
+        return False
+
     def add_quantize_tensor_range(self):
         if not self._option.quantize:
             return False
@@ -1796,6 +1851,26 @@ class Transformer(base_converter.ConverterInterface):
                 quantize_info = \
                     self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
+            elif (op.type == MaceOp.Eltwise.name
+                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
+                  and not op.quantize_info
+                  and len(op.input) == 2
+                  and op.input[0] not in self._consts
+                  and op.input[1] not in self._consts):
+                del op.quantize_info[:]
+                producer_op0 = self._producer[op.input[0]]
+                producer_op1 = self._producer[op.input[1]]
+                quantize_info = op.quantize_info.add()
+                quantize_info.minval = producer_op0.quantize_info[0].minval \
+                    + producer_op1.quantize_info[0].minval
+                quantize_info.maxval = producer_op0.quantize_info[0].maxval \
+                    + producer_op1.quantize_info[0].maxval
+                scale, zero = quantize_util.adjust_range(quantize_info.minval,
+                                                         quantize_info.maxval,
+                                                         non_zero=False)
+                quantize_info.scale = scale
+                quantize_info.zero_point = zero
+                self._quantize_activation_info[op.output[0]] = quantize_info
 
         print ("Add default quantize info for input")
         for input_node in self._option.input_nodes.values():
@@ -1818,6 +1893,7 @@ class Transformer(base_converter.ConverterInterface):
         if not self._option.quantize:
             return False
 
+        print("Check quantize info")
         for op in self._model.op:
             if (op.name.find(MaceKeyword.mace_input_node_name) == -1
                     and op.name.find(MaceKeyword.mace_output_node_name) == -1
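
Note: the add_quantize_tensor_range change above gives a quantized Eltwise SUM its output range by adding the producers' minima and maxima, then derives scale/zero_point from that range. The sketch below shows that derivation with the usual asymmetric uint8 mapping; the exact nudging done by quantize_util.adjust_range (for example how it forces zero to be exactly representable) is an assumption here, not a copy of MACE's implementation.

    // Sketch: propagate the output range of SUM and map it onto uint8.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      // Ranges of the two producers (e.g. two conv outputs).
      const float min0 = -1.0f, max0 = 3.0f;
      const float min1 = -2.0f, max1 = 2.0f;

      // SUM output range is the sum of the input ranges.
      float out_min = min0 + min1;  // -3
      float out_max = max0 + max1;  //  5

      // Make sure zero is inside the range, then map it onto [0, 255].
      out_min = std::min(out_min, 0.0f);
      out_max = std::max(out_max, 0.0f);
      const float scale = (out_max - out_min) / 255.0f;
      const int zero_point = static_cast<int>(std::lround(-out_min / scale));

      std::printf("scale=%f zero_point=%d\n", scale, zero_point);  // ~0.0314, 96
      return 0;
    }
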