diff --git a/mace/core/runtime/apu/apu_wrapper.cc b/mace/core/runtime/apu/apu_wrapper.cc
index 6feac5c699cab4503904037833b925a5ddb0f545..d4e2ba01dd4f9f42277706b3b8ba86ad7c46a7bf 100644
--- a/mace/core/runtime/apu/apu_wrapper.cc
+++ b/mace/core/runtime/apu/apu_wrapper.cc
@@ -21,7 +21,8 @@ namespace mace {
 
 ApuWrapper::ApuWrapper(Device *device)
-    : quantize_util_(&device->cpu_runtime()->thread_pool()) {
+    : quantize_util_uint8_(&device->cpu_runtime()->thread_pool()),
+      quantize_util_int16_(&device->cpu_runtime()->thread_pool()) {
 }
 
 apu_data_type ApuWrapper::MapToApuDataType(DataType mace_type) {
@@ -270,7 +271,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
              "Wrong outputs num");
   // prepare input
   for (int i = 0 ; i < static_cast<int>(input_tensors.size()) ; i++) {
-    Tensor *tensor = input_tensors.at(input_infos[i].name);
+    Tensor* tensor = input_tensors.at(input_infos[i].name);
 
     // check size
     int element_size = input_infos[i].size;
@@ -279,18 +280,18 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
                "Wrong input size");
     // quantize
     if (input_infos[i].data_type == APU_DATA_TYPE_INT16) {
-      quantize_util_.QuantizeWithScaleAndZeropoint(
+      quantize_util_int16_.QuantizeWithScaleAndZeropoint(
           (const float*)tensor->raw_data(),
           element_size,
           input_infos[i].scale,
           input_infos[i].zero_point,
           reinterpret_cast<int16_t*>(input_infos[i].buf.get()));
     } else if (input_infos[i].data_type == APU_DATA_TYPE_FLOAT) {
-      std::memcpy(input_infos[i].buf.get(),
+      std::memcpy(input_infos[i].buf.get(),
                   (const float*)tensor->raw_data(),
                   element_size * byte_per_element);
     } else {
-      quantize_util_.QuantizeWithScaleAndZeropoint(
+      quantize_util_uint8_.QuantizeWithScaleAndZeropoint(
           (const float*)tensor->raw_data(),
           element_size,
           input_infos[i].scale,
@@ -304,8 +305,8 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
   MACE_CHECK(ret == true, "neuron run model failed");
 
   // process output
-  for (int i = 0; i < static_cast<int>(output_tensors->size()); i++) {
-    Tensor *tensor = output_tensors->at(output_infos[i].name);
+  for (int i = 0 ; i < static_cast<int>(output_tensors->size()) ; i++) {
+    Tensor* tensor = output_tensors->at(output_infos[i].name);
 
     // prepare out buffer
     tensor->SetDtype(DT_FLOAT);
@@ -316,7 +317,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
                "Wrong output size");
     // dequantize
     if (output_infos[i].data_type == APU_DATA_TYPE_INT16) {
-      quantize_util_.Dequantize(
+      quantize_util_int16_.Dequantize(
           reinterpret_cast<int16_t*>(output_infos[i].buf.get()),
           element_size,
           output_infos[i].scale,
@@ -327,7 +328,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
                   output_infos[i].buf.get(),
                   element_size * byte_per_element);
     } else {
-      quantize_util_.Dequantize(
+      quantize_util_uint8_.Dequantize(
           output_infos[i].buf.get(),
           element_size,
           output_infos[i].scale,
@@ -348,19 +349,19 @@ bool ApuWrapper::Uninit() {
 }
 
 int ApuWrapper::GetByteNum(apu_data_type data_type) {
-    int byte_per_element;
-    if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) {
-        byte_per_element = 4;
-    } else if (data_type == APU_DATA_TYPE_HALF ||
-               data_type == APU_DATA_TYPE_INT16) {
-        byte_per_element = 2;
-    } else if (data_type == APU_DATA_TYPE_UINT8) {
-        byte_per_element = 1;
-    } else {
-        byte_per_element = 1;
-        MACE_CHECK(false, "unsupport data type");
-    }
-    return byte_per_element;
+  int byte_per_element;
+  if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) {
+    byte_per_element = 4;
+  } else if (data_type == APU_DATA_TYPE_HALF ||
+             data_type == APU_DATA_TYPE_INT16) {
+    byte_per_element = 2;
+  } else if (data_type == APU_DATA_TYPE_UINT8) {
+    byte_per_element = 1;
+  } else {
+    byte_per_element = 1;
+    MACE_CHECK(false, "unsupport data type");
+  }
+  return byte_per_element;
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/apu/apu_wrapper.h b/mace/core/runtime/apu/apu_wrapper.h
index a18694edd0681f0b9a65a2ceaa922a1e6bf0582c..46d5d32e921fec157011b50d9d2d279dc3c4fac6 100644
--- a/mace/core/runtime/apu/apu_wrapper.h
+++ b/mace/core/runtime/apu/apu_wrapper.h
@@ -59,7 +59,8 @@ class ApuWrapper {
   ApuFrontend *frontend;
   std::vector input_infos;
   std::vector output_infos;
-  QuantizeUtil quantize_util_;
+  QuantizeUtil quantize_util_uint8_;
+  QuantizeUtil quantize_util_int16_;
 };
 
 }  // namespace mace
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index d31f9eb56e1415f3691da8593926263abfb6b846..44e025d09703600e315753bc0e71fd6940da066b 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -304,15 +304,14 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
   return MaceStatus::MACE_SUCCESS;
 }
 
-#ifdef MACE_ENABLE_HEXAGON
 MaceStatus MaceEngineConfig::Impl::SetHexagonToUnsignedPD() {
   bool ret = false;
+#ifdef MACE_ENABLE_HEXAGON
   ret = HexagonDSPWrapper::RequestUnsignedPD();
+#endif
   return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
 }
-#endif
 
-#ifdef MACE_ENABLE_HEXAGON
 MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
     HexagonNNCornerType corner,
     bool dcvs_enable,
@@ -321,12 +320,12 @@ MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
   hexagon_dcvs_enable_ = dcvs_enable;
   hexagon_latency_ = latency;
   bool ret = false;
+#ifdef MACE_ENABLE_HEXAGON
   ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency);
+#endif
   return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
 }
-#endif
 
-#ifdef MACE_ENABLE_APU
 MaceStatus MaceEngineConfig::Impl::SetAPUCache(
     APUCachePolicy policy,
     const std::string &binary_file,
@@ -335,10 +334,11 @@ MaceStatus MaceEngineConfig::Impl::SetAPUCache(
   apu_cache_policy_ = policy;
   apu_binary_file_ = binary_file;
   apu_storage_file_ = storage_file;
+#ifdef MACE_ENABLE_APU
   ret = true;
+#endif
   return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
 }
-#endif
 
 MaceEngineConfig::MaceEngineConfig(
     const DeviceType device_type)
diff --git a/third_party/apu/mt6853/libapu-frontend.so b/third_party/apu/mt6853/libapu-frontend.so
index 2144858555caa0c0926de15a16eeab9ce3aabf46..81568bbf07ac6eb10de10229df13bd23bbf8bb3d 100644
Binary files a/third_party/apu/mt6853/libapu-frontend.so and b/third_party/apu/mt6853/libapu-frontend.so differ
diff --git a/third_party/apu/mt6853/libapu-platform.so b/third_party/apu/mt6853/libapu-platform.so
index 7537371553ec0daf3c97c6277a2ba16a3275b173..47873fae1fd5e7daa32b832decd275bd7fc69677 100644
Binary files a/third_party/apu/mt6853/libapu-platform.so and b/third_party/apu/mt6853/libapu-platform.so differ
diff --git a/third_party/apu/mt6873/libapu-frontend.so b/third_party/apu/mt6873/libapu-frontend.so
index 453f5388c1986bd749fec9d0249dc7c0fbe7e530..aa71b41c9aca644cab5c414be4b555f4c207b9c3 100644
Binary files a/third_party/apu/mt6873/libapu-frontend.so and b/third_party/apu/mt6873/libapu-frontend.so differ
diff --git a/third_party/apu/mt6873/libapu-platform.so b/third_party/apu/mt6873/libapu-platform.so
index af29cee6a9f6554595fd8c9066dbaff12a4fe07c..e9ee3a1771dd0039c020cdd58c7c4f1f2e858cde 100644
Binary files a/third_party/apu/mt6873/libapu-platform.so and b/third_party/apu/mt6873/libapu-platform.so differ
diff --git a/third_party/apu/mt6885/libapu-frontend.so b/third_party/apu/mt6885/libapu-frontend.so
index 453f5388c1986bd749fec9d0249dc7c0fbe7e530..aa71b41c9aca644cab5c414be4b555f4c207b9c3 100644
Binary files a/third_party/apu/mt6885/libapu-frontend.so and b/third_party/apu/mt6885/libapu-frontend.so differ
diff --git a/third_party/apu/mt6885/libapu-platform.so b/third_party/apu/mt6885/libapu-platform.so
index af29cee6a9f6554595fd8c9066dbaff12a4fe07c..e9ee3a1771dd0039c020cdd58c7c4f1f2e858cde 100644
Binary files a/third_party/apu/mt6885/libapu-platform.so and b/third_party/apu/mt6885/libapu-platform.so differ
diff --git a/tools/python/transform/apu_converter.py b/tools/python/transform/apu_converter.py
index faeb0be688010cbbe776f635f4b9545b4444e931..7a9c2068b3e4c0ebe9dc670c18ed4cc265c90376 100644
--- a/tools/python/transform/apu_converter.py
+++ b/tools/python/transform/apu_converter.py
@@ -37,9 +37,12 @@ ApuSupportedOps = [
     'Concat',
     'Conv2D',
     'DepthwiseConv2d',
+    'Deconv2D',
     'Eltwise',
+    'FullyConnected',
     'Pad',
     'Pooling',
+    'PRelu',
     'Reduce',
     'ResizeBilinear',
     'Reshape',
@@ -56,7 +59,9 @@ class ApuOps(object):
             MaceOp.Concat.name: ApuOp.Concat.name,
             MaceOp.Conv2D.name: ApuOp.Conv2D.name,
             MaceOp.DepthwiseConv2d.name: ApuOp.DepthwiseConv2d.name,
+            MaceOp.Deconv2D.name: ApuOp.Deconv2D.name,
             MaceOp.Eltwise.name: ApuOp.Eltwise.name,
+            MaceOp.FullyConnected.name: ApuOp.FullyConnected.name,
             MaceOp.Pad.name: ApuOp.Pad.name,
             MaceOp.Pooling.name: ApuOp.Pooling.name,
             MaceOp.Reduce.name: ApuOp.Reduce.name,
@@ -135,7 +140,8 @@ class ApuConverter(base_converter.ConverterInterface):
             act_mode_arg = ConverterUtil.get_arg(
                 op, MaceKeyword.mace_activation_type_str)
             if act_mode_arg is not None:
-                mace_check(act_mode_arg.s == b'RELU'
+                mace_check(act_mode_arg.s == b'PRELU'
+                           or act_mode_arg.s == b'RELU'
                            or act_mode_arg.s == b'RELUX'
                            or act_mode_arg.s == b'TANH'
                            or act_mode_arg.s == b'SIGMOID',
@@ -179,6 +185,15 @@ class ApuConverter(base_converter.ConverterInterface):
                         multiplier.int32_data.extend([tensor.dims[0]])
                         break
                 op.input.extend([multiplier.name])
+            elif op.type == MaceOp.Deconv2D.name:
+                mace_check(len(op.input) == 4,
+                           op.name + ': apu only support ' + op.type + ' op'
+                           ' with 4 input')
+                self.add_size_tensor_from_arg(
+                    op, MaceKeyword.mace_strides_str)
+                self.add_padding_value_tensor_from_arg(op)
+                self.add_size_tensor_from_list(
+                    op, MaceKeyword.mace_dilations_str, [1, 1])
             elif op.type == MaceOp.Eltwise.name:
                 eltwise_type = ConverterUtil.get_arg(
                     op, MaceKeyword.mace_element_type_str).i
@@ -276,8 +291,8 @@ class ApuConverter(base_converter.ConverterInterface):
                            op.name + ': apu only support squeeze op with 1'
                            ' input')
                 self.add_shape_tensor_from_axis_arg(op)
-
             op.type = self._apu_ops.map_nn_op(op.type)
+        self.change_activation_to_prelu()
 
     def add_op_output_type(self):
         type_map = {}
@@ -371,6 +386,14 @@ class ApuConverter(base_converter.ConverterInterface):
         size_value_tensor.int32_data.extend(size_value_arg.ints)
         op.input.extend([size_value_tensor.name])
 
+    def add_size_tensor_from_list(self, op, keyword, list_value):
+        size_value_tensor = self._model.tensors.add()
+        size_value_tensor.name = op.name + '/' + keyword + ':0'
+        size_value_tensor.data_type = mace_pb2.DT_INT32
+        size_value_tensor.dims.extend([len(list_value)])
+        size_value_tensor.int32_data.extend(list_value)
+        op.input.extend([size_value_tensor.name])
+
     def add_int_tensor_from_arg(self, op, keyword):
         int_value_arg = ConverterUtil.get_arg(op, keyword)
         mace_check(int_value_arg.i is not None,
@@ -420,7 +443,6 @@ class ApuConverter(base_converter.ConverterInterface):
                 op, MaceKeyword.mace_padding_str)
             if padding_type is None:
                 continue
-
             padding_arg = op.arg.add()
             padding_arg.name = MaceKeyword.mace_padding_values_str
             if padding_type.i == PaddingMode.VALID.value:
@@ -431,7 +453,8 @@ class ApuConverter(base_converter.ConverterInterface):
             kernel = []
             dilation = [1, 1]
             if op.type == MaceOp.Conv2D.name or \
-                    op.type == MaceOp.DepthwiseConv2d.name:
+                    op.type == MaceOp.DepthwiseConv2d.name or \
+                    op.type == MaceOp.Deconv2D.name:
                 if ConverterUtil.get_arg(
                         op, MaceKeyword.mace_dilations_str) is not None:
                     dilation = ConverterUtil.get_arg(
@@ -456,22 +479,37 @@ class ApuConverter(base_converter.ConverterInterface):
                 if len(in_size) > 0:
                     break
             out_size = op.output_shape[0].dims[1:3]
-            h = (out_size[0] - 1) * stride[0] \
-                + ((kernel[0] - 1) * dilation[0] + 1) - in_size[0]
-            w = (out_size[1] - 1) * stride[1] \
-                + ((kernel[1] - 1) * dilation[1] + 1) - in_size[1]
+            if(op.type == MaceOp.Deconv2D.name):
+                h = (in_size[0] - 1) * stride[0] + kernel[0] - out_size[0]
+                w = (in_size[1] - 1) * stride[1] + kernel[1] - out_size[1]
+            else:
+                h = (out_size[0] - 1) * stride[0] \
+                    + ((kernel[0] - 1) * dilation[0] + 1) - in_size[0]
+                w = (out_size[1] - 1) * stride[1] \
+                    + ((kernel[1] - 1) * dilation[1] + 1) - in_size[1]
             top = int(np.floor(h/2))
             left = int(np.floor(w/2))
             bottom = h - top
             right = w - left
             padding_arg.ints.extend([top, right, bottom, left])
 
+    def change_activation_to_prelu(self):
+        for op in self._model.op:
+            if op.type == ApuOp.Activation.name and \
+                    ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_activation_type_str).s == b'PRELU':
+                op.type = ApuOp.PRelu.name
+
     def ensure_bias_vector(self):
         for _op in self._model.op:
-            if _op.type != MaceOp.Conv2D.name and \
-                    _op.type != MaceOp.DepthwiseConv2d.name:
-                continue
-            if len(_op.input) != 2:
+            ensure_input = -1
+            if _op.type == MaceOp.Conv2D.name or \
+                    _op.type == MaceOp.DepthwiseConv2d.name or \
+                    _op.type == MaceOp.FullyConnected.name:
+                ensure_input = 3
+            if _op.type == MaceOp.Deconv2D.name:
+                ensure_input = 4
+            if ensure_input == -1 or len(_op.input) != ensure_input - 1:
                 continue
 
             tensor = self._model.tensors.add()
@@ -522,15 +560,14 @@ class ApuConverter(base_converter.ConverterInterface):
             const_tensor.name = _op.name + '/' + \
                 MaceKeyword.mace_scalar_input_str + ':0'
             const_tensor.dims.extend([1])
+            const_tensor.data_type = _op.output_type[0]
             if _op.output_type[0] == mace_pb2.DT_UINT8 or \
                     _op.output_type[0] == mace_pb2.DT_INT16:
-                const_tensor.data_type = _op.output_type[0]
                 const_tensor.scale = scalar
                 const_tensor.zero_point = 0
                 const_tensor.quantized = True
                 const_tensor.int32_data.extend([1])
             elif _op.output_type[0] == mace_pb2.DT_FLOAT:
-                const_tensor.data_type = mace_pb2.DT_FLOAT
                 const_tensor.float_data.extend([scalar])
             _op.input.extend([const_tensor.name])
             ConverterUtil.del_arg(
diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py
index 4fce320aee7e0a0c67935eccb205efb4bd2df220..696a59551701d12274a0f26d0dc0d95c5820ec60 100644
--- a/tools/python/transform/base_converter.py
+++ b/tools/python/transform/base_converter.py
@@ -340,6 +340,8 @@ class TransformerRule(Enum):
     QUANTIZE_LARGE_WEIGHTS = 43
     TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44
     TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV = 45
+    TRANSFORM_MUL_MAX_TO_PRELU = 46
+    TRANSFORM_EXPAND_DIMS_TO_RESHAPE = 47
 
 
 class ConverterInterface(object):
@@ -610,6 +612,8 @@ class ConverterOption(object):
         if self._device == DeviceType.APU.value:
             self._transformer_option = self._transformer_option + [
                 TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV,
+                TransformerRule.TRANSFORM_MUL_MAX_TO_PRELU,
+                TransformerRule.TRANSFORM_EXPAND_DIMS_TO_RESHAPE,
             ]
         if self.quantize_large_weights:
             self._transformer_option = self._transformer_option + [
diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py
index 73f6b66b7eb455d6b82cf7bc72b6e8afdc555b61..8f032acefbc352b94cd488f8653a3a1392ce5e94 100644
--- a/tools/python/transform/transformer.py
+++ b/tools/python/transform/transformer.py
@@ -117,6 +117,10 @@ class Transformer(base_converter.ConverterInterface):
                 self.quantize_large_weights,
             TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV:
                 self.transform_single_bn_to_depthwise_conv,
+            TransformerRule.TRANSFORM_MUL_MAX_TO_PRELU:
+                self.transform_mul_max_to_prelu,
+            TransformerRule.TRANSFORM_EXPAND_DIMS_TO_RESHAPE:
+                self.transform_expand_dims_to_reshape,
         }
 
         self._option = option
@@ -962,17 +966,23 @@ class Transformer(base_converter.ConverterInterface):
                     or op.type == MaceOp.BatchNorm.name) \
                     and len(self._consumers.get(op.output[0], [])) == 1:
                 consumer_op = self._consumers[op.output[0]][0]
+                fold_consumer = False
                 if consumer_op.type == MaceOp.Activation.name:
                     act_type_arg = ConverterUtil.get_arg(
                         consumer_op, MaceKeyword.mace_activation_type_str)
                     act_type = act_type_arg.s.decode()
-                    if act_type == ActivationType.PRELU.name:
-                        continue
+                    if self._option.device == DeviceType.APU.value:
+                        fold_consumer = (act_type in
                                          [ActivationType.RELU.name,
                                           ActivationType.RELUX.name])
+                    else:
+                        fold_consumer = (act_type != ActivationType.PRELU.name)
                     # during quantization, only fold relu/relux
                     if (self._option.quantize_stat or self._option.quantize) \
                             and act_type not in [ActivationType.RELU.name,
                                                  ActivationType.RELUX.name]:
                         continue
+                if fold_consumer:
                     print("Fold activation: %s(%s)" % (op.name, op.type))
                     op.name = consumer_op.name
                     op.output[0] = consumer_op.output[0]
@@ -1032,6 +1042,8 @@ class Transformer(base_converter.ConverterInterface):
         return False
 
     def reshape_fc_weight(self):
+        if self._option.device == DeviceType.APU.value:
+            return
         net = self._model
         filter_format = self.filter_format()
         for op in net.op:
@@ -1348,6 +1360,36 @@ class Transformer(base_converter.ConverterInterface):
                 weight.dims[:] = [1, 1] + list(weight_data.shape)
                 return True
 
+            if self._option.device == DeviceType.APU.value:
+                if op.type == MaceOp.MatMul.name:
+                    transpose_a_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_a_str)  # noqa
+                    transpose_b_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
+                    transpose_a = transpose_a_arg is not None and transpose_a_arg.i == 1  # noqa
+                    transpose_b = transpose_b_arg is not None and transpose_b_arg.i == 1  # noqa
+                    if transpose_a is False and transpose_b is False and \
+                            op.input[1] in self._consts and \
+                            len(self.get_tensor_shape(op.input[0])) == 2 and \
+                            len(self.get_tensor_shape(op.input[1])) == 2:
+                        op.type = MaceOp.FullyConnected.name
+                        del op.arg[:]
+                        rhs = op.input[1]
+                        if rhs in self._consts and \
+                                len(self._consts[rhs].dims) == 2:
+                            arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str)  # noqa
+                            if arg is None:
+                                arg = op.arg.add()
+                                arg.name = MaceKeyword.mace_transpose_b_str
+                                arg.i = 0
+                            if arg.i == 0:
+                                arg.i = 1
+                                filter = self._consts[rhs]
+                                filter_data = np.array(filter.float_data) \
+                                    .reshape(filter.dims)
+                                filter_data = filter_data.transpose(1, 0)
+                                filter.float_data[:] = filter_data.flat
+                                filter.dims[:] = filter_data.shape
+                                six.print_('Transpose matmul weight to shape:',
+                                           filter.dims)
 
         return False
 
     def update_float_op_data_type(self):
@@ -2476,5 +2518,68 @@ class Transformer(base_converter.ConverterInterface):
                     tensor.dims[:] = [1, 1, 1, tensor.dims[0]]
                     break
             return True
+        return False
+
+    def transform_mul_max_to_prelu(self):
+        if self._option.device != DeviceType.APU.value:
+            return False
+        net = self._model
+        for op in net.op:
+            if op.type != MaceOp.Eltwise.name or \
+                    ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                    != EltwiseType.PROD.value or \
+                    op.output[0] not in self._consumers:
+                continue
+            if len(op.input) != 1:
+                continue
+            consumer_op = self._consumers[op.output[0]][0]
+            if consumer_op.type != MaceOp.Eltwise.name or \
+                    ConverterUtil.get_arg(
+                        consumer_op, MaceKeyword.mace_element_type_str).i \
+                    != EltwiseType.MAX.value:
+                continue
+            if op.input[0] not in consumer_op.input:
+                continue
+            float_value_arg = ConverterUtil.get_arg(
+                op, MaceKeyword.mace_scalar_input_str)
+            mace_check(float_value_arg is not None,
+                       op.name + ': ' + MaceKeyword.mace_scalar_input_str
+                       + ' value float should not be None')
+            scalar = float_value_arg.f
+            if scalar < 0:
+                continue
+            if scalar > 1:
+                scalar = 1
+            # Change Mul op to Prelu
+            print("Change mul and max to prelu: %s(%s)" % (op.name, op.type))
+            op.name = consumer_op.name
+            op.output[0] = consumer_op.output[0]
+            alpha_tensor = net.tensors.add()
+            alpha_tensor.name = op.name + '_alpha'
+            alpha_tensor.dims.append(1)
+            alpha_tensor.data_type = mace_pb2.DT_FLOAT
+            alpha_tensor.float_data.extend([scalar])
+            op.input.extend([alpha_tensor.name])
+            ConverterUtil.del_arg(op, MaceKeyword.mace_scalar_input_str)
+            ConverterUtil.del_arg(
+                op, MaceKeyword.mace_scalar_input_index_str)
+            op.type = MaceOp.Activation.name
+            type_arg = op.arg.add()
+            type_arg.name = MaceKeyword.mace_activation_type_str
+            type_arg.s = six.b(ActivationType.PRELU.name)
+            self.replace_quantize_info(op, consumer_op)
+            self.safe_remove_node(consumer_op, op)
+            return True
+        return False
+
+    def transform_expand_dims_to_reshape(self):
+        if self._option.device != DeviceType.APU.value:
+            return False
+        net = self._model
+        for op in net.op:
+            if op.type == MaceOp.ExpandDims.name:
+                op.type = MaceOp.Reshape.name
+                del op.arg[:]
+                return True
         return False