From 68cfff5fbdd7cc798e23cdf942fc0b7eb125ba90 Mon Sep 17 00:00:00 2001
From: Bin Li
Date: Wed, 27 Nov 2019 16:05:14 +0800
Subject: [PATCH] Support Hexagon TransposeConv2d and multiple float inputs/outputs for HTA

---
 .../runtime/hexagon/hexagon_hta_wrapper.cc  | 128 +++++++++++-------
 mace/libmace/mace.cc                        |   4 -
 tools/device.py                             |   2 +-
 tools/python/transform/hexagon_converter.py |  93 +++++++++----
 tools/python/transform/transformer.py       |  13 ++
 tools/sh_commands.py                        |  12 +-
 6 files changed, 168 insertions(+), 84 deletions(-)

diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
index 07a7a5e9..069eab30 100644
--- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc
@@ -31,6 +31,39 @@
 
 namespace mace {
 
+namespace {
+struct InputOutputMetadata {
+  void Init(float min_val, float max_val, int needs_quantization) {
+    this->min_val = min_val;
+    this->max_val = max_val;
+    this->needs_quantization = needs_quantization;
+  }
+  float min_val;
+  float max_val;
+  int needs_quantization;
+};
+
+template <typename T>
+void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
+  tensor->batches = 1;
+  tensor->height = 1;
+  tensor->width = 1;
+  tensor->depth = 1;
+  tensor->data = const_cast<unsigned char *>(
+      reinterpret_cast<const unsigned char *>(&data));
+  tensor->dataLen = sizeof(data);
+  tensor->data_valid_len = sizeof(data);
+  tensor->unused = 0;
+}
+
+template <typename T>
+void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
+  tensor->data = const_cast<unsigned char *>(
+      reinterpret_cast<const unsigned char *>(&data));
+  tensor->dataLen = sizeof(data);
+}
+}  // namespace
+
 HexagonHTAWrapper::HexagonHTAWrapper(Device *device)
     : quantize_util_(&device->cpu_runtime()->thread_pool()) {
 }
@@ -227,86 +260,81 @@ bool HexagonHTAWrapper::ExecuteGraphNew(
     const std::map<std::string, Tensor *> &input_tensors,
     std::map<std::string, Tensor *> *output_tensors) {
   VLOG(2) << "Execute graph new: " << nn_id_;
-  uint32_t num_inputs = static_cast<uint32_t>(input_tensors.size());
-  uint32_t num_outputs = static_cast<uint32_t>(output_tensors->size());
+  auto num_inputs = static_cast<uint32_t>(input_tensors.size());
+  auto num_outputs = static_cast<uint32_t>(output_tensors->size());
   MACE_CHECK(num_inputs_ == static_cast<int>(num_inputs), "Wrong inputs num");
   MACE_CHECK(num_outputs_ == static_cast<int>(num_outputs),
              "Wrong outputs num");
 
-  std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs);
-  std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs);
+  std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs * kNumMetaData);
+  std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs * kNumMetaData);
+  std::vector<InputOutputMetadata> input_metadata(num_inputs);
+  std::vector<InputOutputMetadata> output_metadata(num_outputs);
 
   for (size_t i = 0; i < num_inputs; ++i) {
     const auto input_tensor = input_tensors.at(input_info_[i].name);
     const auto &input_shape = input_tensor->shape();
-    inputs[i].batches = static_cast<uint32_t>(input_shape[0]);
-    inputs[i].height = static_cast<uint32_t>(input_shape[1]);
-    inputs[i].width = static_cast<uint32_t>(input_shape[2]);
-    inputs[i].depth = static_cast<uint32_t>(input_shape[3]);
-    input_info_[i].tensor_u8->SetDtype(DT_UINT8);
-    input_info_[i].tensor_u8->Resize(input_shape);
-
-    const float *input_data = input_tensor->data<float>();
-    uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data<uint8_t>();
-    quantize_util_.QuantizeWithScaleAndZeropoint(input_data,
-                                                 input_tensor->size(),
-                                                 input_info_[i].scale,
-                                                 input_info_[i].zero_point,
-                                                 input_data_u8);
-
-    inputs[i].data = const_cast<unsigned char *>(
-        reinterpret_cast<const unsigned char *>(
-            input_info_[i].tensor_u8->raw_data()));
-    inputs[i].dataLen = static_cast<int>(input_info_[i].tensor_u8->raw_size());
-    inputs[i].data_valid_len = static_cast<uint32_t>(
-        input_info_[i].tensor_u8->raw_size());
-    inputs[i].unused = 0;
+    size_t index = i * kNumMetaData;
+    inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
+    inputs[index].height = static_cast<uint32_t>(input_shape[1]);
+    inputs[index].width = static_cast<uint32_t>(input_shape[2]);
+    inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
+    inputs[index].data = const_cast<unsigned char *>(
+        reinterpret_cast<const unsigned char *>(input_tensor->raw_data()));
+    inputs[index].dataLen = static_cast<int>(input_tensor->raw_size());
+    inputs[index].data_valid_len =
+        static_cast<uint32_t>(input_tensor->raw_size());
+    inputs[index].unused = 0;
+    input_metadata[i].Init(.0f, .0f, 1);
+    AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
+    AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
+    AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
   }
 
+  // transform mace output to hexagon output
   for (size_t i = 0; i < num_outputs; ++i) {
     auto output_tensor = output_tensors->at(output_info_[i].name);
+    size_t index = i * kNumMetaData;
     output_tensor->SetDtype(output_info_[i].data_type);
     output_tensor->Resize(output_info_[i].shape);
-    output_info_[i].tensor_u8->SetDtype(DT_UINT8);
-    output_info_[i].tensor_u8->Resize(output_info_[i].shape);
-    outputs[i].data = reinterpret_cast<unsigned char *>(
-        output_info_[i].tensor_u8->raw_mutable_data());
-    outputs[i].dataLen =
-        static_cast<int>(output_info_[i].tensor_u8->raw_size());
+
+    outputs[index].data = reinterpret_cast<unsigned char *>(
+        output_tensor->raw_mutable_data());
+    outputs[index].dataLen = static_cast<int>(output_tensor->raw_size());
+    output_metadata[i].Init(.0f, .0f, 1);
+
+    AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
+    AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
+    AddOutputMetadata(output_metadata[i].needs_quantization,
+                      &outputs[index + 3]);
   }
 
   int res = hexagon_hta_nn_execute_new(nn_id_,
                                        inputs.data(),
-                                       num_inputs,
+                                       num_inputs * kNumMetaData,
                                        outputs.data(),
-                                       num_outputs);
+                                       num_outputs * kNumMetaData);
+  MACE_CHECK(res == 0, "execute error");
 
   for (size_t i = 0; i < num_outputs; ++i) {
+    size_t index = i * kNumMetaData;
     std::vector<uint32_t> output_shape{
-        outputs[i].batches, outputs[i].height, outputs[i].width,
-        outputs[i].depth};
+        outputs[index].batches, outputs[index].height, outputs[index].width,
+        outputs[index].depth};
     MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
                output_shape.size(), " vs ", output_info_[i].shape.size(),
-               "wrong output shape inferred");
+               " wrong output shape inferred");
     for (size_t j = 0; j < output_shape.size(); ++j) {
       MACE_CHECK(static_cast<index_t>(output_shape[j])
                      == output_info_[i].shape[j],
                  output_shape[j], " vs ", output_info_[i].shape[j],
-                 "wrong output shape inferred");
+                 " wrong output shape[", j, "] inferred");
     }
     auto output_tensor = output_tensors->at(output_info_[i].name);
-    MACE_CHECK(static_cast<index_t>(outputs[i].data_valid_len)
-                   == output_tensor->size(),
-               outputs[i].data_valid_len, " vs ", output_tensor->size(),
-               " wrong output size inferred.");
-
-    const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data<uint8_t>();
-    float *output_data = output_tensor->mutable_data<float>();
-    quantize_util_.Dequantize(output_data_u8,
-                              output_info_[i].tensor_u8->size(),
-                              output_info_[i].scale,
-                              output_info_[i].zero_point,
-                              output_data);
+    MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
+                   == output_tensor->raw_size(),
+               outputs[index].data_valid_len, " vs ", output_tensor->raw_size(),
+               " wrong output bytes inferred.");
   }
 
   return res == 0;
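Note on the new calling convention: each logical input or output now occupies kNumMetaData consecutive hexagon_hta_nn_tensordef slots (the float buffer itself, then three scalar tensordefs for min_val, max_val and needs_quantization), which is why hexagon_hta_nn_execute_new now receives num_inputs * kNumMetaData and num_outputs * kNumMetaData. A minimal host-side sketch of that layout, assuming kNumMetaData == 4 and reading Init(.0f, .0f, 1) as "range unknown, let the HTA stack handle quantization"; plain Python dicts stand in for the C struct:

    import struct

    def scalar_tensordef(value, fmt):
        # A 1x1x1x1 tensordef wrapping a single scalar
        # (min_val / max_val / needs_quantization).
        data = struct.pack(fmt, value)
        return {'batches': 1, 'height': 1, 'width': 1, 'depth': 1,
                'data': data, 'dataLen': len(data),
                'data_valid_len': len(data)}

    def expand_with_metadata(raw_bytes, shape):
        # One logical NHWC tensor -> kNumMetaData (assumed 4) tensordefs:
        # data, min_val, max_val, needs_quantization.
        batches, height, width, depth = shape
        data_def = {'batches': batches, 'height': height, 'width': width,
                    'depth': depth, 'data': raw_bytes,
                    'dataLen': len(raw_bytes),
                    'data_valid_len': len(raw_bytes)}
        return [data_def,
                scalar_tensordef(0.0, '<f'),  # min_val
                scalar_tensordef(0.0, '<f'),  # max_val
                scalar_tensordef(1, '<i')]    # needs_quantization

The flat array passed to the driver is the concatenation of these groups, so input i's data tensordef sits at index i * 4, which is exactly the indexing ExecuteGraphNew uses above.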
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 98ddf484..3ab4b13e 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -878,10 +878,6 @@ MaceStatus MaceEngine::Impl::Run(
   }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   if (device_type_ == HEXAGON || device_type_ == HTA) {
-    if (device_type_ == HTA) {
-      MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
-                 "HTA not support multiple inputs and outputs yet.");
-    }
     hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
   } else {
 #endif
diff --git a/tools/device.py b/tools/device.py
index 2952177f..c7e95553 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -1006,7 +1006,7 @@ class DeviceManager:
             YAMLKeyword.target_abis: [ABIType.host],
             YAMLKeyword.target_socs: '',
             YAMLKeyword.system: SystemType.host,
-            YAMLKeyword.address: None,
+            YAMLKeyword.address: SystemType.host,
         }
         devices_list.append(host)
diff --git a/tools/python/transform/hexagon_converter.py b/tools/python/transform/hexagon_converter.py
index d8fcc1f7..ccdefdce 100644
--- a/tools/python/transform/hexagon_converter.py
+++ b/tools/python/transform/hexagon_converter.py
@@ -51,6 +51,7 @@ HexagonSupportedOps = [
     'QuantizedResizeBilinear_8',
     'QuantizedSoftmax_8',
     'QuantizedSub_8p8to8',
+    'QuantizedTransposeConv2d_8x8p32to8',
     'QuantizeINPUT_f_to_8',
     'SpaceToBatchND_8',
     'SpaceToDepth_8',
@@ -96,6 +97,7 @@ class HexagonConverter(base_converter.ConverterInterface):
             MaceOp.BatchToSpaceND.name: self.convert_batchspace,
             MaceOp.Concat.name: self.convert_concat,
             MaceOp.Conv2D.name: self.convert_conv2d,
+            MaceOp.Deconv2D.name: self.convert_deconv2d,
             MaceOp.DepthToSpace.name: self.convert_depthspace,
             MaceOp.DepthwiseConv2d.name: self.convert_conv2d,
             MaceOp.Dequantize.name: self.convert_dequantize,
@@ -110,11 +112,6 @@ class HexagonConverter(base_converter.ConverterInterface):
         }
 
     def run(self):
-        if self._option.device == DeviceType.HTA.value:
-            mace_check(len(self._option.input_nodes) == 1
-                       and len(self._option.output_nodes) == 1,
-                       'hta only support single input and output')
-
         for tensor in self._model.tensors:
             self._consts[tensor.name] = tensor
 
@@ -136,7 +133,7 @@
                 self._quantize_activation_info[tensors[i]] = \
                     self._quantize_activation_info[node_name]
 
-    def add_const_node(self, name, val):
+    def add_scalar_const_node(self, name, val):
         if name not in self._consts:
             tensor = self._model.tensors.add()
             self._consts[name] = tensor
@@ -180,14 +177,14 @@
                 min_tensor_name = op + ':1'
             else:
                 min_tensor_name = op + '_min:0'
-            self.add_const_node(min_tensor_name, minval)
+            self.add_scalar_const_node(min_tensor_name, minval)
             this_op.input.extend([min_tensor_name])
         if add_max:
             if is_activation and diff_port:
                 max_tensor_name = op + ':2'
             else:
                 max_tensor_name = op + '_max:0'
-            self.add_const_node(max_tensor_name, maxval)
+            self.add_scalar_const_node(max_tensor_name, maxval)
             this_op.input.extend([max_tensor_name])
 
     def add_constant_min_max_for_first_op(self, op):
@@ -196,8 +193,8 @@
         input_op, _ = get_op_and_port_from_tensor(op.input[0])
         input_min = input_op + '_min:0'
         input_max = input_op + '_max:0'
-        self.add_const_node(input_min, minval)
-        self.add_const_node(input_max, maxval)
+        self.add_scalar_const_node(input_min, minval)
+        self.add_scalar_const_node(input_max, maxval)
         for i in range(len(op.input)):
             if op.input[i] == input_op + ':1':
                 op.input[i] = input_min
@@ -265,20 +262,6 @@
             else:
                 index += 1
 
-        if self._option.device == DeviceType.HTA.value:
-            # replace QuantizeINPUT_f_to_8 with INPUT
-            quantize_input_op.type = HexagonOp.INPUT.name
-            del quantize_input_op.output_shape[1:]
-            del quantize_input_op.output_type[1:]
-            del quantize_input_op.out_max_byte_size[1:]
-
-            # replace first op's input min max with constant
-            self.add_constant_min_max_for_first_op(self._model.op[1])
-
-            # replace DequantizeOUTPUT_8tof with OUTPUT
-            dequantize_output_op.type = HexagonOp.OUTPUT.name
-            del dequantize_output_op.input[1:]
-
         return quantize_input_op.output
 
     def add_node_id(self, model_inputs):
@@ -421,6 +404,68 @@
         else:
             op.type = HexagonOp.Supernode_8x8p32to8.name
 
+    def add_deconv_pad_node(self, op):
+        padding_type_arg = \
+            ConverterUtil.get_arg(op, MaceKeyword.mace_padding_type_str)
+        mace_check(padding_type_arg is not None, "Missing padding of Deconv.")
+        padding_type = PaddingMode(padding_type_arg.i)
+        filter_tensor = self._consts[op.input[1]]
+        filter_height = filter_tensor.dims[1]
+        filter_width = filter_tensor.dims[2]
+
+        if padding_type == PaddingMode.VALID:
+            paddings = [0, 0, 0, 0]
+        elif padding_type == PaddingMode.SAME:
+            pad_height, pad_width = filter_height // 2, filter_width // 2
+            paddings = [pad_height, pad_height, pad_width, pad_width]
+        else:
+            raise Exception('Hexagon deconv does not support padding type: ',
+                            padding_type)
+
+        padding_tensor = self._model.tensors.add()
+        padding_tensor.name = op.name + "/paddings:0"
+        padding_tensor.data_type = mace_pb2.DT_INT32
+        padding_tensor.dims.extend([1, 1, 2, 2])
+        padding_tensor.int32_data.extend(paddings)
+
+        self._consts[padding_tensor.name] = padding_tensor
+        op.input.append(padding_tensor.name)
+
+    def convert_deconv2d(self, op):
+        channels = op.output_shape[0].dims[3]
+        if len(op.input) < 4:
+            print('Hexagon deconv requires biasadd, we add it.')
+            bias_data = np.zeros(channels, dtype=int)
+            bias_tensor = self._model.tensors.add()
+            bias_tensor.data_type = mace_pb2.DT_INT32
+            bias_tensor.dims.extend([channels])
+            bias_tensor.int32_data.extend(bias_data)
+            bias_tensor.minval = 0
+            bias_tensor.maxval = 0
+            bias_tensor.name = op.name + "/bias:0"
+            bias = bias_tensor.name
+            self._consts[bias] = bias_tensor
+        else:
+            bias = op.input.pop()
+        op.input.pop()  # output shape
+
+        self.add_min_max_const_node(op, op.input[0])
+        self.add_min_max_const_node(op, op.input[1])
+
+        self.add_deconv_pad_node(op)
+
+        strides_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str)
+        mace_check(strides_arg is not None, "Missing strides of Deconv.")
+        self.add_arg_const_node(
+            op, '/strides:0', [1, strides_arg.ints[0], strides_arg.ints[1], 1])
+
+        op.input.append(bias)
+        self.add_min_max_const_node(op, bias)
+        self.add_min_max_const_node(
+            op, op.output[0], True, True, False)
+
+        op.type = HexagonOp.QuantizedTransposeConv2d_8x8p32to8.name
+
     def convert_depthspace(self, op):
         size_arg = ConverterUtil.get_arg(
             op, MaceKeyword.mace_space_depth_block_size_str)
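For reference, convert_deconv2d leaves the QuantizedTransposeConv2d_8x8p32to8 node with thirteen inputs. The list below is a sketch reconstructing that order from the append calls above; the names are illustrative labels, not tensor names the converter emits:

    # Input layout of the converted deconv op, in append order:
    DECONV_INPUT_LAYOUT = [
        'input',                     # op.input[0]
        'filter',                    # op.input[1], transposed to OHWI (below)
        'input_min', 'input_max',    # add_min_max_const_node(op, op.input[0])
        'filter_min', 'filter_max',  # add_min_max_const_node(op, op.input[1])
        'paddings',                  # add_deconv_pad_node, shape [1, 1, 2, 2]
        'strides',                   # [1, stride_h, stride_w, 1]
        'bias',                      # existing bias, or the zero-filled int32
        'bias_min', 'bias_max',      # add_min_max_const_node(op, bias)
        'output_min', 'output_max',  # diff_port=False
    ]
    assert len(DECONV_INPUT_LAYOUT) == 13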
diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py
index 1c69a07e..2b67b1c9 100644
--- a/tools/python/transform/transformer.py
+++ b/tools/python/transform/transformer.py
@@ -1130,6 +1130,19 @@ class Transformer(base_converter.ConverterInterface):
         elif self._option.quantize and \
                 (self._option.device == DeviceType.HEXAGON.value or
                  self._option.device == DeviceType.HTA.value):
+            for op in net.op:
+                # from HWOI to OHWI, deconv is unique
+                if op.type == MaceOp.Deconv2D.name \
+                        and op.input[1] in self._consts \
+                        and op.input[1] not in transposed_deconv_filter:
+                    filter = self._consts[op.input[1]]
+                    filter_data = np.array(filter.float_data).reshape(
+                        filter.dims)
+                    filter_data = filter_data.transpose(2, 0, 1, 3)
+                    filter.float_data[:] = filter_data.flat
+                    filter.dims[:] = filter_data.shape
+                    transposed_deconv_filter.add(op.input[1])
+
             print("Transpose filters to HWIO/HWIM")
             mace_check(filter_format == DataFormat.HWIO,
                        "HEXAGON only support HWIO/HWIM filter format.")
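The new Deconv2D branch above reorders deconv filters from HWOI to OHWI via transpose(2, 0, 1, 3), while regular conv filters keep the HWIO path. A quick NumPy check of that axis mapping, with arbitrary example dims:

    import numpy as np

    hwoi = np.zeros((3, 5, 8, 4), dtype=np.float32)  # H=3, W=5, O=8, I=4
    ohwi = hwoi.transpose(2, 0, 1, 3)                # move axis 2 (O) to front
    assert ohwi.shape == (8, 3, 5, 4)                # O, H, W, I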
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index c713b0c2..95d93c5d 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -70,7 +70,8 @@ def device_lock_path(serialno):
 
 def device_lock(serialno, timeout=7200):
     import filelock
-    return filelock.FileLock(device_lock_path(serialno.replace("/", "")), timeout=timeout)
+    return filelock.FileLock(device_lock_path(serialno.replace("/", "")),
+                             timeout=timeout)
 
 
 def is_device_locked(serialno):
@@ -100,8 +101,8 @@ def stdout_success(stdout):
 def choose_a_random_device(target_devices, target_abi):
     eligible_devices = [dev for dev in target_devices
                         if target_abi in dev[common.YAMLKeyword.target_abis]]
-    unlocked_devices = \
-        [dev for dev in eligible_devices if not is_device_locked(dev)]
+    unlocked_devices = [dev for dev in eligible_devices if
+                        not is_device_locked(dev[common.YAMLKeyword.address])]
     if len(unlocked_devices) > 0:
         chosen_devices = [random.choice(unlocked_devices)]
     else:
@@ -607,11 +608,12 @@ def push_depended_so_libs(libmace_dynamic_library_path,
     for dep in split_stdout(dep_so_libs):
         if dep == "libgnustl_shared.so":
             src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
-                       "%s/libgnustl_shared.so"\
+                       "%s/libgnustl_shared.so" \
                        % (os.environ["ANDROID_NDK_HOME"], abi)
         elif dep == "libc++_shared.so":
             src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
-                       "%s/libc++_shared.so" % (os.environ["ANDROID_NDK_HOME"], abi)
+                       "%s/libc++_shared.so"\
+                       % (os.environ["ANDROID_NDK_HOME"], abi)
 
     print("push %s to %s" % (src_file, phone_data_dir))
     adb_push(src_file, phone_data_dir, serialno)
-- 
GitLab