Commit a2f49f02 authored by 卢旭辉

Merge branch 'multiple' into 'master'

Support Hexagon TransposeConv2d and multiple float inputs/outputs for HTA

See merge request deep-computing/mace!1239
......@@ -31,6 +31,39 @@
namespace mace {
namespace {
struct InputOutputMetadata {
void Init(float min_val, float max_val, int needs_quantization) {
this->min_val = min_val;
this->max_val = max_val;
this->needs_quantization = needs_quantization;
}
float min_val;
float max_val;
int needs_quantization;
};
template<typename T>
void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->batches = 1;
tensor->height = 1;
tensor->width = 1;
tensor->depth = 1;
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
tensor->data_valid_len = sizeof(data);
tensor->unused = 0;
}
template<typename T>
void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
}
} // namespace
HexagonHTAWrapper::HexagonHTAWrapper(Device *device)
: quantize_util_(&device->cpu_runtime()->thread_pool()) {
}
......@@ -227,86 +260,81 @@ bool HexagonHTAWrapper::ExecuteGraphNew(
const std::map<std::string, Tensor *> &input_tensors,
std::map<std::string, Tensor *> *output_tensors) {
VLOG(2) << "Execute graph new: " << nn_id_;
uint32_t num_inputs = static_cast<uint32_t>(input_tensors.size());
uint32_t num_outputs = static_cast<uint32_t>(output_tensors->size());
auto num_inputs = static_cast<uint32_t>(input_tensors.size());
auto num_outputs = static_cast<uint32_t>(output_tensors->size());
MACE_CHECK(num_inputs_ == static_cast<int>(num_inputs), "Wrong inputs num");
MACE_CHECK(num_outputs_ == static_cast<int>(num_outputs),
"Wrong outputs num");
std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs);
std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs);
std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs * kNumMetaData);
std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs * kNumMetaData);
std::vector<InputOutputMetadata> input_metadata(num_inputs);
std::vector<InputOutputMetadata> output_metadata(num_outputs);
for (size_t i = 0; i < num_inputs; ++i) {
const auto input_tensor = input_tensors.at(input_info_[i].name);
const auto &input_shape = input_tensor->shape();
inputs[i].batches = static_cast<uint32_t>(input_shape[0]);
inputs[i].height = static_cast<uint32_t>(input_shape[1]);
inputs[i].width = static_cast<uint32_t>(input_shape[2]);
inputs[i].depth = static_cast<uint32_t>(input_shape[3]);
input_info_[i].tensor_u8->SetDtype(DT_UINT8);
input_info_[i].tensor_u8->Resize(input_shape);
const float *input_data = input_tensor->data<float>();
uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data<uint8_t>();
quantize_util_.QuantizeWithScaleAndZeropoint(input_data,
input_tensor->size(),
input_info_[i].scale,
input_info_[i].zero_point,
input_data_u8);
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(
input_info_[i].tensor_u8->raw_data()));
inputs[i].dataLen = static_cast<int>(input_info_[i].tensor_u8->raw_size());
inputs[i].data_valid_len = static_cast<uint32_t>(
input_info_[i].tensor_u8->raw_size());
inputs[i].unused = 0;
size_t index = i * kNumMetaData;
inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
inputs[index].height = static_cast<uint32_t>(input_shape[1]);
inputs[index].width = static_cast<uint32_t>(input_shape[2]);
inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
inputs[index].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensor->raw_data()));
inputs[index].dataLen = static_cast<int>(input_tensor->raw_size());
inputs[index].data_valid_len =
static_cast<uint32_t>(input_tensor->raw_size());
inputs[index].unused = 0;
input_metadata[i].Init(.0f, .0f, 1);
AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
}
// transform mace output to hexagon output
for (size_t i = 0; i < num_outputs; ++i) {
auto output_tensor = output_tensors->at(output_info_[i].name);
size_t index = i * kNumMetaData;
output_tensor->SetDtype(output_info_[i].data_type);
output_tensor->Resize(output_info_[i].shape);
output_info_[i].tensor_u8->SetDtype(DT_UINT8);
output_info_[i].tensor_u8->Resize(output_info_[i].shape);
outputs[i].data = reinterpret_cast<unsigned char *>(
output_info_[i].tensor_u8->raw_mutable_data());
outputs[i].dataLen =
static_cast<int>(output_info_[i].tensor_u8->raw_size());
outputs[index].data = reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data());
outputs[index].dataLen = static_cast<int>(output_tensor->raw_size());
output_metadata[i].Init(.0f, .0f, 1);
AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
AddOutputMetadata(output_metadata[i].needs_quantization,
&outputs[index + 3]);
}
int res = hexagon_hta_nn_execute_new(nn_id_,
inputs.data(),
num_inputs,
num_inputs * kNumMetaData,
outputs.data(),
num_outputs);
num_outputs * kNumMetaData);
MACE_CHECK(res == 0, "execute error");
for (size_t i = 0; i < num_outputs; ++i) {
size_t index = i * kNumMetaData;
std::vector<uint32_t> output_shape{
outputs[i].batches, outputs[i].height, outputs[i].width,
outputs[i].depth};
outputs[index].batches, outputs[index].height, outputs[index].width,
outputs[index].depth};
MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
output_shape.size(), " vs ", output_info_[i].shape.size(),
"wrong output shape inferred");
" wrong output shape inferred");
for (size_t j = 0; j < output_shape.size(); ++j) {
MACE_CHECK(static_cast<index_t>(output_shape[j])
== output_info_[i].shape[j],
output_shape[j], " vs ", output_info_[i].shape[j],
"wrong output shape inferred");
" wrong output shape[", j, "] inferred");
}
auto output_tensor = output_tensors->at(output_info_[i].name);
MACE_CHECK(static_cast<index_t>(outputs[i].data_valid_len)
== output_tensor->size(),
outputs[i].data_valid_len, " vs ", output_tensor->size(),
" wrong output size inferred.");
const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data<uint8_t>();
float *output_data = output_tensor->mutable_data<float>();
quantize_util_.Dequantize(output_data_u8,
output_info_[i].tensor_u8->size(),
output_info_[i].scale,
output_info_[i].zero_point,
output_data);
MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
== output_tensor->raw_size(),
outputs[index].data_valid_len, " vs ", output_tensor->raw_size(),
" wrong output bytes inferred.");
}
return res == 0;
......
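Note on the new input/output layout (not part of the diff): with this change, each model input and output occupies kNumMetaData consecutive hexagon_hta_nn_tensordef entries — the float data tensor followed by scalar min, max, and needs_quantization tensors filled by AddInputMetadata/AddOutputMetadata. A minimal Python sketch of that indexing, assuming kNumMetaData is 4 (inferred from the index + 1..3 offsets above):

K_NUM_META_DATA = 4  # assumed value, inferred from the index, index + 1..3 usage above

def tensordef_layout(num_inputs):
    # Return (slot, role) pairs describing how the inputs vector is filled.
    roles = ["data", "min_val", "max_val", "needs_quantization"]
    layout = []
    for i in range(num_inputs):
        base = i * K_NUM_META_DATA
        layout.extend((base + offset, role) for offset, role in enumerate(roles))
    return layout

# Two inputs -> eight tensordef slots handed to hexagon_hta_nn_execute_new.
print(tensordef_layout(2))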
......@@ -878,10 +878,6 @@ MaceStatus MaceEngine::Impl::Run(
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
if (device_type_ == HEXAGON || device_type_ == HTA) {
if (device_type_ == HTA) {
MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
"HTA not support multiple inputs and outputs yet.");
}
hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
} else {
#endif
......
......@@ -1006,7 +1006,7 @@ class DeviceManager:
YAMLKeyword.target_abis: [ABIType.host],
YAMLKeyword.target_socs: '',
YAMLKeyword.system: SystemType.host,
YAMLKeyword.address: None,
YAMLKeyword.address: SystemType.host,
}
devices_list.append(host)
......
......@@ -51,6 +51,7 @@ HexagonSupportedOps = [
'QuantizedResizeBilinear_8',
'QuantizedSoftmax_8',
'QuantizedSub_8p8to8',
'QuantizedTransposeConv2d_8x8p32to8',
'QuantizeINPUT_f_to_8',
'SpaceToBatchND_8',
'SpaceToDepth_8',
......@@ -96,6 +97,7 @@ class HexagonConverter(base_converter.ConverterInterface):
MaceOp.BatchToSpaceND.name: self.convert_batchspace,
MaceOp.Concat.name: self.convert_concat,
MaceOp.Conv2D.name: self.convert_conv2d,
MaceOp.Deconv2D.name: self.convert_deconv2d,
MaceOp.DepthToSpace.name: self.convert_depthspace,
MaceOp.DepthwiseConv2d.name: self.convert_conv2d,
MaceOp.Dequantize.name: self.convert_dequantize,
......@@ -110,11 +112,6 @@ class HexagonConverter(base_converter.ConverterInterface):
}
def run(self):
if self._option.device == DeviceType.HTA.value:
mace_check(len(self._option.input_nodes) == 1
and len(self._option.output_nodes) == 1,
'hta only support single input and output')
for tensor in self._model.tensors:
self._consts[tensor.name] = tensor
......@@ -136,7 +133,7 @@ class HexagonConverter(base_converter.ConverterInterface):
self._quantize_activation_info[tensors[i]] = \
self._quantize_activation_info[node_name]
def add_const_node(self, name, val):
def add_scalar_const_node(self, name, val):
if name not in self._consts:
tensor = self._model.tensors.add()
self._consts[name] = tensor
......@@ -180,14 +177,14 @@ class HexagonConverter(base_converter.ConverterInterface):
min_tensor_name = op + ':1'
else:
min_tensor_name = op + '_min:0'
self.add_const_node(min_tensor_name, minval)
self.add_scalar_const_node(min_tensor_name, minval)
this_op.input.extend([min_tensor_name])
if add_max:
if is_activation and diff_port:
max_tensor_name = op + ':2'
else:
max_tensor_name = op + '_max:0'
self.add_const_node(max_tensor_name, maxval)
self.add_scalar_const_node(max_tensor_name, maxval)
this_op.input.extend([max_tensor_name])
def add_constant_min_max_for_first_op(self, op):
......@@ -196,8 +193,8 @@ class HexagonConverter(base_converter.ConverterInterface):
input_op, _ = get_op_and_port_from_tensor(op.input[0])
input_min = input_op + '_min:0'
input_max = input_op + '_max:0'
self.add_const_node(input_min, minval)
self.add_const_node(input_max, maxval)
self.add_scalar_const_node(input_min, minval)
self.add_scalar_const_node(input_max, maxval)
for i in range(len(op.input)):
if op.input[i] == input_op + ':1':
op.input[i] = input_min
......@@ -265,20 +262,6 @@ class HexagonConverter(base_converter.ConverterInterface):
else:
index += 1
if self._option.device == DeviceType.HTA.value:
# replace QuantizeINPUT_f_to_8 with INPUT
quantize_input_op.type = HexagonOp.INPUT.name
del quantize_input_op.output_shape[1:]
del quantize_input_op.output_type[1:]
del quantize_input_op.out_max_byte_size[1:]
# replace first op's input min max with constant
self.add_constant_min_max_for_first_op(self._model.op[1])
# replace DequantizeOUTPUT_8tof with OUTPUT
dequantize_output_op.type = HexagonOp.OUTPUT.name
del dequantize_output_op.input[1:]
return quantize_input_op.output
def add_node_id(self, model_inputs):
......@@ -421,6 +404,68 @@ class HexagonConverter(base_converter.ConverterInterface):
else:
op.type = HexagonOp.Supernode_8x8p32to8.name
def add_deconv_pad_node(self, op):
padding_type_arg = \
ConverterUtil.get_arg(op, MaceKeyword.mace_padding_type_str)
mace_check(padding_type_arg is not None, "Missing padding of Deconv.")
padding_type = PaddingMode(padding_type_arg.i)
filter_tensor = self._consts[op.input[1]]
filter_height = filter_tensor.dims[1]
filter_width = filter_tensor.dims[2]
if padding_type == PaddingMode.VALID:
paddings = [0, 0, 0, 0]
elif padding_type == PaddingMode.SAME:
pad_height, pad_width = filter_height // 2, filter_width // 2
paddings = [pad_height, pad_height, pad_width, pad_width]
else:
raise Exception('Hexagon deconv does not support padding type: ',
padding_type)
padding_tensor = self._model.tensors.add()
padding_tensor.name = op.name + "/paddings:0"
padding_tensor.data_type = mace_pb2.DT_INT32
padding_tensor.dims.extend([1, 1, 2, 2])
padding_tensor.int32_data.extend(paddings)
self._consts[padding_tensor.name] = padding_tensor
op.input.append(padding_tensor.name)
def convert_deconv2d(self, op):
channels = op.output_shape[0].dims[3]
if len(op.input) < 4:
print('Hexagon deconv requires biasadd, we add it.')
bias_data = np.zeros(channels, dtype=int)
bias_tensor = self._model.tensors.add()
bias_tensor.data_type = mace_pb2.DT_INT32
bias_tensor.dims.extend([channels])
bias_tensor.int32_data.extend(bias_data)
bias_tensor.minval = 0
bias_tensor.maxval = 0
bias_tensor.name = op.name + "/bias:0"
bias = bias_tensor.name
self._consts[bias] = bias_tensor
else:
bias = op.input.pop()
op.input.pop() # output shape
self.add_min_max_const_node(op, op.input[0])
self.add_min_max_const_node(op, op.input[1])
self.add_deconv_pad_node(op)
strides_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str)
mace_check(strides_arg is not None, "Missing strides of Deconv.")
self.add_arg_const_node(
op, '/strides:0', [1, strides_arg.ints[0], strides_arg.ints[1], 1])
op.input.append(bias)
self.add_min_max_const_node(op, bias)
self.add_min_max_const_node(
op, op.output[0], True, True, False)
op.type = HexagonOp.QuantizedTransposeConv2d_8x8p32to8.name
def convert_depthspace(self, op):
size_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_space_depth_block_size_str)
......
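For reference (not part of the diff): add_deconv_pad_node above derives SAME padding as half the filter extent on each side and stores it as a 1x1x2x2 paddings tensor, while VALID padding is all zeros. A minimal standalone sketch of that arithmetic:

def deconv_same_paddings(filter_height, filter_width):
    # SAME padding: half the filter size on each side of height and width.
    pad_height, pad_width = filter_height // 2, filter_width // 2
    return [pad_height, pad_height, pad_width, pad_width]

print(deconv_same_paddings(3, 3))  # [1, 1, 1, 1], laid out as a 1x1x2x2 tensor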
......@@ -1130,6 +1130,19 @@ class Transformer(base_converter.ConverterInterface):
elif self._option.quantize and \
(self._option.device == DeviceType.HEXAGON.value or
self._option.device == DeviceType.HTA.value):
for op in net.op:
# from HWOI to OHWI, deconv is unique
if op.type == MaceOp.Deconv2D.name \
and op.input[1] in self._consts \
and op.input[1] not in transposed_deconv_filter:
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
filter_data = filter_data.transpose(2, 0, 1, 3)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
transposed_deconv_filter.add(op.input[1])
print("Transpose filters to HWIO/HWIM")
mace_check(filter_format == DataFormat.HWIO,
"HEXAGON only support HWIO/HWIM filter format.")
......
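Illustration (not part of the diff) of the deconv filter transpose added above: deconv filters are stored HWOI, and transpose(2, 0, 1, 3) reorders them to OHWI for Hexagon. A small numpy sketch with a hypothetical 3x3 filter, 8 output channels and 4 input channels:

import numpy as np

hwoi_filter = np.zeros((3, 3, 8, 4), dtype=np.float32)  # hypothetical HWOI filter
ohwi_filter = hwoi_filter.transpose(2, 0, 1, 3)         # HWOI -> OHWI
print(ohwi_filter.shape)                                # (8, 3, 3, 4)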
......@@ -70,7 +70,8 @@ def device_lock_path(serialno):
def device_lock(serialno, timeout=7200):
import filelock
return filelock.FileLock(device_lock_path(serialno.replace("/", "")), timeout=timeout)
return filelock.FileLock(device_lock_path(serialno.replace("/", "")),
timeout=timeout)
def is_device_locked(serialno):
......@@ -100,8 +101,8 @@ def stdout_success(stdout):
def choose_a_random_device(target_devices, target_abi):
eligible_devices = [dev for dev in target_devices
if target_abi in dev[common.YAMLKeyword.target_abis]]
unlocked_devices = \
[dev for dev in eligible_devices if not is_device_locked(dev)]
unlocked_devices = [dev for dev in eligible_devices if
not is_device_locked(dev[common.YAMLKeyword.address])]
if len(unlocked_devices) > 0:
chosen_devices = [random.choice(unlocked_devices)]
else:
......@@ -607,11 +608,12 @@ def push_depended_so_libs(libmace_dynamic_library_path,
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
"%s/libgnustl_shared.so"\
"%s/libgnustl_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
elif dep == "libc++_shared.so":
src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
"%s/libc++_shared.so" % (os.environ["ANDROID_NDK_HOME"], abi)
"%s/libc++_shared.so"\
% (os.environ["ANDROID_NDK_HOME"], abi)
print("push %s to %s" % (src_file, phone_data_dir))
adb_push(src_file, phone_data_dir, serialno)
......