Unverified · Commit 63feaf50 authored by Yi-Kai-Chen, committed by GitHub

Integrate MediaTek APU Support on mt6873, mt6885 and mt6853 (#662)

* Margaux dev.

* Support Pad, Activation, Mul operators for mnasnet

* Support the document enhancement model

* Apply add-init-cache-and-preemption patch

* Refine code

* Add APU int16 quantization and fix macro bugs

* Refine code
Co-authored-by: YungChien Hsu <yungchien.hsu@mediatek.com>
Co-authored-by: Eric Chen <eric-yk.chen@mediatek.com>
Parent fb59018e
......@@ -21,7 +21,8 @@
namespace mace {
ApuWrapper::ApuWrapper(Device *device)
: quantize_util_(&device->cpu_runtime()->thread_pool()) {
: quantize_util_uint8_(&device->cpu_runtime()->thread_pool()),
quantize_util_int16_(&device->cpu_runtime()->thread_pool()) {
}
apu_data_type ApuWrapper::MapToApuDataType(DataType mace_type) {
......@@ -270,7 +271,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
"Wrong outputs num");
// prepare input
for (int i = 0 ; i < static_cast<int>(input_tensors.size()) ; i++) {
Tensor *tensor = input_tensors.at(input_infos[i].name);
Tensor* tensor = input_tensors.at(input_infos[i].name);
// check size
int element_size = input_infos[i].size;
......@@ -279,18 +280,18 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
"Wrong input size");
// quantize
if (input_infos[i].data_type == APU_DATA_TYPE_INT16) {
quantize_util_.QuantizeWithScaleAndZeropoint(
quantize_util_int16_.QuantizeWithScaleAndZeropoint(
(const float*)tensor->raw_data(),
element_size,
input_infos[i].scale,
input_infos[i].zero_point,
reinterpret_cast<int16_t*>(input_infos[i].buf.get()));
} else if (input_infos[i].data_type == APU_DATA_TYPE_FLOAT) {
std::memcpy(input_infos[i].buf.get(),
std::memcpy(input_infos[i].buf.get(),
(const float*)tensor->raw_data(),
element_size * byte_per_element);
} else {
quantize_util_.QuantizeWithScaleAndZeropoint(
quantize_util_uint8_.QuantizeWithScaleAndZeropoint(
(const float*)tensor->raw_data(),
element_size,
input_infos[i].scale,
......@@ -304,8 +305,8 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
MACE_CHECK(ret == true, "neuron run model failed");
// process output
for (int i = 0; i < static_cast<int>(output_tensors->size()); i++) {
Tensor *tensor = output_tensors->at(output_infos[i].name);
for (int i = 0 ; i < static_cast<int>(output_tensors->size()) ; i++) {
Tensor* tensor = output_tensors->at(output_infos[i].name);
// prepare out buffer
tensor->SetDtype(DT_FLOAT);
......@@ -316,7 +317,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
"Wrong output size");
// dequantize
if (output_infos[i].data_type == APU_DATA_TYPE_INT16) {
quantize_util_.Dequantize(
quantize_util_int16_.Dequantize(
reinterpret_cast<int16_t*>(output_infos[i].buf.get()),
element_size,
output_infos[i].scale,
......@@ -327,7 +328,7 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
output_infos[i].buf.get(),
element_size * byte_per_element);
} else {
quantize_util_.Dequantize(
quantize_util_uint8_.Dequantize(
output_infos[i].buf.get(),
element_size,
output_infos[i].scale,
......@@ -348,19 +349,19 @@ bool ApuWrapper::Uninit() {
}
int ApuWrapper::GetByteNum(apu_data_type data_type) {
int byte_per_element;
if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) {
byte_per_element = 4;
} else if (data_type == APU_DATA_TYPE_HALF ||
data_type == APU_DATA_TYPE_INT16) {
byte_per_element = 2;
} else if (data_type == APU_DATA_TYPE_UINT8) {
byte_per_element = 1;
} else {
byte_per_element = 1;
MACE_CHECK(false, "unsupport data type");
}
return byte_per_element;
}
} // namespace mace
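A note on the quantize_util_uint8_ / quantize_util_int16_ calls in this file: both apply the usual affine quantization defined by a scale and a zero point. Below is a minimal NumPy sketch of that idea; it is illustrative only (not MACE's QuantizeUtil implementation), and the example scale/zero-point values are assumptions.

```python
import numpy as np

def quantize(x, scale, zero_point, dtype):
    # q = round(x / scale) + zero_point, clamped to the target type's range
    info = np.iinfo(dtype)
    q = np.round(np.asarray(x, dtype=np.float32) / scale) + zero_point
    return np.clip(q, info.min, info.max).astype(dtype)

def dequantize(q, scale, zero_point):
    # x ~= (q - zero_point) * scale
    return (np.asarray(q, dtype=np.float32) - zero_point) * scale

x = np.array([-1.0, 0.0, 0.5, 1.0], dtype=np.float32)
q8 = quantize(x, scale=1.0 / 127, zero_point=128, dtype=np.uint8)   # uint8 path
q16 = quantize(x, scale=1.0 / 32767, zero_point=0, dtype=np.int16)  # int16 path
print(q8, dequantize(q8, 1.0 / 127, 128))
print(q16, dequantize(q16, 1.0 / 32767, 0))
```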
......@@ -59,7 +59,8 @@ class ApuWrapper {
ApuFrontend *frontend;
std::vector<ApuTensorInfo> input_infos;
std::vector<ApuTensorInfo> output_infos;
QuantizeUtil<float, uint8_t> quantize_util_;
QuantizeUtil<float, uint8_t> quantize_util_uint8_;
QuantizeUtil<float, int16_t> quantize_util_int16_;
};
} // namespace mace
......
......@@ -304,15 +304,14 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
return MaceStatus::MACE_SUCCESS;
}
#ifdef MACE_ENABLE_HEXAGON
MaceStatus MaceEngineConfig::Impl::SetHexagonToUnsignedPD() {
bool ret = false;
#ifdef MACE_ENABLE_HEXAGON
ret = HexagonDSPWrapper::RequestUnsignedPD();
#endif
return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
}
#endif
#ifdef MACE_ENABLE_HEXAGON
MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
HexagonNNCornerType corner,
bool dcvs_enable,
......@@ -321,12 +320,12 @@ MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
hexagon_dcvs_enable_ = dcvs_enable;
hexagon_latency_ = latency;
bool ret = false;
#ifdef MACE_ENABLE_HEXAGON
ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency);
#endif
return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
}
#endif
#ifdef MACE_ENABLE_APU
MaceStatus MaceEngineConfig::Impl::SetAPUCache(
APUCachePolicy policy,
const std::string &binary_file,
......@@ -335,10 +334,11 @@ MaceStatus MaceEngineConfig::Impl::SetAPUCache(
apu_cache_policy_ = policy;
apu_binary_file_ = binary_file;
apu_storage_file_ = storage_file;
#ifdef MACE_ENABLE_APU
ret = true;
#endif
return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
}
#endif
MaceEngineConfig::MaceEngineConfig(
const DeviceType device_type)
......
......@@ -37,9 +37,12 @@ ApuSupportedOps = [
'Concat',
'Conv2D',
'DepthwiseConv2d',
'Deconv2D',
'Eltwise',
'FullyConnected',
'Pad',
'Pooling',
'PRelu',
'Reduce',
'ResizeBilinear',
'Reshape',
......@@ -56,7 +59,9 @@ class ApuOps(object):
MaceOp.Concat.name: ApuOp.Concat.name,
MaceOp.Conv2D.name: ApuOp.Conv2D.name,
MaceOp.DepthwiseConv2d.name: ApuOp.DepthwiseConv2d.name,
MaceOp.Deconv2D.name: ApuOp.Deconv2D.name,
MaceOp.Eltwise.name: ApuOp.Eltwise.name,
MaceOp.FullyConnected.name: ApuOp.FullyConnected.name,
MaceOp.Pad.name: ApuOp.Pad.name,
MaceOp.Pooling.name: ApuOp.Pooling.name,
MaceOp.Reduce.name: ApuOp.Reduce.name,
......@@ -135,7 +140,8 @@ class ApuConverter(base_converter.ConverterInterface):
act_mode_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_activation_type_str)
if act_mode_arg is not None:
mace_check(act_mode_arg.s == b'RELU'
mace_check(act_mode_arg.s == b'PRELU'
or act_mode_arg.s == b'RELU'
or act_mode_arg.s == b'RELUX'
or act_mode_arg.s == b'TANH'
or act_mode_arg.s == b'SIGMOID',
......@@ -179,6 +185,15 @@ class ApuConverter(base_converter.ConverterInterface):
multiplier.int32_data.extend([tensor.dims[0]])
break
op.input.extend([multiplier.name])
elif op.type == MaceOp.Deconv2D.name:
mace_check(len(op.input) == 4,
op.name + ': apu only support ' + op.type + ' op'
' with 4 input')
self.add_size_tensor_from_arg(
op, MaceKeyword.mace_strides_str)
self.add_padding_value_tensor_from_arg(op)
self.add_size_tensor_from_list(
op, MaceKeyword.mace_dilations_str, [1, 1])
elif op.type == MaceOp.Eltwise.name:
eltwise_type = ConverterUtil.get_arg(
op, MaceKeyword.mace_element_type_str).i
......@@ -276,8 +291,8 @@ class ApuConverter(base_converter.ConverterInterface):
op.name + ': apu only support squeeze op with 1'
' input')
self.add_shape_tensor_from_axis_arg(op)
op.type = self._apu_ops.map_nn_op(op.type)
self.change_activation_to_prelu()
def add_op_output_type(self):
type_map = {}
......@@ -371,6 +386,14 @@ class ApuConverter(base_converter.ConverterInterface):
size_value_tensor.int32_data.extend(size_value_arg.ints)
op.input.extend([size_value_tensor.name])
def add_size_tensor_from_list(self, op, keyword, list_value):
size_value_tensor = self._model.tensors.add()
size_value_tensor.name = op.name + '/' + keyword + ':0'
size_value_tensor.data_type = mace_pb2.DT_INT32
size_value_tensor.dims.extend([len(list_value)])
size_value_tensor.int32_data.extend(list_value)
op.input.extend([size_value_tensor.name])
def add_int_tensor_from_arg(self, op, keyword):
int_value_arg = ConverterUtil.get_arg(op, keyword)
mace_check(int_value_arg.i is not None,
......@@ -420,7 +443,6 @@ class ApuConverter(base_converter.ConverterInterface):
op, MaceKeyword.mace_padding_str)
if padding_type is None:
continue
padding_arg = op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_values_str
if padding_type.i == PaddingMode.VALID.value:
......@@ -431,7 +453,8 @@ class ApuConverter(base_converter.ConverterInterface):
kernel = []
dilation = [1, 1]
if op.type == MaceOp.Conv2D.name or \
op.type == MaceOp.DepthwiseConv2d.name:
op.type == MaceOp.DepthwiseConv2d.name or \
op.type == MaceOp.Deconv2D.name:
if ConverterUtil.get_arg(
op, MaceKeyword.mace_dilations_str) is not None:
dilation = ConverterUtil.get_arg(
......@@ -456,22 +479,37 @@ class ApuConverter(base_converter.ConverterInterface):
if len(in_size) > 0:
break
out_size = op.output_shape[0].dims[1:3]
h = (out_size[0] - 1) * stride[0] \
+ ((kernel[0] - 1) * dilation[0] + 1) - in_size[0]
w = (out_size[1] - 1) * stride[1] \
+ ((kernel[1] - 1) * dilation[1] + 1) - in_size[1]
if(op.type == MaceOp.Deconv2D.name):
h = (in_size[0] - 1) * stride[0] + kernel[0] - out_size[0]
w = (in_size[1] - 1) * stride[1] + kernel[1] - out_size[1]
else:
h = (out_size[0] - 1) * stride[0] \
+ ((kernel[0] - 1) * dilation[0] + 1) - in_size[0]
w = (out_size[1] - 1) * stride[1] \
+ ((kernel[1] - 1) * dilation[1] + 1) - in_size[1]
top = int(np.floor(h/2))
left = int(np.floor(w/2))
bottom = h - top
right = w - left
padding_arg.ints.extend([top, right, bottom, left])
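The two padding formulas above can be sanity-checked on their own. A standalone sketch follows; the sizes are made-up examples, not taken from any model in this change.

```python
import numpy as np

def total_pad(in_size, out_size, kernel, stride, dilation=1, deconv=False):
    if deconv:
        # Deconv2D: pad = (in - 1) * stride + kernel - out
        return (in_size - 1) * stride + kernel - out_size
    # Conv2D / DepthwiseConv2d: pad = (out - 1) * stride + (kernel - 1) * dilation + 1 - in
    return (out_size - 1) * stride + (kernel - 1) * dilation + 1 - in_size

# Conv2D with SAME padding: 112 -> 56, 3x3 kernel, stride 2  =>  total pad = 1
h = total_pad(112, 56, kernel=3, stride=2)
# Deconv2D with SAME padding: 56 -> 112, 4x4 kernel, stride 2  =>  total pad = 2
hd = total_pad(56, 112, kernel=4, stride=2, deconv=True)
top = int(np.floor(h / 2))   # 0
bottom = h - top             # 1
print(h, hd, top, bottom)    # 1 2 0 1
```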
def change_activation_to_prelu(self):
for op in self._model.op:
if op.type == ApuOp.Activation.name and \
ConverterUtil.get_arg(
op, MaceKeyword.mace_activation_type_str).s == b'PRELU':
op.type = ApuOp.PRelu.name
def ensure_bias_vector(self):
for _op in self._model.op:
if _op.type != MaceOp.Conv2D.name and \
_op.type != MaceOp.DepthwiseConv2d.name:
continue
if len(_op.input) != 2:
ensure_input = -1
if _op.type == MaceOp.Conv2D.name or \
_op.type == MaceOp.DepthwiseConv2d.name or \
_op.type == MaceOp.FullyConnected.name:
ensure_input = 3
if _op.type == MaceOp.Deconv2D.name:
ensure_input = 4
if ensure_input == -1 or len(_op.input) != ensure_input - 1:
continue
tensor = self._model.tensors.add()
......@@ -522,15 +560,14 @@ class ApuConverter(base_converter.ConverterInterface):
const_tensor.name = _op.name + '/' + \
MaceKeyword.mace_scalar_input_str + ':0'
const_tensor.dims.extend([1])
const_tensor.data_type = _op.output_type[0]
if _op.output_type[0] == mace_pb2.DT_UINT8 or \
_op.output_type[0] == mace_pb2.DT_INT16:
const_tensor.data_type = _op.output_type[0]
const_tensor.scale = scalar
const_tensor.zero_point = 0
const_tensor.quantized = True
const_tensor.int32_data.extend([1])
elif _op.output_type[0] == mace_pb2.DT_FLOAT:
const_tensor.data_type = mace_pb2.DT_FLOAT
const_tensor.float_data.extend([scalar])
_op.input.extend([const_tensor.name])
ConverterUtil.del_arg(
......
......@@ -340,6 +340,8 @@ class TransformerRule(Enum):
QUANTIZE_LARGE_WEIGHTS = 43
TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44
TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV = 45
TRANSFORM_MUL_MAX_TO_PRELU = 46
TRANSFORM_EXPAND_DIMS_TO_RESHAPE = 47
class ConverterInterface(object):
......@@ -610,6 +612,8 @@ class ConverterOption(object):
if self._device == DeviceType.APU.value:
self._transformer_option = self._transformer_option + [
TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV,
TransformerRule.TRANSFORM_MUL_MAX_TO_PRELU,
TransformerRule.TRANSFORM_EXPAND_DIMS_TO_RESHAPE,
]
if self.quantize_large_weights:
self._transformer_option = self._transformer_option + [
......
......@@ -117,6 +117,10 @@ class Transformer(base_converter.ConverterInterface):
self.quantize_large_weights,
TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV:
self.transform_single_bn_to_depthwise_conv,
TransformerRule.TRANSFORM_MUL_MAX_TO_PRELU:
self.transform_mul_max_to_prelu,
TransformerRule.TRANSFORM_EXPAND_DIMS_TO_RESHAPE:
self.transform_expand_dims_to_reshape,
}
self._option = option
......@@ -962,17 +966,23 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.BatchNorm.name) \
and len(self._consumers.get(op.output[0], [])) == 1:
consumer_op = self._consumers[op.output[0]][0]
fold_consumer = False
if consumer_op.type == MaceOp.Activation.name:
act_type_arg = ConverterUtil.get_arg(
consumer_op, MaceKeyword.mace_activation_type_str)
act_type = act_type_arg.s.decode()
if act_type == ActivationType.PRELU.name:
continue
if self._option.device == DeviceType.APU.value:
fold_consumer = (act_type in
[ActivationType.RELU.name,
ActivationType.RELUX.name])
else:
fold_consumer = (act_type != ActivationType.PRELU.name)
# during quantization, only fold relu/relux
if (self._option.quantize_stat or self._option.quantize) \
and act_type not in [ActivationType.RELU.name,
ActivationType.RELUX.name]:
continue
if fold_consumer:
print("Fold activation: %s(%s)" % (op.name, op.type))
op.name = consumer_op.name
op.output[0] = consumer_op.output[0]
......@@ -1032,6 +1042,8 @@ class Transformer(base_converter.ConverterInterface):
return False
def reshape_fc_weight(self):
if self._option.device == DeviceType.APU.value:
return
net = self._model
filter_format = self.filter_format()
for op in net.op:
......@@ -1348,6 +1360,36 @@ class Transformer(base_converter.ConverterInterface):
weight.dims[:] = [1, 1] + list(weight_data.shape)
return True
if self._option.device == DeviceType.APU.value:
if op.type == MaceOp.MatMul.name:
transpose_a_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_a_str) # noqa
transpose_b_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str) # noqa
transpose_a = transpose_a_arg is not None and transpose_a_arg.i == 1 # noqa
transpose_b = transpose_b_arg is not None and transpose_b_arg.i == 1 # noqa
if transpose_a is False and transpose_b is False and \
op.input[1] in self._consts and \
len(self.get_tensor_shape(op.input[0])) == 2 and \
len(self.get_tensor_shape(op.input[1])) == 2:
op.type = MaceOp.FullyConnected.name
del op.arg[:]
rhs = op.input[1]
if rhs in self._consts and \
len(self._consts[rhs].dims) == 2:
arg = ConverterUtil.get_arg(op, MaceKeyword.mace_transpose_b_str) # noqa
if arg is None:
arg = op.arg.add()
arg.name = MaceKeyword.mace_transpose_b_str
arg.i = 0
if arg.i == 0:
arg.i = 1
filter = self._consts[rhs]
filter_data = np.array(filter.float_data) \
.reshape(filter.dims)
filter_data = filter_data.transpose(1, 0)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
six.print_('Transpose matmul weight to shape:',
filter.dims)
return False
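On the MatMul-to-FullyConnected rewrite above: the constant right-hand side is transposed so it is stored in the [out_channels, in_channels] layout. Assuming the FullyConnected kernel multiplies the input by the transposed weight, the rewrite preserves the result; a NumPy sketch with illustrative shapes:

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((8, 16)).astype(np.float32)   # MatMul lhs: [batch, in]
w = rng.standard_normal((16, 32)).astype(np.float32)  # MatMul rhs (const): [in, out]

matmul_out = x @ w               # original MatMul output
fc_weight = w.transpose(1, 0)    # weight stored as [out, in] after the transform
fc_out = x @ fc_weight.T         # FullyConnected: y = x @ weight.T

assert np.allclose(matmul_out, fc_out)
```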
def update_float_op_data_type(self):
......@@ -2476,5 +2518,68 @@ class Transformer(base_converter.ConverterInterface):
tensor.dims[:] = [1, 1, 1, tensor.dims[0]]
break
return True
return False
def transform_mul_max_to_prelu(self):
if self._option.device != DeviceType.APU.value:
return False
net = self._model
for op in net.op:
if op.type != MaceOp.Eltwise.name or \
ConverterUtil.get_arg(
op, MaceKeyword.mace_element_type_str).i \
!= EltwiseType.PROD.value or \
op.output[0] not in self._consumers:
continue
if len(op.input) != 1:
continue
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type != MaceOp.Eltwise.name or \
ConverterUtil.get_arg(
consumer_op, MaceKeyword.mace_element_type_str).i \
!= EltwiseType.MAX.value:
continue
if op.input[0] not in consumer_op.input:
continue
float_value_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_scalar_input_str)
mace_check(float_value_arg is not None,
op.name + ': ' + MaceKeyword.mace_scalar_input_str +
' value float should not be None')
scalar = float_value_arg.f
if scalar < 0:
continue
if scalar > 1:
scalar = 1
# Change Mul op to Prelu
print("Change mul and max to prelu: %s(%s)" % (op.name, op.type))
op.name = consumer_op.name
op.output[0] = consumer_op.output[0]
alpha_tensor = net.tensors.add()
alpha_tensor.name = op.name + '_alpha'
alpha_tensor.dims.append(1)
alpha_tensor.data_type = mace_pb2.DT_FLOAT
alpha_tensor.float_data.extend([scalar])
op.input.extend([alpha_tensor.name])
ConverterUtil.del_arg(op, MaceKeyword.mace_scalar_input_str)
ConverterUtil.del_arg(
op, MaceKeyword.mace_scalar_input_index_str)
op.type = MaceOp.Activation.name
type_arg = op.arg.add()
type_arg.name = MaceKeyword.mace_activation_type_str
type_arg.s = six.b(ActivationType.PRELU.name)
self.replace_quantize_info(op, consumer_op)
self.safe_remove_node(consumer_op, op)
return True
return False
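Why the Mul/Max pair above can become a PRelu: for 0 <= alpha <= 1, max(x, alpha * x) is exactly PReLU(x, alpha), which is also why the pass skips negative scalars. A quick standalone NumPy check:

```python
import numpy as np

def prelu(x, alpha):
    # PReLU: x for x > 0, alpha * x otherwise
    return np.where(x > 0, x, alpha * x)

x = np.linspace(-3.0, 3.0, 13).astype(np.float32)
alpha = 0.25
assert np.allclose(np.maximum(x, alpha * x), prelu(x, alpha))
```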
def transform_expand_dims_to_reshape(self):
if self._option.device != DeviceType.APU.value:
return False
net = self._model
for op in net.op:
if op.type == MaceOp.ExpandDims.name:
op.type = MaceOp.Reshape.name
del op.arg[:]
return True
return False