Commit fb59018e authored by 卢旭辉

Merge branch 'mtk_apu' into 'master'

Update APU support

See merge request applied-machine-learning/sysml/mace!1283
......@@ -102,6 +102,17 @@ enum HexagonNNCornerType {
HEXAGON_NN_CORNER_SVS2,
};
// APU Initial Cache Policy:
// NONE: Compile the model from the information in net_def and model_data.
// STORE: Compile the model from the information in net_def and model_data,
// then store the compiled model.
// LOAD: Get input/output information from net_def and load the pre-compiled
// model.
enum APUCachePolicy {
APU_CACHE_NONE = 0,
APU_CACHE_STORE = 1,
APU_CACHE_LOAD = 2,
};
struct CallStats {
int64_t start_micros;
int64_t end_micros;
......@@ -355,6 +366,21 @@ class MACE_API MaceEngineConfig {
bool dcvs_enable,
int latency);
/// \brief Set MTK APU initial cache
///
/// \param policy the policy for loading or storing the APU initial cache.
/// \param binary_file the path from which a pre-compiled cache is loaded.
/// \param storage_file the path to which the compiled cache is stored.
///
/// The cache file can speed up APU initialization; if this API is not
/// called, APU initialization may be slow.
///
/// \return MaceStatus::MACE_SUCCESS for success, other for failure.
MaceStatus SetAPUCache(APUCachePolicy policy,
const std::string &binary_file,
const std::string &storage_file);
private:
class Impl;
std::unique_ptr<Impl> impl_;
......
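For context, a minimal caller-side sketch of the new cache API (the helper function and cache path are hypothetical; assumes a build with MACE_ENABLE_APU):

#include <string>
#include "mace/public/mace.h"

// Hypothetical helper: STORE on the first run, LOAD afterwards.
void ConfigureApuCache(mace::MaceEngineConfig *config, bool first_run) {
  const std::string cache_path = "/data/local/tmp/model.apu.bin";
  if (first_run) {
    // Compile from net_def/model_data and persist the compiled model.
    config->SetAPUCache(mace::APU_CACHE_STORE, "", cache_path);
  } else {
    // Load the pre-compiled model and skip compilation.
    config->SetAPUCache(mace::APU_CACHE_LOAD, cache_path, "");
  }
}

On the first run the compiled model is written to storage_file; later runs pass the same path as binary_file so initialization skips compilation.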
mace {
global:
*LoadModelData*;
*GetModelSize*;
*CreateNet*;
*ModelName*;
*ModelChecksum*;
......
......@@ -32,10 +32,14 @@ apu_data_type ApuWrapper::MapToApuDataType(DataType mace_type) {
return APU_DATA_TYPE_INT32;
case DT_HALF:
return APU_DATA_TYPE_HALF;
case DT_FLOAT16:
return APU_DATA_TYPE_HALF;
case DT_UINT8:
return APU_DATA_TYPE_UINT8;
case DT_INT16:
return APU_DATA_TYPE_INT16;
default:
MACE_CHECK(true, "unsupported mace data type");
MACE_CHECK(false, "unsupported mace data type");
break;
}
return APU_DATA_TYPE_UNDEFINED;
......@@ -48,7 +52,7 @@ apu_pooling_mode ApuWrapper::MapToApuPoolingMode(int mace_mode) {
case 2:
return APU_POOLING_MAX;
default:
MACE_CHECK(true, "unsupported mace pooling mode");
MACE_CHECK(false, "unsupported mace pooling mode");
break;
}
return APU_POOLING_UNDEFINED;
......@@ -67,62 +71,37 @@ apu_eltwise_mode ApuWrapper::MapToApuEltwiseMode(int mace_mode) {
case 5:
return APU_ELTWISE_MAX;
default:
MACE_CHECK(true, "unsupported mace eltwise mode");
MACE_CHECK(false, "unsupported mace eltwise mode");
break;
}
return APU_ELTWISE_UNDEFINED;
}
bool ApuWrapper::Init(const NetDef &net_def,
unsigned const char *model_data,
const index_t model_data_size) {
bool ApuWrapper::Init(const NetDef &net_def, unsigned const char *model_data,
const char *file_name, bool load, bool store) {
frontend = new ApuFrontend();
MACE_CHECK(!(load && store),
"Should not load and store the model simultaneously.");
// parse model argument
int const_data_num = 0;
int apu_data_type = -1;
for (auto arg : net_def.arg()) {
if (arg.name().compare("const_data_num") == 0) {
const_data_num = arg.i();
} else if (arg.name().compare("apu_data_type") == 0) {
apu_data_type = arg.i();
}
}
// const tensors
std::vector<apu_tensor> const_tensors;
for (auto const_tensor : net_def.tensors()) {
apu_tensor tensor;
tensor.tensor_id = const_tensor.node_id();
tensor.tensor_type = (tensor.tensor_id < const_data_num) ?
APU_TENSOR_CONST_DATA :
APU_TENSOR_CONST_ARGUMENT;
tensor.data_type = MapToApuDataType(const_tensor.data_type());
tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f;
tensor.zero_point = const_tensor.has_zero_point() ?
const_tensor.zero_point() : 0;
tensor.dim_size = const_tensor.dims_size();
MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS,
"tensor dimension size not supported");
for (auto i = 0; i < tensor.dim_size; i++) {
tensor.dims[i] = const_tensor.dims(i);
}
const auto tensor_end = const_tensor.offset() +
const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type());
if (model_data_size >= 0) {
MACE_CHECK(tensor_end <= model_data_size, "tensor_end (", tensor_end,
") should <= ", model_data_size);
}
tensor.data_buf =
const_cast<unsigned char *>(model_data + const_tensor.offset());
const_tensors.push_back(tensor);
}
// input tensors
std::vector<apu_tensor> input_tensors;
for (auto input_info : net_def.input_info()) {
apu_tensor tensor;
tensor.tensor_id = input_info.node_id();
tensor.tensor_type = APU_TENSOR_MODEL_INPUT;
tensor.data_type = APU_DATA_TYPE_UINT8; // will do quantize in Run()
tensor.scale = input_info.has_scale() ? input_info.scale() : 0.0f;
tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type));
tensor.scale = input_info.has_scale() ? input_info.scale() : -1.0f;
tensor.zero_point = input_info.has_zero_point() ?
input_info.zero_point() : 0;
tensor.dim_size = input_info.dims_size();
......@@ -131,114 +110,156 @@ bool ApuWrapper::Init(const NetDef &net_def,
ApuTensorInfo info;
info.name = input_info.name();
info.size = 1;
for (auto i = 0; i < tensor.dim_size; i++) {
info.data_type = tensor.data_type;
int byte_per_element = GetByteNum(tensor.data_type);
for (auto i = 0 ; i < tensor.dim_size ; i++) {
tensor.dims[i] = input_info.dims(i);
info.size *= input_info.dims(i);
info.shape.push_back(input_info.dims(i));
}
info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size],
std::default_delete<uint8_t[]>());
info.buf
= std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element],
std::default_delete<uint8_t[]>());
info.scale = tensor.scale;
info.zero_point = tensor.zero_point;
input_infos.push_back(info);
tensor.data_buf = info.buf.get();
input_tensors.push_back(tensor);
}
// output tensors
std::vector<int> output_tensor_ids;
std::vector<void *> output_buffers;
std::vector<apu_tensor> output_tensors;
for (auto output_info : net_def.output_info()) {
output_tensor_ids.push_back(output_info.node_id());
apu_tensor tensor;
tensor.tensor_id = output_info.node_id();
tensor.tensor_type = APU_TENSOR_MODEL_OUTPUT;
tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type));
tensor.dim_size = output_info.dims_size();
ApuTensorInfo info;
info.name = output_info.name();
info.size = 1;
for (auto i = 0; i < output_info.dims().size(); i++) {
info.data_type = tensor.data_type;
int byte_per_element = GetByteNum(tensor.data_type);
for (auto i = 0 ; i < tensor.dim_size ; i++) {
tensor.dims[i] = output_info.dims(i);
info.size *= output_info.dims(i);
info.shape.push_back(output_info.dims(i));
}
info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size],
std::default_delete<uint8_t[]>());
info.buf =
std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element],
std::default_delete<uint8_t[]>());
for (auto op_def : net_def.op()) {
if (output_info.name() == op_def.output(0)) {
info.scale = op_def.quantize_info(0).scale();
info.zero_point = op_def.quantize_info(0).zero_point();
if (info.data_type == static_cast<int>(APU_DATA_TYPE_UINT8) ||
info.data_type == static_cast<int>(APU_DATA_TYPE_INT16)) {
info.scale = op_def.quantize_info(0).scale();
info.zero_point = op_def.quantize_info(0).zero_point();
} else {
info.scale = 0.0;
info.zero_point = 0;
}
}
}
output_infos.push_back(info);
output_buffers.push_back(info.buf.get());
tensor.data_buf = info.buf.get();
output_tensors.push_back(tensor);
}
// const tensors
std::vector<apu_tensor> const_tensors;
// operators
std::vector<apu_operator> ops;
std::vector<std::vector<int>> cached_op_inputs;
for (auto op_def : net_def.op()) {
apu_operator op;
strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE);
op.input_size = op_def.node_input_size();
std::vector<int> input_ids;
for (auto i = 0; i < op.input_size; i++) {
input_ids.push_back(op_def.node_input(i).node_id());
}
cached_op_inputs.push_back(input_ids);
op.input_ids = cached_op_inputs.back().data();
op.output.tensor_id = op_def.node_id();
op.output.tensor_type = APU_TENSOR_OP_OUTPUT;
op.output.data_type = MapToApuDataType(op_def.output_type(0));
if (op.output.data_type == APU_DATA_TYPE_UINT8) {
op.output.scale = op_def.quantize_info(0).scale();
op.output.zero_point = op_def.quantize_info(0).zero_point();
} else {
op.output.scale = 0.0f;
op.output.zero_point = 0;
}
op.output.dim_size = op_def.output_shape(0).dims_size();
MACE_CHECK(op.output.dim_size <= APU_TENSOR_MAX_DIMS,
"tensor dimension size not supported");
for (auto i = 0; i < op.output.dim_size; i++) {
op.output.dims[i] = op_def.output_shape(0).dims(i);
if (!load) {
// const tensors
for (auto const_tensor : net_def.tensors()) {
apu_tensor tensor;
tensor.tensor_id = const_tensor.node_id();
tensor.tensor_type = (tensor.tensor_id < const_data_num) ?
APU_TENSOR_CONST_DATA :
APU_TENSOR_CONST_ARGUMENT;
tensor.data_type = MapToApuDataType(const_tensor.data_type());
tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f;
tensor.zero_point = const_tensor.has_zero_point() ?
const_tensor.zero_point() : 0;
tensor.dim_size = const_tensor.dims_size();
MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS,
"tensor dimension size not supported");
for (auto i = 0 ; i < tensor.dim_size ; i++) {
tensor.dims[i] = const_tensor.dims(i);
}
tensor.data_buf =
const_cast<unsigned char*>(model_data + const_tensor.offset());
const_tensors.push_back(tensor);
}
op.output.data_buf = nullptr;
// get op mode and activation mode
bool is_pooling = (strcmp(op.type, "Pooling") == 0);
bool is_eltwise = (strcmp(op.type, "Eltwise") == 0);
std::string activation;
float max_limit = 0.0f;
for (auto arg : op_def.arg()) {
if (arg.name().compare("activation") == 0) {
activation = arg.s();
// operators
for (auto op_def : net_def.op()) {
apu_operator op;
strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE);
op.input_size = op_def.node_input_size();
std::vector<int> input_ids;
for (auto i = 0 ; i < op.input_size ; i++) {
input_ids.push_back(op_def.node_input(i).node_id());
}
if (arg.name().compare("max_limit") == 0) {
max_limit = arg.f();
cached_op_inputs.push_back(input_ids);
op.input_ids = cached_op_inputs.back().data();
op.output.tensor_id = op_def.node_id();
op.output.tensor_type = APU_TENSOR_OP_OUTPUT;
op.output.data_type = MapToApuDataType(op_def.output_type(0));
if (op.output.data_type == APU_DATA_TYPE_UINT8 ||
op.output.data_type == APU_DATA_TYPE_INT16) {
op.output.scale = op_def.quantize_info(0).scale();
op.output.zero_point = op_def.quantize_info(0).zero_point();
} else {
op.output.scale = 0.0f;
op.output.zero_point = 0;
}
if (is_pooling && arg.name().compare("pooling_type") == 0) {
op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i()));
op.output.dim_size = op_def.output_shape(0).dims_size();
MACE_CHECK(op.output.dim_size <= APU_TENSOR_MAX_DIMS,
"tensor dimension size not supported");
for (auto i = 0 ; i < op.output.dim_size ; i++) {
op.output.dims[i] = op_def.output_shape(0).dims(i);
}
if (is_eltwise && arg.name().compare("type") == 0) {
op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i()));
op.output.data_buf = nullptr;
// get op mode and activation mode
bool is_pooling = (strcmp(op.type, "Pooling") == 0);
bool is_eltwise = (strcmp(op.type, "Eltwise") == 0);
std::string activation;
float max_limit = 0.0f;
for (auto arg : op_def.arg()) {
if (arg.name().compare("activation") == 0) {
activation = arg.s();
}
if (arg.name().compare("max_limit") == 0) {
max_limit = arg.f();
}
if (is_pooling && arg.name().compare("pooling_type") == 0) {
op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i()));
}
if (is_eltwise && arg.name().compare("type") == 0) {
op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i()));
}
}
if (activation.compare("RELU") == 0) {
op.act_mode = APU_ACT_RELU;
} else if (activation.compare("RELUX") == 0 && max_limit == 6.0) {
op.act_mode = APU_ACT_RELU6;
} else if (activation.compare("SIGMOID") == 0) {
op.act_mode = APU_ACT_SIGMOID;
} else if (activation.compare("TANH") == 0) {
op.act_mode = APU_ACT_TANH;
} else {
op.act_mode = APU_ACT_NONE;
}
ops.push_back(op);
}
if (activation.compare("RELU") == 0) {
op.act_mode = APU_ACT_RELU;
} else if (activation.compare("RELUX") == 0 && max_limit == 6.0) {
op.act_mode = APU_ACT_RELU6;
} else {
op.act_mode = APU_ACT_NONE;
}
ops.push_back(op);
}
bool print_model = false;
bool ret = frontend->InitGraph(
const_tensors.size(), const_tensors.data(),
input_tensors.size(), input_tensors.data(),
output_tensor_ids.size(), output_tensor_ids.data(),
output_buffers.data(),
ops.size(), ops.data(),
print_model);
const_tensors.size(), const_tensors.data(),
input_tensors.size(), input_tensors.data(),
output_tensors.size(), output_tensors.data(),
ops.size(), ops.data(),
print_model, file_name, load, store);
cached_op_inputs.clear();
MACE_CHECK(ret == true, "apu init graph failed");
return ret;
}
......@@ -247,22 +268,35 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
MACE_ASSERT(input_tensors.size() == input_infos.size(), "Wrong inputs num");
MACE_ASSERT(output_tensors.size() == output_infos.size(),
"Wrong outputs num");
// prepare input
for (int i = 0 ; i < static_cast<int>(input_tensors.size()) ; i++) {
Tensor *tensor = input_tensors.at(input_infos[i].name);
// check size
int size = input_infos[i].size;
MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong input size");
int element_size = input_infos[i].size;
int byte_per_element = GetByteNum(input_infos[i].data_type);
MACE_ASSERT(element_size == static_cast<int>(tensor->size()),
"Wrong input size");
// quantize
quantize_util_.QuantizeWithScaleAndZeropoint(
tensor->data<float>(),
size,
input_infos[i].scale,
input_infos[i].zero_point,
input_infos[i].buf.get());
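// Dispatch on the input data type: int16 quantizes symmetrically into an
// int16_t buffer, float is copied through unchanged, and uint8 keeps the
// original asymmetric quantization path.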
if (input_infos[i].data_type == APU_DATA_TYPE_INT16) {
quantize_util_.QuantizeWithScaleAndZeropoint(
(const float*)tensor->raw_data(),
element_size,
input_infos[i].scale,
input_infos[i].zero_point,
reinterpret_cast<int16_t*>(input_infos[i].buf.get()));
} else if (input_infos[i].data_type == APU_DATA_TYPE_FLOAT) {
std::memcpy(input_infos[i].buf.get(),
(const float*)tensor->raw_data(),
element_size * byte_per_element);
} else {
quantize_util_.QuantizeWithScaleAndZeropoint(
(const float*)tensor->raw_data(),
element_size,
input_infos[i].scale,
input_infos[i].zero_point,
input_infos[i].buf.get());
}
}
// run model
......@@ -276,16 +310,30 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors,
// prepare out buffer
tensor->SetDtype(DT_FLOAT);
tensor->Resize(output_infos[i].shape);
int size = output_infos[i].size;
MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong output size");
int element_size = output_infos[i].size;
int byte_per_element = GetByteNum(output_infos[i].data_type);
MACE_ASSERT(element_size == static_cast<int>(tensor->size()),
"Wrong output size");
// dequantize
quantize_util_.Dequantize(
output_infos[i].buf.get(),
size,
output_infos[i].scale,
output_infos[i].zero_point,
tensor->mutable_data<float>());
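// Mirror the input path on output: int16 and uint8 buffers are dequantized
// back to float, float buffers are copied as-is.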
if (output_infos[i].data_type == APU_DATA_TYPE_INT16) {
quantize_util_.Dequantize(
reinterpret_cast<int16_t*>(output_infos[i].buf.get()),
element_size,
output_infos[i].scale,
output_infos[i].zero_point,
reinterpret_cast<float*>(tensor->raw_mutable_data()));
} else if (output_infos[i].data_type == APU_DATA_TYPE_FLOAT) {
std::memcpy(reinterpret_cast<float*>(tensor->raw_mutable_data()),
output_infos[i].buf.get(),
element_size * byte_per_element);
} else {
quantize_util_.Dequantize(
output_infos[i].buf.get(),
element_size,
output_infos[i].scale,
output_infos[i].zero_point,
reinterpret_cast<float*>(tensor->raw_mutable_data()));
}
}
return true;
......@@ -299,4 +347,20 @@ bool ApuWrapper::Uninit() {
return ret;
}
int ApuWrapper::GetByteNum(apu_data_type data_type) {
int byte_per_element;
if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) {
byte_per_element = 4;
} else if (data_type == APU_DATA_TYPE_HALF ||
data_type == APU_DATA_TYPE_INT16) {
byte_per_element = 2;
} else if (data_type == APU_DATA_TYPE_UINT8) {
byte_per_element = 1;
} else {
byte_per_element = 1;
MACE_CHECK(false, "unsupported data type");
}
return byte_per_element;
}
} // namespace mace
......@@ -37,12 +37,14 @@ class ApuWrapper {
int size;
float scale;
int zero_point;
apu_data_type data_type;
};
public:
explicit ApuWrapper(Device *device);
bool Init(const NetDef &net_def, unsigned const char *model_data,
const index_t model_data_size);
bool Init(const NetDef& net_def, unsigned const char *model_data = nullptr,
const char *file_name = nullptr,
bool load = false, bool store = false);
bool Run(const std::map<std::string, Tensor *> &input_tensors,
std::map<std::string, Tensor *> *output_tensors);
bool Uninit();
......@@ -51,6 +53,7 @@ class ApuWrapper {
apu_data_type MapToApuDataType(DataType mace_type);
apu_pooling_mode MapToApuPoolingMode(int mace_mode);
apu_eltwise_mode MapToApuEltwiseMode(int mace_mode);
int GetByteNum(apu_data_type data_type);
private:
ApuFrontend *frontend;
......
......@@ -201,6 +201,10 @@ class MaceEngineConfig::Impl {
bool dcvs_enable,
int latency);
MaceStatus SetAPUCache(APUCachePolicy policy,
const std::string &binary_file,
const std::string &storage_file);
inline DeviceType device_type() const {
return device_type_;
}
......@@ -237,6 +241,18 @@ class MaceEngineConfig::Impl {
return hexagon_latency_;
}
inline APUCachePolicy apu_cache_policy() const {
return apu_cache_policy_;
}
inline std::string apu_binary_file() const {
return apu_binary_file_;
}
inline std::string apu_storage_file() const {
return apu_storage_file_;
}
private:
DeviceType device_type_;
int num_threads_;
......@@ -247,6 +263,9 @@ class MaceEngineConfig::Impl {
HexagonNNCornerType hexagon_corner_;
bool hexagon_dcvs_enable_;
int hexagon_latency_;
APUCachePolicy apu_cache_policy_;
std::string apu_binary_file_;
std::string apu_storage_file_;
};
MaceEngineConfig::Impl::Impl(const DeviceType device_type)
......@@ -258,7 +277,10 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type)
gpu_perf_hint_(GPUPerfHint::PERF_NORMAL),
hexagon_corner_(HexagonNNCornerType::HEXAGON_NN_CORNER_TURBO),
hexagon_dcvs_enable_(true),
hexagon_latency_(100) {}
hexagon_latency_(100),
apu_cache_policy_(APUCachePolicy::APU_CACHE_NONE),
apu_binary_file_(""),
apu_storage_file_("") {}
MaceStatus MaceEngineConfig::Impl::SetGPUContext(
std::shared_ptr<GPUContext> context) {
......@@ -282,14 +304,15 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
return MaceStatus::MACE_SUCCESS;
}
#ifdef MACE_ENABLE_HEXAGON
MaceStatus MaceEngineConfig::Impl::SetHexagonToUnsignedPD() {
bool ret = false;
#ifdef MACE_ENABLE_HEXAGON
ret = HexagonDSPWrapper::RequestUnsignedPD();
#endif
return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
}
#endif
#ifdef MACE_ENABLE_HEXAGON
MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
HexagonNNCornerType corner,
bool dcvs_enable,
......@@ -298,11 +321,24 @@ MaceStatus MaceEngineConfig::Impl::SetHexagonPower(
hexagon_dcvs_enable_ = dcvs_enable;
hexagon_latency_ = latency;
bool ret = false;
#ifdef MACE_ENABLE_HEXAGON
ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency);
return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR;
}
#endif
#ifdef MACE_ENABLE_APU
MaceStatus MaceEngineConfig::Impl::SetAPUCache(
APUCachePolicy policy,
const std::string &binary_file,
const std::string &storage_file) {
apu_cache_policy_ = policy;
apu_binary_file_ = binary_file;
apu_storage_file_ = storage_file;
return MaceStatus::MACE_SUCCESS;
}
#endif
MaceEngineConfig::MaceEngineConfig(
const DeviceType device_type)
......@@ -338,6 +374,13 @@ MaceStatus MaceEngineConfig::SetHexagonPower(
return impl_->SetHexagonPower(corner, dcvs_enable, latency);
}
MaceStatus MaceEngineConfig::SetAPUCache(
APUCachePolicy policy,
const std::string &binary_file,
const std::string &storage_file) {
return impl_->SetAPUCache(policy, binary_file, storage_file);
}
// Mace Tensor
class MaceTensor::Impl {
public:
......@@ -478,6 +521,9 @@ class MaceEngine::Impl {
#endif
#ifdef MACE_ENABLE_APU
std::unique_ptr<ApuWrapper> apu_controller_;
APUCachePolicy apu_cache_policy_;
std::string apu_binary_file_;
std::string apu_storage_file_;
#endif
MACE_DISABLE_COPY_AND_ASSIGN(Impl);
......@@ -504,6 +550,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
#endif
#ifdef MACE_ENABLE_APU
, apu_controller_(nullptr)
, apu_cache_policy_(config.impl_->apu_cache_policy())
, apu_binary_file_(config.impl_->apu_binary_file())
, apu_storage_file_(config.impl_->apu_storage_file())
#endif
{
LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
......@@ -660,8 +709,21 @@ MaceStatus MaceEngine::Impl::Init(
#ifdef MACE_ENABLE_APU
if (device_type_ == APU) {
apu_controller_.reset(new ApuWrapper(device_.get()));
MACE_CHECK(apu_controller_->Init(
*net_def, model_data, model_data_size), "apu init error");
bool cache_load = apu_cache_policy_ == APUCachePolicy::APU_CACHE_LOAD;
bool cache_store = apu_cache_policy_ == APUCachePolicy::APU_CACHE_STORE;
const char* file_name = cache_store ?
apu_storage_file_.c_str() : apu_binary_file_.c_str();
bool ret = false;
if (cache_load || cache_store) {
VLOG(1) << "Loading/Storing init cache";
ret = apu_controller_->Init(
*net_def, model_data, file_name, cache_load, cache_store);
}
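// If no cache policy was set, or loading the cache failed, fall back to
// compiling the model from net_def; a failed STORE is reported as an error
// below rather than silently recompiled.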
if (!ret && !cache_store) {
VLOG(1) << "Do not use init cache";
ret = apu_controller_->Init(*net_def, model_data);
}
MACE_CHECK(ret, "apu init error", cache_load, cache_store);
} else {
#endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
......
......@@ -15,6 +15,7 @@ enum DataType {
DT_INT32 = 4;
DT_FLOAT16 = 5;
DT_BFLOAT16 = 6;
DT_INT16 = 7;
}
enum MemoryType {
......
......@@ -5,6 +5,7 @@ load(
"if_android",
"if_hexagon_enabled",
"if_opencl_enabled",
"if_apu_enabled",
)
licenses(["notice"]) # Apache 2.0
......@@ -22,6 +23,8 @@ cc_binary(
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_apu_enabled([
"-DMACE_ENABLE_APU",
]),
linkstatic = 1,
deps = [
......@@ -46,6 +49,8 @@ cc_binary(
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_apu_enabled([
"-DMACE_ENABLE_APU",
]),
linkopts = [
"-lm",
......
......@@ -144,6 +144,12 @@ DEFINE_string(model_data_file,
DEFINE_string(model_file,
"",
"model file name, used when load mace model in pb");
DEFINE_string(apu_binary_file,
"",
"apu init cache path, used when load apu init cache");
DEFINE_string(apu_storage_file,
"",
"apu init cache path, used when store apu init cache");
DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON/APU");
DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round");
......@@ -153,6 +159,7 @@ DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(num_threads, -1, "num of threads");
DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
DEFINE_int32(apu_cache_policy, 0, "0:NONE/1:STORE/2:LOAD");
DEFINE_bool(benchmark, false, "enable benchmark op");
bool RunModel(const std::string &model_name,
......@@ -201,6 +208,11 @@ bool RunModel(const std::string &model_name,
// firmware) or 8250 family above to run hexagon nn on unsigned PD.
// config.SetHexagonToUnsignedPD();
config.SetHexagonPower(HEXAGON_NN_CORNER_TURBO, true, 100);
#endif
#ifdef MACE_ENABLE_APU
config.SetAPUCache(static_cast<APUCachePolicy>(FLAGS_apu_cache_policy),
FLAGS_apu_binary_file,
FLAGS_apu_storage_file);
#endif
std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_graph_data =
make_unique<mace::port::ReadOnlyBufferMemoryRegion>();
......@@ -539,6 +551,9 @@ int Main(int argc, char **argv) {
LOG(INFO) << "output dir: " << FLAGS_output_dir;
LOG(INFO) << "model_data_file: " << FLAGS_model_data_file;
LOG(INFO) << "model_file: " << FLAGS_model_file;
LOG(INFO) << "apu_cache_policy: " << FLAGS_apu_cache_policy;
LOG(INFO) << "apu_binary_file: " << FLAGS_apu_binary_file;
LOG(INFO) << "apu_storage_file: " << FLAGS_apu_storage_file;
LOG(INFO) << "device: " << FLAGS_device;
LOG(INFO) << "round: " << FLAGS_round;
LOG(INFO) << "restart_round: " << FLAGS_restart_round;
......
......@@ -6,6 +6,8 @@ enum apu_act_mode {
APU_ACT_NONE = 0,
APU_ACT_RELU = 1,
APU_ACT_RELU6 = 2,
APU_ACT_SIGMOID = 3,
APU_ACT_TANH = 4,
};
enum apu_pooling_mode {
......@@ -29,6 +31,7 @@ enum apu_data_type {
APU_DATA_TYPE_UINT8 = 2,
APU_DATA_TYPE_HALF = 3,
APU_DATA_TYPE_INT32 = 4,
APU_DATA_TYPE_INT16 = 5,
};
enum apu_tensor_type {
......@@ -37,6 +40,7 @@ enum apu_tensor_type {
APU_TENSOR_CONST_ARGUMENT = 2,
APU_TENSOR_MODEL_INPUT = 3,
APU_TENSOR_OP_OUTPUT = 4,
APU_TENSOR_MODEL_OUTPUT = 5,
};
#define APU_TENSOR_MAX_DIMS 4
......@@ -70,10 +74,10 @@ class ApuFrontend {
bool InitGraph(int const_tensor_size, const apu_tensor* const_tensors,
int input_tensor_size, const apu_tensor* input_tensors,
int output_tensor_size, const int* output_tensor_ids,
void** output_buffers,
int output_tensor_size, const apu_tensor* output_tensors,
int operator_size, const apu_operator* operators,
bool print_model);
bool print_model, const char *file_name,
bool load, bool store);
bool RunGraph();
bool UninitGraph();
......
Cannot preview this file type.
......@@ -415,6 +415,7 @@ class YAMLKeyword(object):
quantize_large_weights = 'quantize_large_weights'
quantize_range_file = 'quantize_range_file'
quantize_stat = 'quantize_stat'
quantize_schema = 'quantize_schema'
change_concat_ranges = 'change_concat_ranges'
validation_inputs_data = 'validation_inputs_data'
validation_threshold = 'validation_threshold'
......
......@@ -122,6 +122,7 @@ class DefaultValues(object):
cpu_affinity_policy = 1,
gpu_perf_hint = 3,
gpu_priority_hint = 3,
apu_cache_policy = 0,
class ValidationThreshold(object):
......@@ -1175,6 +1176,21 @@ def parse_args():
"--benchmark",
action="store_true",
help="enable op benchmark.")
run.add_argument(
"--apu_cache_policy",
type=int,
default=DefaultValues.apu_cache_policy,
help="0:NONE/1:STORE/2:LOAD")
run.add_argument(
"--apu_binary_file",
type=str,
default="",
help="apu cache load dir.")
run.add_argument(
"--apu_storage_file",
type=str,
default="",
help="apu cache store dir.")
return parser.parse_known_args()
......
......@@ -177,6 +177,9 @@ class DeviceWrapper:
cpu_affinity_policy=1,
gpu_perf_hint=3,
gpu_priority_hint=3,
apu_cache_policy=0,
apu_binary_file="",
apu_storage_file="",
input_file_name='model_input',
output_file_name='model_out',
input_dir="",
......@@ -282,7 +285,20 @@ class DeviceWrapper:
"third_party/nnlib/%s/libhexagon_controller.so" % abi,
self.data_dir)
apu_storage_cpy = False
if device_type == common.DeviceType.APU:
if apu_cache_policy == 1:
if not apu_storage_file:
apu_storage_cpy = True
apu_src_file = model_tag + ".bin"
apu_storage_file = os.path.join(self.data_dir,
apu_src_file)
elif apu_cache_policy == 2:
if os.path.exists(apu_binary_file):
self.push(apu_binary_file, self.data_dir)
apu_binary_file = os.path.join(self.data_dir,
os.path.basename(
apu_binary_file))
self.push("third_party/apu/libapu-frontend.so",
self.data_dir)
......@@ -345,6 +361,9 @@ class DeviceWrapper:
(self.data_dir, os.path.basename(opencl_binary_file)),
"--opencl_parameter_file=%s/%s" %
(self.data_dir, os.path.basename(opencl_parameter_file)),
"--apu_cache_policy=%s" % apu_cache_policy,
"--apu_binary_file=%s" % apu_binary_file,
"--apu_storage_file=%s" % apu_storage_file,
])
if benchmark:
cmd.append("--benchmark=%s" % benchmark)
......@@ -364,6 +383,11 @@ class DeviceWrapper:
_out=process_output,
_err_to_out=True)
self.stdout = "".join(stdout_buff)
if apu_storage_cpy:
self.pull_from_data_dir(
apu_src_file, '{}/apu_init_cache/'.format(mace_model_dir))
if not sh_commands.stdout_success(self.stdout):
common.MaceLogger.error("Mace Run", "Mace run failed.")
......@@ -545,6 +569,9 @@ class DeviceWrapper:
cpu_affinity_policy=flags.cpu_affinity_policy,
gpu_perf_hint=flags.gpu_perf_hint,
gpu_priority_hint=flags.gpu_priority_hint,
apu_cache_policy=flags.apu_cache_policy,
apu_binary_file=flags.apu_binary_file,
apu_storage_file=flags.apu_storage_file,
runtime_failure_ratio=flags.runtime_failure_ratio,
address_sanitizer=flags.address_sanitizer,
opencl_binary_file=model_opencl_output_bin_path,
......
......@@ -113,6 +113,8 @@ def convert_model(conf, quantize_stat):
option.winograd = conf[ModelKeys.winograd]
if ModelKeys.quantize in conf:
option.quantize = conf[ModelKeys.quantize]
if ModelKeys.quantize_schema in conf:
option.quantize_schema = conf[ModelKeys.quantize_schema]
if ModelKeys.quantize_large_weights in conf:
option.quantize_large_weights = conf[ModelKeys.quantize_large_weights]
if ModelKeys.quantize_range_file in conf:
......
......@@ -171,6 +171,24 @@ def quantize(data, device, non_zero):
return quantized_data
# Only supports int16 symmetric quantization.
def quantize_int16(data):
np_data = np.array(data).astype(float)
max_val = max(abs(np_data.min()), abs(np_data.max()))
scale = max_val / 2**15
zero = 0
output = np.clip(np.round(zero + np_data / scale).astype(np.int32),
-2**15, 2**15 - 1)
quantized_data = QuantizedData()
quantized_data.data = output
quantized_data.scale = scale
quantized_data.zero = zero
quantized_data.minval = -max_val
quantized_data.maxval = max_val
return quantized_data
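# Worked example: for max|x| = 1.0, scale = 1.0 / 2**15 ≈ 3.05e-5, so an
# input of 0.5 maps to round(0.5 / scale) = 16384, and values outside
# [-1.0, 1.0) clip to the int16 range [-32768, 32767].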
def quantize_bias_for_hexagon(data):
np_data = np.array(data).astype(float)
max_val = max(abs(np_data.min()), abs(np_data.max()))
......
......@@ -26,6 +26,7 @@ namespace {{tag}} {
MACE_API extern const unsigned char *LoadModelData();
MACE_API extern const int64_t GetModelSize();
MACE_API extern const std::shared_ptr<NetDef> CreateNet();
......
......@@ -24,6 +24,7 @@ from transform.base_converter import EltwiseType
from transform.base_converter import MaceKeyword
from transform.base_converter import MaceOp
from transform.base_converter import PaddingMode
from transform.base_converter import PadType
from transform.base_converter import PoolingType
from transform.base_converter import ReduceType
from transform.base_converter import DataFormat
......@@ -32,16 +33,17 @@ from utils.util import mace_check
ApuSupportedOps = [
'Activation',
'Concat',
'Conv2D',
'DepthwiseConv2d',
'Eltwise',
'Pad',
'Pooling',
'Reduce',
'ResizeBilinear',
'Reshape',
'Softmax',
'Squeeze',
]
ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str)
......@@ -50,16 +52,18 @@ ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str)
class ApuOps(object):
def __init__(self):
self.apu_ops = {
MaceOp.Activation.name: ApuOp.Activation.name,
MaceOp.Concat.name: ApuOp.Concat.name,
MaceOp.Conv2D.name: ApuOp.Conv2D.name,
MaceOp.DepthwiseConv2d.name: ApuOp.DepthwiseConv2d.name,
MaceOp.Eltwise.name: ApuOp.Eltwise.name,
MaceOp.Pad.name: ApuOp.Pad.name,
MaceOp.Pooling.name: ApuOp.Pooling.name,
MaceOp.Reduce.name: ApuOp.Reduce.name,
MaceOp.ResizeBilinear.name: ApuOp.ResizeBilinear.name,
MaceOp.Reshape.name: ApuOp.Reshape.name,
MaceOp.Softmax.name: ApuOp.Softmax.name,
MaceOp.Squeeze.name: ApuOp.Squeeze.name,
MaceOp.Squeeze.name: ApuOp.Reshape.name,
}
def has_op(self, op_name):
......@@ -78,17 +82,30 @@ class ApuConverter(base_converter.ConverterInterface):
self._apu_ops = ApuOps()
def run(self):
self.use_uint8_in_out()
if self._option.quantize:
self.use_quant_in_out()
self.add_op_output_type()
self.ensure_bias_vector()
self.ensure_binary_input()
self.common_check()
if ConverterUtil.get_arg(self._model.op[0],
MaceKeyword.mace_framework_type_str).i == \
FrameworkType.TENSORFLOW.value:
self.add_tensorflow_padding_value()
# Calculate the number of apu constant tensors
# Any tensors which will be apu constant tensors should be added
# above this line
const_data_num_arg = self._model.arg.add()
const_data_num_arg.name = MaceKeyword.mace_const_data_num_arg_str
const_data_num_arg.i = len(self._model.tensors)
apu_data_type_arg = self._model.arg.add()
apu_data_type_arg.name = MaceKeyword.mace_apu_data_type_arg_str
if self._option.quantize_schema == 'mace_apu_16bit_per_tensor':
apu_data_type_arg.i = mace_pb2.DT_INT16
elif self._option.quantize:
apu_data_type_arg.i = mace_pb2.DT_UINT8
else:
apu_data_type_arg.i = mace_pb2.DT_FLOAT
self.convert_ops()
self.add_node_id()
return self._model
......@@ -104,9 +121,11 @@ class ApuConverter(base_converter.ConverterInterface):
' match')
mace_check(len(op.output_shape[0].dims) <= 4,
op.name + ': apu only support 1D~4D tensor')
mace_check(len(op.output) == len(op.quantize_info),
op.name + ': length of output and quantize_info not'
' match')
if op.output_type[0] == mace_pb2.DT_UINT8 \
or op.output_type[0] == mace_pb2.DT_INT16:
mace_check(len(op.output) == len(op.quantize_info),
op.name + ': length of output and quantize_info not'
' match')
data_format = ConverterUtil.data_format(op)
if data_format is not None and len(op.output_shape[0].dims) == 4:
mace_check((data_format == DataFormat.NHWC)
......@@ -117,9 +136,11 @@ class ApuConverter(base_converter.ConverterInterface):
op, MaceKeyword.mace_activation_type_str)
if act_mode_arg is not None:
mace_check(act_mode_arg.s == b'RELU'
or act_mode_arg.s == b'RELUX',
op.name + ': apu only support activation RELU and'
' RELUX')
or act_mode_arg.s == b'RELUX'
or act_mode_arg.s == b'TANH'
or act_mode_arg.s == b'SIGMOID',
op.name + ': apu only support activation RELU,'
' RELUX, TANH and SIGMOID')
for tensor in self._model.tensors:
mace_check(len(tensor.dims) <= 4,
tensor.name + ': apu only support 1D~4D tensor')
......@@ -138,7 +159,6 @@ class ApuConverter(base_converter.ConverterInterface):
for op in self._model.op:
if not self._apu_ops.has_op(op.type):
raise Exception('Unsupported op: ', op)
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.DepthwiseConv2d.name:
mace_check(len(op.input) == 3,
......@@ -146,7 +166,7 @@ class ApuConverter(base_converter.ConverterInterface):
' with 3 input')
self.add_size_tensor_from_arg(
op, MaceKeyword.mace_strides_str)
self.add_padding_tensor_from_arg(op)
self.add_padding_value_tensor_from_arg(op)
self.add_size_tensor_from_arg(
op, MaceKeyword.mace_dilations_str)
if op.type == MaceOp.DepthwiseConv2d.name:
......@@ -160,22 +180,64 @@ class ApuConverter(base_converter.ConverterInterface):
break
op.input.extend([multiplier.name])
elif op.type == MaceOp.Eltwise.name:
eltwise_type = ConverterUtil.get_arg(
op, MaceKeyword.mace_element_type_str).i
# We only handle SUM and PROD operators now which are
# commutative
mace_check(len(op.input) == 2,
op.name + ': apu only support eltwise op with 2'
' input')
eltwise_type = ConverterUtil.get_arg(
op, MaceKeyword.mace_element_type_str).i
mace_check(eltwise_type == EltwiseType.SUM.value,
op.name + ': apu only support eltwise type SUM')
mace_check(eltwise_type == EltwiseType.SUM.value
or eltwise_type == EltwiseType.PROD.value,
op.name +
': apu only support eltwise type SUM or PROD')
elif op.type == MaceOp.Pad.name:
mace_check(len(op.input) == 1,
op.name + ': apu only support Pad op with 1'
' input')
pad_type_arg = \
ConverterUtil.get_arg(op, MaceKeyword.mace_pad_type_str)
if pad_type_arg is not None:
mace_check(PadType(pad_type_arg.i) ==
PadType.CONSTANT, op.name +
': apu only support Pad type CONSTANT')
padding_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_paddings_str)
mace_check(len(padding_arg.ints) == 8,
op.name + ': paddings does not have size 8')
mace_check({0} ==
{padding_arg.ints[0], padding_arg.ints[1],
padding_arg.ints[6], padding_arg.ints[7]},
op.name + ': apu only support Pad op with padding'
' in H/W dimensions')
data_type = ConverterUtil.get_arg(op, 'T').i
constant_value_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_constant_value_str)
if constant_value_arg is not None:
if data_type in [mace_pb2.DT_FLOAT, mace_pb2.DT_HALF]:
constant_value = constant_value_arg.f
elif data_type == mace_pb2.DT_INT32:
constant_value = constant_value_arg.i
else:
mace_check(False, "Not supported data type")
mace_check(constant_value == 0,
op.name + ': apu only support Pad op with zero'
' padding')
self.add_paddings_tensor_from_arg(op)
elif op.type == MaceOp.Pooling.name:
mace_check(len(op.input) == 1,
op.name + ': apu only support pooling op with 1'
' input')
pooling_type_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_pooling_type_str)
mace_check(PoolingType(pooling_type_arg.i) == PoolingType.AVG,
op.name + ': apu only support pooling type AVG')
self.add_padding_tensor_from_arg(op)
mace_check(PoolingType(pooling_type_arg.i) in
[PoolingType.AVG, PoolingType.MAX],
op.name + ': apu only support pooling type AVG,'
' MAX')
self.add_padding_value_tensor_from_arg(op)
self.add_size_tensor_from_arg(
op, MaceKeyword.mace_strides_str)
self.add_size_tensor_from_arg(op, MaceKeyword.mace_kernel_str)
......@@ -213,8 +275,7 @@ class ApuConverter(base_converter.ConverterInterface):
mace_check(len(op.input) == 1,
op.name + ': apu only support squeeze op with 1'
' input')
self.add_int_list_tensor_from_arg(
op, MaceKeyword.mace_axis_str)
self.add_shape_tensor_from_axis_arg(op)
op.type = self._apu_ops.map_nn_op(op.type)
......@@ -222,7 +283,12 @@ class ApuConverter(base_converter.ConverterInterface):
type_map = {}
for input_info in self._model.input_info:
# will do input quantize in wrapper
type_map[input_info.name] = mace_pb2.DT_UINT8
if self._option.quantize_schema == 'mace_apu_16bit_per_tensor':
type_map[input_info.name] = mace_pb2.DT_INT16
elif self._option.quantize:
type_map[input_info.name] = mace_pb2.DT_UINT8
else:
type_map[input_info.name] = mace_pb2.DT_FLOAT
for op in self._model.op:
if len(op.output_type) >= 1:
......@@ -239,8 +305,11 @@ class ApuConverter(base_converter.ConverterInterface):
op.name + ': length of output and output_type not'
' match')
mace_check(op.output_type[0] == mace_pb2.DT_UINT8
or op.output_type[0] == mace_pb2.DT_INT32,
op.name + ': apu only support quantized node')
or op.output_type[0] == mace_pb2.DT_INT16
or op.output_type[0] == mace_pb2.DT_INT32
or op.output_type[0] == mace_pb2.DT_FLOAT,
op.name + ': apu only support quantized or float16'
' node')
def add_node_id(self):
node_id_counter = 0
......@@ -266,7 +335,7 @@ class ApuConverter(base_converter.ConverterInterface):
for output_info in self._model.output_info:
output_info.node_id = node_id_map[output_info.name]
def add_padding_tensor_from_arg(self, op):
def add_padding_value_tensor_from_arg(self, op):
padding_value_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_padding_values_str)
mace_check(len(padding_value_arg.ints) == 4,
......@@ -278,6 +347,19 @@ class ApuConverter(base_converter.ConverterInterface):
padding_value_tensor.int32_data.extend(padding_value_arg.ints)
op.input.extend([padding_value_tensor.name])
def add_paddings_tensor_from_arg(self, op):
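# Pack the flattened `paddings` argument into an [N, 2] const tensor
# ([before, after] per dimension) and append it as an extra op input.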
padding_value_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_paddings_str)
padding_value_tensor = self._model.tensors.add()
padding_value_tensor.name = op.name + '/padding:0'
padding_value_tensor.data_type = mace_pb2.DT_INT32
mace_check(len(padding_value_arg.ints) % 2 == 0,
op.name + ': the rank of paddings should be even')
padding_value_tensor.dims.extend(
[int(len(padding_value_arg.ints) / 2), 2])
padding_value_tensor.int32_data.extend(padding_value_arg.ints)
op.input.extend([padding_value_tensor.name])
def add_size_tensor_from_arg(self, op, keyword):
size_value_arg = ConverterUtil.get_arg(op, keyword)
mace_check(len(size_value_arg.ints) == 2,
......@@ -311,6 +393,27 @@ class ApuConverter(base_converter.ConverterInterface):
list_value_tensor.int32_data.extend(list_value_arg.ints)
op.input.extend([list_value_tensor.name])
def add_shape_tensor_from_axis_arg(self, op):
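# Squeeze is mapped to the APU Reshape op, so materialize the target shape
# as a const tensor: take the producer's output shape and drop the
# squeezed axes.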
list_value_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str)
mace_check(list_value_arg.ints is not None,
op.name + ': ' + MaceKeyword.mace_axis_str +
' value ints should not be None')
axes = list_value_arg.ints
for producer in self._model.op:
if producer.output[0] == op.input[0]:
input_tensor_shape = producer.output_shape[0].dims
break
shape_tensor = self._model.tensors.add()
shape_tensor.name = op.name + '/' + MaceKeyword.mace_axis_str + ':0'
shape_tensor.data_type = mace_pb2.DT_INT32
shape_tensor.dims.extend([len(input_tensor_shape) - len(axes)])
shape_tensor.int32_data.extend(input_tensor_shape)
for axis in sorted(axes, reverse=True):
del shape_tensor.int32_data[axis]
op.input.extend([shape_tensor.name])
ConverterUtil.del_arg(op, MaceKeyword.mace_axis_str)
def add_tensorflow_padding_value(self):
for op in self._model.op:
padding_type = ConverterUtil.get_arg(
......@@ -374,7 +477,8 @@ class ApuConverter(base_converter.ConverterInterface):
tensor = self._model.tensors.add()
tensor.name = _op.name + '/add/bias_add'
tensor.dims.extend([_op.output_shape[0].dims[-1]])
if _op.output_type[0] == mace_pb2.DT_UINT8:
if _op.output_type[0] == mace_pb2.DT_UINT8 or \
_op.output_type[0] == mace_pb2.DT_INT16:
tensor.data_type = mace_pb2.DT_INT32
input_name = _op.input[0]
for input_op in self._model.op:
......@@ -395,7 +499,46 @@ class ApuConverter(base_converter.ConverterInterface):
tensor.float_data.extend([0.0] * tensor.dims[0])
_op.input.extend([tensor.name])
def use_uint8_in_out(self):
def ensure_binary_input(self):
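# APU eltwise needs two tensor inputs: when SUM/PROD has a scalar operand
# folded into an arg, rebuild it as a 1-element const tensor input.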
for _op in self._model.op:
if _op.type != MaceOp.Eltwise.name:
continue
if len(_op.input) != 1:
continue
eltwise_type = ConverterUtil.get_arg(
_op, MaceKeyword.mace_element_type_str).i
if eltwise_type != EltwiseType.SUM.value and \
eltwise_type != EltwiseType.PROD.value:
continue
float_value_arg = ConverterUtil.get_arg(
_op, MaceKeyword.mace_scalar_input_str)
mace_check(float_value_arg.f is not None,
_op.name + ': ' +
MaceKeyword.mace_scalar_input_str +
' value float should not be None')
scalar = float_value_arg.f
const_tensor = self._model.tensors.add()
const_tensor.name = _op.name + '/' + \
MaceKeyword.mace_scalar_input_str + ':0'
const_tensor.dims.extend([1])
if _op.output_type[0] == mace_pb2.DT_UINT8 or \
_op.output_type[0] == mace_pb2.DT_INT16:
const_tensor.data_type = _op.output_type[0]
const_tensor.scale = scalar
const_tensor.zero_point = 0
const_tensor.quantized = True
const_tensor.int32_data.extend([1])
elif _op.output_type[0] == mace_pb2.DT_FLOAT:
const_tensor.data_type = mace_pb2.DT_FLOAT
const_tensor.float_data.extend([scalar])
_op.input.extend([const_tensor.name])
ConverterUtil.del_arg(
_op, MaceKeyword.mace_scalar_input_str)
ConverterUtil.del_arg(
_op, MaceKeyword.mace_scalar_input_index_str)
def use_quant_in_out(self):
replace_dict = {}
for input_info in self._model.input_info:
if input_info.data_type == mace_pb2.DT_FLOAT:
......
......@@ -290,6 +290,8 @@ class MaceKeyword(object):
mace_p_str = 'p'
mace_nor_var_str = 'normalize_variance'
mace_across_ch_str = 'across_channels'
mace_apu_16bit_per_tensor = 'mace_apu_16bit_per_tensor'
mace_apu_data_type_arg_str = 'apu_data_type'
class TransformerRule(Enum):
......@@ -337,6 +339,7 @@ class TransformerRule(Enum):
FP16_GATHER_WEIGHT = 42
QUANTIZE_LARGE_WEIGHTS = 43
TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44
TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV = 45
class ConverterInterface(object):
......@@ -411,6 +414,7 @@ class ConverterOption(object):
self._device = DeviceType.CPU.value
self._winograd = 0
self._quantize = False
self._quantize_schema = ""
self._quantize_large_weights = False
self._quantize_range_file = ""
self._change_concat_ranges = False
......@@ -446,6 +450,10 @@ class ConverterOption(object):
def quantize(self):
return self._quantize
@property
def quantize_schema(self):
return self._quantize_schema
@property
def quantize_large_weights(self):
return self._quantize_large_weights
......@@ -510,6 +518,10 @@ class ConverterOption(object):
def quantize(self, quantize):
self._quantize = quantize
@quantize_schema.setter
def quantize_schema(self, quantize_schema):
self._quantize_schema = quantize_schema
@quantize_large_weights.setter
def quantize_large_weights(self, quantize_large_weights):
self._quantize_large_weights = quantize_large_weights
......@@ -595,6 +607,10 @@ class ConverterOption(object):
# Need to be put after SORT_BY_EXECUTION
TransformerRule.ADD_QUANTIZE_TENSOR_RANGE,
]
if self._device == DeviceType.APU.value:
self._transformer_option = self._transformer_option + [
TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV,
]
if self.quantize_large_weights:
self._transformer_option = self._transformer_option + [
TransformerRule.QUANTIZE_LARGE_WEIGHTS
......
......@@ -115,6 +115,8 @@ class Transformer(base_converter.ConverterInterface):
self.fp16_gather_weight,
TransformerRule.QUANTIZE_LARGE_WEIGHTS:
self.quantize_large_weights,
TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV:
self.transform_single_bn_to_depthwise_conv,
}
self._option = option
......@@ -736,7 +738,6 @@ class Transformer(base_converter.ConverterInterface):
net.tensors.remove(scale)
self.replace_quantize_info(op, consumer_op)
self.safe_remove_node(consumer_op, op)
return True
return False
......@@ -1099,9 +1100,9 @@ class Transformer(base_converter.ConverterInterface):
transposed_filter = set()
transposed_deconv_filter = set()
if self._option.quantize and \
(self._option.device == DeviceType.CPU.value or
self._option.device == DeviceType.APU.value):
if ((self._option.quantize and
self._option.device == DeviceType.CPU.value) or
self._option.device == DeviceType.APU.value):
print("Transpose filters to OHWI")
if filter_format == DataFormat.HWIO:
transpose_order = [3, 0, 1, 2]
......@@ -1621,12 +1622,23 @@ class Transformer(base_converter.ConverterInterface):
mace_check(data_type_arg, "Data type does not exist for %s(%s)"
% (op.name, op.type))
if data_type_arg.i == mace_pb2.DT_FLOAT:
data_type_arg.i = mace_pb2.DT_UINT8
if self._option.quantize_schema == \
MaceKeyword.mace_apu_16bit_per_tensor:
data_type_arg.i = mace_pb2.DT_INT16
else:
data_type_arg.i = mace_pb2.DT_UINT8
elif data_type_arg.i == mace_pb2.DT_UINT8:
mace_check(op.type == MaceOp.Quantize.name
or op.type == MaceOp.Dequantize.name,
"Only Quantization ops support uint8, "
"but got %s(%s)" % (op.name, op.type))
elif data_type_arg.i == mace_pb2.DT_INT16 \
and self._option.quantize_schema == \
MaceKeyword.mace_apu_16bit_per_tensor:
mace_check(op.type == MaceOp.Quantize.name
or op.type == MaceOp.Dequantize.name,
"Only Quantization ops support int16, "
"but got %s(%s)" % (op.name, op.type))
else:
mace_check(op.type == MaceOp.Quantize.name,
"Quantization only support float ops, "
......@@ -1648,7 +1660,11 @@ class Transformer(base_converter.ConverterInterface):
self._model.input_info[i].scale = quantize_info.scale
self._model.input_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
if self._option.quantize_schema == \
MaceKeyword.mace_apu_16bit_per_tensor:
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16)
else:
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, input_node.data_format)
# use actual ranges for model input quantize
find_range_every_time_arg = op_def.arg.add()
......@@ -1671,7 +1687,11 @@ class Transformer(base_converter.ConverterInterface):
self._model.output_info[i].scale = quantize_info.scale
self._model.output_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
if self._option.quantize_schema == \
MaceKeyword.mace_apu_16bit_per_tensor:
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16)
else:
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, output_node.data_format)
quantize_flag_arg = self._model.arg.add()
......@@ -1726,6 +1746,11 @@ class Transformer(base_converter.ConverterInterface):
else:
mace_check(False, "wrong device.")
tensor.data_type = mace_pb2.DT_INT32
elif self._option.quantize_schema == \
MaceKeyword.mace_apu_16bit_per_tensor:
quantized_tensor = \
quantize_util.quantize_int16(tensor.float_data)
tensor.data_type = mace_pb2.DT_INT16
else:
non_zero = self._option.device == DeviceType.CPU.value
quantized_tensor = quantize_util.quantize(tensor.float_data,
......@@ -1782,9 +1807,16 @@ class Transformer(base_converter.ConverterInterface):
return False
def add_quantize_info(self, op, minval, maxval):
scale, zero, minval, maxval = \
quantize_util.adjust_range(minval, maxval, self._option.device,
non_zero=False)
quantize_schema = self._option.quantize_schema
if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor:
maxval = max(abs(minval), abs(maxval))
minval = -maxval
scale = maxval / 2**15
zero = 0
else:
scale, zero, minval, maxval = \
quantize_util.adjust_range(minval, maxval, self._option.device,
non_zero=False)
quantize_info = op.quantize_info.add()
quantize_info.minval = minval
quantize_info.maxval = maxval
......@@ -1877,6 +1909,7 @@ class Transformer(base_converter.ConverterInterface):
def add_quantize_tensor_range(self):
# Quantize info from range statistics
range_file = self._option.quantize_range_file
quantize_schema = self._option.quantize_schema
if range_file:
print("Add quantize tensor range")
post_quantize_info = {}
......@@ -1885,10 +1918,17 @@ class Transformer(base_converter.ConverterInterface):
tensor_name, minmax = line.split("@@")[:2]
min_val, max_val = [float(i) for i in
minmax.strip().split(",")]
scale, zero, min_val, max_val = \
quantize_util.adjust_range(min_val, max_val,
self._option.device,
non_zero=False)
if (quantize_schema ==
MaceKeyword.mace_apu_16bit_per_tensor):
max_val = max(abs(min_val), abs(max_val))
min_val = -max_val
scale = max_val / 2**15
zero = 0
else:
scale, zero, min_val, max_val = \
quantize_util.adjust_range(min_val, max_val,
self._option.device,
non_zero=False)
activation_info = mace_pb2.QuantizeActivationInfo()
activation_info.minval = min_val
activation_info.maxval = max_val
......@@ -1919,11 +1959,18 @@ class Transformer(base_converter.ConverterInterface):
print("Input range %s: %s" % (input_node.name,
str(input_node.range)))
new_input_name = self.input_name_map[input_node.name]
scale, zero, minval, maxval = \
quantize_util.adjust_range(input_node.range[0],
input_node.range[1],
self._option.device,
non_zero=False)
if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor:
maxval = max(abs(input_node.range[0]),
abs(input_node.range[1]))
minval = -maxval
scale = maxval / 2**15
zero = 0
else:
scale, zero, minval, maxval = \
quantize_util.adjust_range(input_node.range[0],
input_node.range[1],
self._option.device,
non_zero=False)
quantize_info = \
mace_pb2.QuantizeActivationInfo()
quantize_info.minval = minval
......@@ -2397,3 +2444,37 @@ class Transformer(base_converter.ConverterInterface):
return True
return False
def transform_single_bn_to_depthwise_conv(self):
for op in self._model.op:
if op.type != MaceOp.BatchNorm.name:
continue
if len(op.input) != 3:
continue
producer = self._producer[op.input[0]]
if producer.type in [MaceOp.Conv2D.name,
MaceOp.Deconv2D.name,
MaceOp.DepthwiseDeconv2d.name,
MaceOp.DepthwiseConv2d.name,
MaceOp.BatchToSpaceND.name]:
continue
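# A standalone BatchNorm computes y = scale * x + offset per channel,
# which is exactly a 1x1 depthwise conv whose filter is `scale` reshaped
# to [1, 1, 1, C] and whose bias is `offset`.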
op.type = MaceOp.DepthwiseConv2d.name
padding_arg = op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = PaddingMode.VALID.value
strides_arg = op.arg.add()
strides_arg.name = MaceKeyword.mace_strides_str
strides_arg.ints.extend([1, 1])
dilation_arg = op.arg.add()
dilation_arg.name = MaceKeyword.mace_dilations_str
dilation_arg.ints.extend([1, 1])
for tensor in self._model.tensors:
if tensor.name == op.input[1]:
tensor.dims[:] = [1, 1, 1, tensor.dims[0]]
break
return True
return False
......@@ -92,6 +92,7 @@ class ModelKeys(object):
weight_sha256_checksum = "weight_sha256_checksum"
quantize_range_file = "quantize_range_file"
quantize = "quantize"
quantize_schema = "quantize_schema"
quantize_large_weights = "quantize_large_weights"
quantize_stat = "quantize_stat"
change_concat_ranges = "change_concat_ranges"
......