diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a86f2eb1729d784135f5499da6dcc7ba66af776..89916b0959ed482f0c9dcdfd767945d43643f21b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -106,7 +106,7 @@ mace_cc_test: GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git DEVICE_CONF_FILE=generic-mobile-devices/devices.yml fi - - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//test/ccunit:mace_cc_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - python tools/bazel_adb_run.py --target="//micro/test/ccunit:micro_ops_test" --run_target=True --stdout_processor=ops_benchmark_stdout_processor --target_abis=arm64-v8a mace_cc_benchmark: @@ -133,7 +133,7 @@ model_tests: fi - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file --cl_mem_type=buffer - - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file + - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer diff --git a/include/mace/public/mace.h b/include/mace/public/mace.h index 77488e77ba7097692aea12ef11f760b9df56dfb2..52cda774df420bd155b01cc540fede6879a3fccd 100644 --- a/include/mace/public/mace.h +++ b/include/mace/public/mace.h @@ -102,6 +102,17 @@ enum HexagonNNCornerType { HEXAGON_NN_CORNER_SVS2, }; +// APU Initial Cache Policy: +// NONE: Compile model using the information from net_def and model_data. +// STORE: Compile model using the information from net_def and model_data and +// store the compiled model. +// LOAD: Get input/output information from net_def and load pre-compiled model. +enum APUCachePolicy { + APU_CACHE_NONE = 0, + APU_CACHE_STORE = 1, + APU_CACHE_LOAD = 2, +}; + struct CallStats { int64_t start_micros; int64_t end_micros; @@ -355,6 +366,21 @@ class MACE_API MaceEngineConfig { bool dcvs_enable, int latency); + /// \brief Set MTK APU initial cache + /// + /// \param policy the policy for loading or storing the APU initial cache. + /// \param binary_file the path the cache file is loaded from. + /// \param storage_file the path the cache file is stored to. + /// + /// The cache file holds the compiled model, + /// which can speed up APU initialization. + /// If this API is not called, APU initialization may be slow. + /// + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. 
+ MaceStatus SetAPUCache(APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file); + private: class Impl; std::unique_ptr<Impl> impl_; diff --git a/mace/codegen/model_version_script.lds b/mace/codegen/model_version_script.lds index d1cc9dad28cab6cdb42aa503c54c0500287f806d..4a215dfc48d9cb7e52ab28d00fda2433f9ab3cbb 100644 --- a/mace/codegen/model_version_script.lds +++ b/mace/codegen/model_version_script.lds @@ -1,6 +1,7 @@ mace { global: *LoadModelData*; + *GetModelSize*; *CreateNet*; *ModelName*; *ModelChecksum*; diff --git a/mace/core/runtime/apu/apu_wrapper.cc b/mace/core/runtime/apu/apu_wrapper.cc index 42b8956af504b26df903b58cde01ec8f9ce4bd42..6feac5c699cab4503904037833b925a5ddb0f545 100644 --- a/mace/core/runtime/apu/apu_wrapper.cc +++ b/mace/core/runtime/apu/apu_wrapper.cc @@ -32,10 +32,14 @@ apu_data_type ApuWrapper::MapToApuDataType(DataType mace_type) { return APU_DATA_TYPE_INT32; case DT_HALF: return APU_DATA_TYPE_HALF; + case DT_FLOAT16: + return APU_DATA_TYPE_HALF; case DT_UINT8: return APU_DATA_TYPE_UINT8; + case DT_INT16: + return APU_DATA_TYPE_INT16; default: - MACE_CHECK(true, "unsupport mace data type"); + MACE_CHECK(false, "unsupported mace data type"); break; } return APU_DATA_TYPE_UNDEFINED; @@ -48,7 +52,7 @@ apu_pooling_mode ApuWrapper::MapToApuPoolingMode(int mace_mode) { case 2: return APU_POOLING_MAX; default: - MACE_CHECK(true, "unsupport mace pooling mode"); + MACE_CHECK(false, "unsupported mace pooling mode"); break; } return APU_POOLING_UNDEFINED; @@ -67,62 +71,37 @@ apu_eltwise_mode ApuWrapper::MapToApuEltwiseMode(int mace_mode) { case 5: return APU_ELTWISE_MAX; default: - MACE_CHECK(true, "unsupport mace eltwise mode"); + MACE_CHECK(false, "unsupported mace eltwise mode"); break; } return APU_ELTWISE_UNDEFINED; } -bool ApuWrapper::Init(const NetDef &net_def, - unsigned const char *model_data, - const index_t model_data_size) { +bool ApuWrapper::Init(const NetDef &net_def, unsigned const char *model_data, + const char *file_name, bool load, bool store) { frontend = new ApuFrontend(); + MACE_CHECK(!(load && store), + "Should not load and store the model simultaneously."); + // parse model argument int const_data_num = 0; + int apu_data_type = -1; for (auto arg : net_def.arg()) { if (arg.name().compare("const_data_num") == 0) { const_data_num = arg.i(); + } else if (arg.name().compare("apu_data_type") == 0) { + apu_data_type = arg.i(); } } - - // const tensors - std::vector<apu_tensor> const_tensors; - for (auto const_tensor : net_def.tensors()) { - apu_tensor tensor; - tensor.tensor_id = const_tensor.node_id(); - tensor.tensor_type = (tensor.tensor_id < const_data_num) ? - APU_TENSOR_CONST_DATA : - APU_TENSOR_CONST_ARGUMENT; - tensor.data_type = MapToApuDataType(const_tensor.data_type()); - tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f; - tensor.zero_point = const_tensor.has_zero_point() ? 
- const_tensor.zero_point() : 0; - tensor.dim_size = const_tensor.dims_size(); - MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS, - "tensor dimension size not supported"); - for (auto i = 0; i < tensor.dim_size; i++) { - tensor.dims[i] = const_tensor.dims(i); - } - const auto tensor_end = const_tensor.offset() + - const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type()); - if (model_data_size >= 0) { - MACE_CHECK(tensor_end <= model_data_size, "tensor_end (", tensor_end, - ") should <= ", model_data_size); - } - tensor.data_buf = - const_cast<unsigned char *>(model_data + const_tensor.offset()); - const_tensors.push_back(tensor); - } - // input tensors std::vector<apu_tensor> input_tensors; for (auto input_info : net_def.input_info()) { apu_tensor tensor; tensor.tensor_id = input_info.node_id(); tensor.tensor_type = APU_TENSOR_MODEL_INPUT; - tensor.data_type = APU_DATA_TYPE_UINT8; // will do quantize in Run() - tensor.scale = input_info.has_scale() ? input_info.scale() : 0.0f; + tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type)); + tensor.scale = input_info.has_scale() ? input_info.scale() : -1.0f; tensor.zero_point = input_info.has_zero_point() ? input_info.zero_point() : 0; tensor.dim_size = input_info.dims_size(); @@ -131,114 +110,156 @@ bool ApuWrapper::Init(const NetDef &net_def, ApuTensorInfo info; info.name = input_info.name(); info.size = 1; - for (auto i = 0; i < tensor.dim_size; i++) { + info.data_type = tensor.data_type; + int byte_per_element = GetByteNum(tensor.data_type); + for (auto i = 0 ; i < tensor.dim_size ; i++) { tensor.dims[i] = input_info.dims(i); info.size *= input_info.dims(i); info.shape.push_back(input_info.dims(i)); } - info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size], - std::default_delete<uint8_t[]>()); + info.buf + = std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element], + std::default_delete<uint8_t[]>()); info.scale = tensor.scale; info.zero_point = tensor.zero_point; input_infos.push_back(info); tensor.data_buf = info.buf.get(); input_tensors.push_back(tensor); } - // output tensors - std::vector<int> output_tensor_ids; - std::vector<void *> output_buffers; + std::vector<apu_tensor> output_tensors; for (auto output_info : net_def.output_info()) { - output_tensor_ids.push_back(output_info.node_id()); + apu_tensor tensor; + tensor.tensor_id = output_info.node_id(); + tensor.tensor_type = APU_TENSOR_MODEL_OUTPUT; + tensor.data_type = MapToApuDataType(static_cast<DataType>(apu_data_type)); + tensor.dim_size = output_info.dims_size(); ApuTensorInfo info; info.name = output_info.name(); info.size = 1; - for (auto i = 0; i < output_info.dims().size(); i++) { + info.data_type = tensor.data_type; + int byte_per_element = GetByteNum(tensor.data_type); + for (auto i = 0 ; i < tensor.dim_size ; i++) { + tensor.dims[i] = output_info.dims(i); info.size *= output_info.dims(i); info.shape.push_back(output_info.dims(i)); } - info.buf = std::shared_ptr<uint8_t>(new uint8_t[info.size], - std::default_delete<uint8_t[]>()); + info.buf = + std::shared_ptr<uint8_t>(new uint8_t[info.size * byte_per_element], + std::default_delete<uint8_t[]>()); for (auto op_def : net_def.op()) { if (output_info.name() == op_def.output(0)) { - info.scale = op_def.quantize_info(0).scale(); - info.zero_point = op_def.quantize_info(0).zero_point(); + if (info.data_type == static_cast<apu_data_type>(APU_DATA_TYPE_UINT8) || + info.data_type == static_cast<apu_data_type>(APU_DATA_TYPE_INT16)) { + info.scale = op_def.quantize_info(0).scale(); + info.zero_point = op_def.quantize_info(0).zero_point(); + } else { + info.scale = 0.0; + info.zero_point = 0; + } } } output_infos.push_back(info); - 
output_buffers.push_back(info.buf.get()); + tensor.data_buf = info.buf.get(); + output_tensors.push_back(tensor); } - + // const tensors + std::vector<apu_tensor> const_tensors; // operators std::vector<apu_operator> ops; std::vector<std::vector<int>> cached_op_inputs; - for (auto op_def : net_def.op()) { - apu_operator op; - strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE); - op.input_size = op_def.node_input_size(); - std::vector<int> input_ids; - for (auto i = 0; i < op.input_size; i++) { - input_ids.push_back(op_def.node_input(i).node_id()); - } - cached_op_inputs.push_back(input_ids); - op.input_ids = cached_op_inputs.back().data(); - op.output.tensor_id = op_def.node_id(); - op.output.tensor_type = APU_TENSOR_OP_OUTPUT; - op.output.data_type = MapToApuDataType(op_def.output_type(0)); - if (op.output.data_type == APU_DATA_TYPE_UINT8) { - op.output.scale = op_def.quantize_info(0).scale(); - op.output.zero_point = op_def.quantize_info(0).zero_point(); - } else { - op.output.scale = 0.0f; - op.output.zero_point = 0; - } - op.output.dim_size = op_def.output_shape(0).dims_size(); - MACE_CHECK(op.output.dim_size <= APU_TENSOR_MAX_DIMS, - "tensor dimension size not supported"); - for (auto i = 0; i < op.output.dim_size; i++) { - op.output.dims[i] = op_def.output_shape(0).dims(i); + if (!load) { + // const tensors + for (auto const_tensor : net_def.tensors()) { + apu_tensor tensor; + tensor.tensor_id = const_tensor.node_id(); + tensor.tensor_type = (tensor.tensor_id < const_data_num) ? + APU_TENSOR_CONST_DATA : + APU_TENSOR_CONST_ARGUMENT; + tensor.data_type = MapToApuDataType(const_tensor.data_type()); + tensor.scale = const_tensor.has_scale() ? const_tensor.scale() : 0.0f; + tensor.zero_point = const_tensor.has_zero_point() ? + const_tensor.zero_point() : 0; + tensor.dim_size = const_tensor.dims_size(); + MACE_CHECK(tensor.dim_size <= APU_TENSOR_MAX_DIMS, + "tensor dimension size not supported"); + for (auto i = 0 ; i < tensor.dim_size ; i++) { + tensor.dims[i] = const_tensor.dims(i); + } + tensor.data_buf = + const_cast<unsigned char *>(model_data + const_tensor.offset()); + const_tensors.push_back(tensor); } - op.output.data_buf = nullptr; - // get op mode and activation mode - bool is_pooling = (strcmp(op.type, "Pooling") == 0); - bool is_eltwise = (strcmp(op.type, "Eltwise") == 0); - std::string activation; - float max_limit = 0.0f; - for (auto arg : op_def.arg()) { - if (arg.name().compare("activation") == 0) { - activation = arg.s(); + // operators + for (auto op_def : net_def.op()) { + apu_operator op; + strncpy(op.type, op_def.type().c_str(), APU_OP_TYPE_MAX_SIZE); + op.input_size = op_def.node_input_size(); + std::vector<int> input_ids; + for (auto i = 0 ; i < op.input_size ; i++) { + input_ids.push_back(op_def.node_input(i).node_id()); } - if (arg.name().compare("max_limit") == 0) { - max_limit = arg.f(); + cached_op_inputs.push_back(input_ids); + op.input_ids = cached_op_inputs.back().data(); + op.output.tensor_id = op_def.node_id(); + op.output.tensor_type = APU_TENSOR_OP_OUTPUT; + op.output.data_type = MapToApuDataType(op_def.output_type(0)); + if (op.output.data_type == APU_DATA_TYPE_UINT8 || + op.output.data_type == APU_DATA_TYPE_INT16) { + op.output.scale = op_def.quantize_info(0).scale(); + op.output.zero_point = op_def.quantize_info(0).zero_point(); + } else { + op.output.scale = 0.0f; + op.output.zero_point = 0; } - if (is_pooling && arg.name().compare("pooling_type") == 0) { - op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i())); + op.output.dim_size = op_def.output_shape(0).dims_size(); + MACE_CHECK(op.output.dim_size <= 
APU_TENSOR_MAX_DIMS, + "tensor dimension size not supported"); + for (auto i = 0 ; i < op.output.dim_size ; i++) { + op.output.dims[i] = op_def.output_shape(0).dims(i); } - if (is_eltwise && arg.name().compare("type") == 0) { - op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i())); + op.output.data_buf = nullptr; + // get op mode and activation mode + bool is_pooling = (strcmp(op.type, "Pooling") == 0); + bool is_eltwise = (strcmp(op.type, "Eltwise") == 0); + std::string activation; + float max_limit = 0.0f; + for (auto arg : op_def.arg()) { + if (arg.name().compare("activation") == 0) { + activation = arg.s(); + } + if (arg.name().compare("max_limit") == 0) { + max_limit = arg.f(); + } + if (is_pooling && arg.name().compare("pooling_type") == 0) { + op.op_mode = static_cast<int>(MapToApuPoolingMode(arg.i())); + } + if (is_eltwise && arg.name().compare("type") == 0) { + op.op_mode = static_cast<int>(MapToApuEltwiseMode(arg.i())); + } } + if (activation.compare("RELU") == 0) { + op.act_mode = APU_ACT_RELU; + } else if (activation.compare("RELUX") == 0 && max_limit == 6.0) { + op.act_mode = APU_ACT_RELU6; + } else if (activation.compare("SIGMOID") == 0) { + op.act_mode = APU_ACT_SIGMOID; + } else if (activation.compare("TANH") == 0) { + op.act_mode = APU_ACT_TANH; + } else { + op.act_mode = APU_ACT_NONE; + } + ops.push_back(op); } - if (activation.compare("RELU") == 0) { - op.act_mode = APU_ACT_RELU; - } else if (activation.compare("RELUX") == 0 && max_limit == 6.0) { - op.act_mode = APU_ACT_RELU6; - } else { - op.act_mode = APU_ACT_NONE; - } - ops.push_back(op); } - bool print_model = false; bool ret = frontend->InitGraph( - const_tensors.size(), const_tensors.data(), - input_tensors.size(), input_tensors.data(), - output_tensor_ids.size(), output_tensor_ids.data(), - output_buffers.data(), - ops.size(), ops.data(), - print_model); + const_tensors.size(), const_tensors.data(), + input_tensors.size(), input_tensors.data(), + output_tensors.size(), output_tensors.data(), + ops.size(), ops.data(), + print_model, file_name, load, store); cached_op_inputs.clear(); - MACE_CHECK(ret == true, "apu init graph failed"); - return ret; } @@ -247,22 +268,35 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors, MACE_ASSERT(input_tensors.size() == input_infos.size(), "Wrong inputs num"); MACE_ASSERT(output_tensors.size() == output_infos.size(), "Wrong outputs num"); - // prepare input for (int i = 0 ; i < static_cast<int>(input_tensors.size()) ; i++) { Tensor *tensor = input_tensors.at(input_infos[i].name); // check size - int size = input_infos[i].size; - MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong input size"); - + int element_size = input_infos[i].size; + int byte_per_element = GetByteNum(input_infos[i].data_type); + MACE_ASSERT(element_size == static_cast<int>(tensor->size()), + "Wrong input size"); // quantize - quantize_util_.QuantizeWithScaleAndZeropoint( - tensor->data<float>(), - size, - input_infos[i].scale, - input_infos[i].zero_point, - input_infos[i].buf.get()); + if (input_infos[i].data_type == APU_DATA_TYPE_INT16) { + quantize_util_.QuantizeWithScaleAndZeropoint( + (const float*)tensor->raw_data(), + element_size, + input_infos[i].scale, + input_infos[i].zero_point, + reinterpret_cast<int16_t *>(input_infos[i].buf.get())); + } else if (input_infos[i].data_type == APU_DATA_TYPE_FLOAT) { + std::memcpy(input_infos[i].buf.get(), + (const float*)tensor->raw_data(), + element_size * byte_per_element); + } else { + quantize_util_.QuantizeWithScaleAndZeropoint( + (const float*)tensor->raw_data(), + element_size, + 
input_infos[i].scale, + input_infos[i].zero_point, + input_infos[i].buf.get()); + } } // run model @@ -276,16 +310,30 @@ bool ApuWrapper::Run(const std::map<std::string, Tensor *> &input_tensors, // prepare out buffer tensor->SetDtype(DT_FLOAT); tensor->Resize(output_infos[i].shape); - int size = output_infos[i].size; - MACE_ASSERT(size == static_cast<int>(tensor->size()), "Wrong output size"); - + int element_size = output_infos[i].size; + int byte_per_element = GetByteNum(output_infos[i].data_type); + MACE_ASSERT(element_size == static_cast<int>(tensor->size()), + "Wrong output size"); // dequantize - quantize_util_.Dequantize( - output_infos[i].buf.get(), - size, - output_infos[i].scale, - output_infos[i].zero_point, - tensor->mutable_data<float>()); + if (output_infos[i].data_type == APU_DATA_TYPE_INT16) { + quantize_util_.Dequantize( + reinterpret_cast<int16_t *>(output_infos[i].buf.get()), + element_size, + output_infos[i].scale, + output_infos[i].zero_point, + reinterpret_cast<float *>(tensor->raw_mutable_data())); + } else if (output_infos[i].data_type == APU_DATA_TYPE_FLOAT) { + std::memcpy(reinterpret_cast<float *>(tensor->raw_mutable_data()), + output_infos[i].buf.get(), + element_size * byte_per_element); + } else { + quantize_util_.Dequantize( + output_infos[i].buf.get(), + element_size, + output_infos[i].scale, + output_infos[i].zero_point, + reinterpret_cast<float *>(tensor->raw_mutable_data())); + } } return true; @@ -299,4 +347,20 @@ bool ApuWrapper::Uninit() { return ret; } +int ApuWrapper::GetByteNum(apu_data_type data_type) { + int byte_per_element; + if (data_type == APU_DATA_TYPE_FLOAT || data_type == APU_DATA_TYPE_INT32) { + byte_per_element = 4; + } else if (data_type == APU_DATA_TYPE_HALF || + data_type == APU_DATA_TYPE_INT16) { + byte_per_element = 2; + } else if (data_type == APU_DATA_TYPE_UINT8) { + byte_per_element = 1; + } else { + byte_per_element = 1; + MACE_CHECK(false, "unsupported data type"); + } + return byte_per_element; +} + } // namespace mace diff --git a/mace/core/runtime/apu/apu_wrapper.h b/mace/core/runtime/apu/apu_wrapper.h old mode 100755 new mode 100644 index 7b87e56c8500a854904111c6aed2678cf9d13ce3..a18694edd0681f0b9a65a2ceaa922a1e6bf0582c --- a/mace/core/runtime/apu/apu_wrapper.h +++ b/mace/core/runtime/apu/apu_wrapper.h @@ -37,12 +37,14 @@ class ApuWrapper { int size; float scale; int zero_point; + apu_data_type data_type; }; public: explicit ApuWrapper(Device *device); - bool Init(const NetDef &net_def, unsigned const char *model_data, - const index_t model_data_size); + bool Init(const NetDef& net_def, unsigned const char *model_data = nullptr, + const char *file_name = nullptr, + bool load = false, bool store = false); bool Run(const std::map<std::string, Tensor *> &input_tensors, std::map<std::string, Tensor *> *output_tensors); bool Uninit(); @@ -51,6 +53,7 @@ class ApuWrapper { apu_data_type MapToApuDataType(DataType mace_type); apu_pooling_mode MapToApuPoolingMode(int mace_mode); apu_eltwise_mode MapToApuEltwiseMode(int mace_mode); + int GetByteNum(apu_data_type data_type); private: ApuFrontend *frontend; diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 743d10c6acdad0559c7aa310f0f653ef6eb897c8..d31f9eb56e1415f3691da8593926263abfb6b846 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -201,6 +201,10 @@ class MaceEngineConfig::Impl { bool dcvs_enable, int latency); + MaceStatus SetAPUCache(APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file); + inline DeviceType device_type() const { return device_type_; } @@ -237,6 +241,18 @@ class MaceEngineConfig::Impl { return hexagon_latency_; } + inline 
APUCachePolicy apu_cache_policy() const { return apu_cache_policy_; } + + inline std::string apu_binary_file() const { return apu_binary_file_; } + + inline std::string apu_storage_file() const { return apu_storage_file_; } + private: DeviceType device_type_; int num_threads_; @@ -247,6 +263,9 @@ class MaceEngineConfig::Impl { HexagonNNCornerType hexagon_corner_; bool hexagon_dcvs_enable_; int hexagon_latency_; + APUCachePolicy apu_cache_policy_; + std::string apu_binary_file_; + std::string apu_storage_file_; }; MaceEngineConfig::Impl::Impl(const DeviceType device_type) @@ -258,7 +277,10 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) gpu_perf_hint_(GPUPerfHint::PERF_NORMAL), hexagon_corner_(HexagonNNCornerType::HEXAGON_NN_CORNER_TURBO), hexagon_dcvs_enable_(true), - hexagon_latency_(100) {} + hexagon_latency_(100), + apu_cache_policy_(APUCachePolicy::APU_CACHE_NONE), + apu_binary_file_(""), + apu_storage_file_("") {} MaceStatus MaceEngineConfig::Impl::SetGPUContext( std::shared_ptr<GPUContext> context) { @@ -282,14 +304,15 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( return MaceStatus::MACE_SUCCESS; } +#ifdef MACE_ENABLE_HEXAGON MaceStatus MaceEngineConfig::Impl::SetHexagonToUnsignedPD() { bool ret = false; -#ifdef MACE_ENABLE_HEXAGON ret = HexagonDSPWrapper::RequestUnsignedPD(); -#endif return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; } +#endif +#ifdef MACE_ENABLE_HEXAGON MaceStatus MaceEngineConfig::Impl::SetHexagonPower( HexagonNNCornerType corner, bool dcvs_enable, @@ -298,11 +321,24 @@ MaceStatus MaceEngineConfig::Impl::SetHexagonPower( hexagon_dcvs_enable_ = dcvs_enable; hexagon_latency_ = latency; bool ret = false; -#ifdef MACE_ENABLE_HEXAGON ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency); + return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; +} #endif + +#ifdef MACE_ENABLE_APU +MaceStatus MaceEngineConfig::Impl::SetAPUCache( + APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file) { + bool ret = false; + apu_cache_policy_ = policy; + apu_binary_file_ = binary_file; + apu_storage_file_ = storage_file; + ret = true; return ret ? 
MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; } +#endif MaceEngineConfig::MaceEngineConfig( const DeviceType device_type) @@ -338,6 +374,13 @@ MaceStatus MaceEngineConfig::SetHexagonPower( return impl_->SetHexagonPower(corner, dcvs_enable, latency); } +MaceStatus MaceEngineConfig::SetAPUCache( + APUCachePolicy policy, + const std::string &binary_file, + const std::string &storage_file) { + return impl_->SetAPUCache(policy, binary_file, storage_file); +} + // Mace Tensor class MaceTensor::Impl { public: @@ -478,6 +521,9 @@ class MaceEngine::Impl { #endif #ifdef MACE_ENABLE_APU std::unique_ptr<ApuWrapper> apu_controller_; + APUCachePolicy apu_cache_policy_; + std::string apu_binary_file_; + std::string apu_storage_file_; #endif MACE_DISABLE_COPY_AND_ASSIGN(Impl); @@ -504,6 +550,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) #endif #ifdef MACE_ENABLE_APU , apu_controller_(nullptr) + , apu_cache_policy_(config.impl_->apu_cache_policy()) + , apu_binary_file_(config.impl_->apu_binary_file()) + , apu_storage_file_(config.impl_->apu_storage_file()) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); @@ -660,8 +709,21 @@ MaceStatus MaceEngine::Impl::Init( #ifdef MACE_ENABLE_APU if (device_type_ == APU) { apu_controller_.reset(new ApuWrapper(device_.get())); - MACE_CHECK(apu_controller_->Init( - *net_def, model_data, model_data_size), "apu init error"); + bool cache_load = apu_cache_policy_ == APUCachePolicy::APU_CACHE_LOAD; + bool cache_store = apu_cache_policy_ == APUCachePolicy::APU_CACHE_STORE; + const char* file_name = cache_store ? + apu_storage_file_.c_str() : apu_binary_file_.c_str(); + bool ret = false; + if (cache_load || cache_store) { + VLOG(1) << "Loading/Storing init cache"; + ret = apu_controller_->Init( + *net_def, model_data, file_name, cache_load, cache_store); + } + if (!ret && !cache_store) { + VLOG(1) << "Do not use init cache"; + ret = apu_controller_->Init(*net_def, model_data); + } + MACE_CHECK(ret, "apu init error", cache_load, cache_store); } else { #endif MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index d92f9b13d244a88754579de12cabf35fe6476fc7..0997046095be6325f70a70f4626f97ba32e81857 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -15,6 +15,7 @@ enum DataType { DT_INT32 = 4; DT_FLOAT16 = 5; DT_BFLOAT16 = 6; + DT_INT16 = 7; } enum MemoryType { diff --git a/mace/tools/BUILD.bazel b/mace/tools/BUILD.bazel index 43201a290903a3597f1fad90c555ca38da68e358..6f66158a1c8f7e144a8ee09f014584897f1dd012 100644 --- a/mace/tools/BUILD.bazel +++ b/mace/tools/BUILD.bazel @@ -5,6 +5,7 @@ load( "if_android", "if_hexagon_enabled", "if_opencl_enabled", + "if_apu_enabled", ) licenses(["notice"]) # Apache 2.0 @@ -22,6 +23,8 @@ cc_binary( "-DMACE_ENABLE_OPENCL", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_apu_enabled([ + "-DMACE_ENABLE_APU", ]), linkstatic = 1, deps = [ @@ -46,6 +49,8 @@ cc_binary( "-DMACE_ENABLE_OPENCL", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_apu_enabled([ + "-DMACE_ENABLE_APU", ]), linkopts = [ "-lm", diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 25b054111cdf8f88bab16bd11cdaaa56aa1e7002..61fc3369e5a9b4bf350cc2ae547bde778b6bd333 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -144,6 +144,12 @@ DEFINE_string(model_data_file, DEFINE_string(model_file, "", "model file name, used when load mace model in pb"); +DEFINE_string(apu_binary_file, + "", + "apu init cache path, used when loading apu 
init cache"); +DEFINE_string(apu_storage_file, + "", + "apu init cache path, used when store apu init cache"); DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON/APU"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); @@ -153,6 +159,7 @@ DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(num_threads, -1, "num of threads"); DEFINE_int32(cpu_affinity_policy, 1, "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); +DEFINE_int32(apu_cache_policy, 0, "0:NONE/1:STORE/2:LOAD"); DEFINE_bool(benchmark, false, "enable benchmark op"); bool RunModel(const std::string &model_name, @@ -201,6 +208,11 @@ bool RunModel(const std::string &model_name, // firmware) or 8250 family above to run hexagon nn on unsigned PD. // config.SetHexagonToUnsignedPD(); config.SetHexagonPower(HEXAGON_NN_CORNER_TURBO, true, 100); +#endif +#ifdef MACE_ENABLE_APU + config.SetAPUCache(static_cast(FLAGS_apu_cache_policy), + FLAGS_apu_binary_file, + FLAGS_apu_storage_file); #endif std::unique_ptr model_graph_data = make_unique(); @@ -539,6 +551,9 @@ int Main(int argc, char **argv) { LOG(INFO) << "output dir: " << FLAGS_output_dir; LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; LOG(INFO) << "model_file: " << FLAGS_model_file; + LOG(INFO) << "apu_cache_policy: " << FLAGS_apu_cache_policy; + LOG(INFO) << "apu_binary_file: " << FLAGS_apu_binary_file; + LOG(INFO) << "apu_storage_file: " << FLAGS_apu_storage_file; LOG(INFO) << "device: " << FLAGS_device; LOG(INFO) << "round: " << FLAGS_round; LOG(INFO) << "restart_round: " << FLAGS_restart_round; diff --git a/third_party/apu/ApuFrontend.h b/third_party/apu/ApuFrontend.h old mode 100755 new mode 100644 index a715f1dc93d64e3790a220851b4101115f45a3d9..352185f5c08fa84c187a200d8b9312b0c82f9ce2 --- a/third_party/apu/ApuFrontend.h +++ b/third_party/apu/ApuFrontend.h @@ -6,6 +6,8 @@ enum apu_act_mode { APU_ACT_NONE = 0, APU_ACT_RELU = 1, APU_ACT_RELU6 = 2, + APU_ACT_SIGMOID = 3, + APU_ACT_TANH = 4, }; enum apu_pooling_mode { @@ -29,6 +31,7 @@ enum apu_data_type { APU_DATA_TYPE_UINT8 = 2, APU_DATA_TYPE_HALF = 3, APU_DATA_TYPE_INT32 = 4, + APU_DATA_TYPE_INT16 = 5, }; enum apu_tensor_type { @@ -37,6 +40,7 @@ enum apu_tensor_type { APU_TENSOR_CONST_ARGUMENT = 2, APU_TENSOR_MODEL_INPUT = 3, APU_TENSOR_OP_OUTPUT = 4, + APU_TENSOR_MODEL_OUTPUT = 5, }; #define APU_TENSOR_MAX_DIMS 4 @@ -70,10 +74,10 @@ class ApuFrontend { bool InitGraph(int const_tensor_size, const apu_tensor* const_tensors, int input_tensor_size, const apu_tensor* input_tensors, - int output_tensor_size, const int* output_tensor_ids, - void** output_buffers, + int output_tensor_size, const apu_tensor* output_tensors, int operator_size, const apu_operator* operators, - bool print_model); + bool print_model, const char *file_name, + bool load, bool store); bool RunGraph(); bool UninitGraph(); diff --git a/third_party/apu/libapu-frontend.so b/third_party/apu/libapu-frontend.so old mode 100755 new mode 100644 index a6ffaa76aa685c29c224bf61138da004c533309a..79a1be2ef424c7fa473f153c7a66281001d41023 Binary files a/third_party/apu/libapu-frontend.so and b/third_party/apu/libapu-frontend.so differ diff --git a/third_party/apu/mt6853/libapu-frontend.so b/third_party/apu/mt6853/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..2144858555caa0c0926de15a16eeab9ce3aabf46 Binary files /dev/null and b/third_party/apu/mt6853/libapu-frontend.so differ diff --git a/third_party/apu/mt6853/libapu-platform.so 
b/third_party/apu/mt6853/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..7537371553ec0daf3c97c6277a2ba16a3275b173 Binary files /dev/null and b/third_party/apu/mt6853/libapu-platform.so differ diff --git a/third_party/apu/mt6873/libapu-frontend.so b/third_party/apu/mt6873/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..453f5388c1986bd749fec9d0249dc7c0fbe7e530 Binary files /dev/null and b/third_party/apu/mt6873/libapu-frontend.so differ diff --git a/third_party/apu/mt6873/libapu-platform.so b/third_party/apu/mt6873/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..af29cee6a9f6554595fd8c9066dbaff12a4fe07c Binary files /dev/null and b/third_party/apu/mt6873/libapu-platform.so differ diff --git a/third_party/apu/mt6885/libapu-frontend.so b/third_party/apu/mt6885/libapu-frontend.so new file mode 100644 index 0000000000000000000000000000000000000000..453f5388c1986bd749fec9d0249dc7c0fbe7e530 Binary files /dev/null and b/third_party/apu/mt6885/libapu-frontend.so differ diff --git a/third_party/apu/mt6885/libapu-platform.so b/third_party/apu/mt6885/libapu-platform.so new file mode 100644 index 0000000000000000000000000000000000000000..af29cee6a9f6554595fd8c9066dbaff12a4fe07c Binary files /dev/null and b/third_party/apu/mt6885/libapu-platform.so differ diff --git a/tools/common.py b/tools/common.py index fd17f80fb7d06059a6dcdb3a1e31c6ed67b3a9d9..a736960d3789c090793e1d62b1caec5f70c7fa61 100644 --- a/tools/common.py +++ b/tools/common.py @@ -415,6 +415,7 @@ class YAMLKeyword(object): quantize_large_weights = 'quantize_large_weights' quantize_range_file = 'quantize_range_file' quantize_stat = 'quantize_stat' + quantize_schema = 'quantize_schema' change_concat_ranges = 'change_concat_ranges' validation_inputs_data = 'validation_inputs_data' validation_threshold = 'validation_threshold' diff --git a/tools/converter.py b/tools/converter.py index 3f1601d1e53644c9e01332007cadae95ecdd3375..0b5d81095d06eac6addbc9ace79eef46d72ca848 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -122,6 +122,7 @@ class DefaultValues(object): cpu_affinity_policy = 1, gpu_perf_hint = 3, gpu_priority_hint = 3, + apu_cache_policy = 0, class ValidationThreshold(object): @@ -1175,6 +1176,21 @@ def parse_args(): "--benchmark", action="store_true", help="enable op benchmark.") + run.add_argument( + "--apu_cache_policy", + type=int, + default=DefaultValues.apu_cache_policy, + help="0:NONE/1:STORE/2:LOAD") + run.add_argument( + "--apu_binary_file", + type=str, + default="", + help="apu cache load path.") + run.add_argument( + "--apu_storage_file", + type=str, + default="", + help="apu cache store path.") return parser.parse_known_args() diff --git a/tools/device.py b/tools/device.py index 312cbb0855ca21c8bc8363fb00c1ddbc13be1d34..fc49a5dc15cbe36a04ab678d27cfd9ffa9879264 100644 --- a/tools/device.py +++ b/tools/device.py @@ -177,6 +177,9 @@ class DeviceWrapper: cpu_affinity_policy=1, gpu_perf_hint=3, gpu_priority_hint=3, + apu_cache_policy=0, + apu_binary_file="", + apu_storage_file="", input_file_name='model_input', output_file_name='model_out', input_dir="", @@ -282,7 +285,20 @@ class DeviceWrapper: "third_party/nnlib/%s/libhexagon_controller.so" % abi, self.data_dir) + apu_storage_cpy = False if device_type == common.DeviceType.APU: + if apu_cache_policy == 1: + if not apu_storage_file: + apu_storage_cpy = True + apu_src_file = model_tag + ".bin" + apu_storage_file = os.path.join(self.data_dir, + apu_src_file) 
+ elif apu_cache_policy == 2: + if os.path.exists(apu_binary_file): + self.push(apu_binary_file, self.data_dir) + apu_binary_file = os.path.join(self.data_dir, + os.path.basename( + apu_binary_file)) self.push("third_party/apu/libapu-frontend.so", self.data_dir) @@ -345,6 +361,9 @@ class DeviceWrapper: (self.data_dir, os.path.basename(opencl_binary_file)), "--opencl_parameter_file=%s/%s" % (self.data_dir, os.path.basename(opencl_parameter_file)), + "--apu_cache_policy=%s" % apu_cache_policy, + "--apu_binary_file=%s" % apu_binary_file, + "--apu_storage_file=%s" % apu_storage_file, ]) if benchmark: cmd.append("--benchmark=%s" % benchmark) @@ -364,6 +383,11 @@ class DeviceWrapper: _out=process_output, _err_to_out=True) self.stdout = "".join(stdout_buff) + + if apu_storage_cpy: + self.pull_from_data_dir( + apu_src_file, '{}/apu_init_cache/'.format(mace_model_dir)) + if not sh_commands.stdout_success(self.stdout): common.MaceLogger.error("Mace Run", "Mace run failed.") @@ -545,6 +569,9 @@ class DeviceWrapper: cpu_affinity_policy=flags.cpu_affinity_policy, gpu_perf_hint=flags.gpu_perf_hint, gpu_priority_hint=flags.gpu_priority_hint, + apu_cache_policy=flags.apu_cache_policy, + apu_binary_file=flags.apu_binary_file, + apu_storage_file=flags.apu_storage_file, runtime_failure_ratio=flags.runtime_failure_ratio, address_sanitizer=flags.address_sanitizer, opencl_binary_file=model_opencl_output_bin_path, diff --git a/tools/python/convert.py b/tools/python/convert.py index f12e613dbe559c00431ab0e8d563cb4004543cab..cde07cffdbf8f375cfd2762a06270b03229762b7 100644 --- a/tools/python/convert.py +++ b/tools/python/convert.py @@ -113,6 +113,8 @@ def convert_model(conf, quantize_stat): option.winograd = conf[ModelKeys.winograd] if ModelKeys.quantize in conf: option.quantize = conf[ModelKeys.quantize] + if ModelKeys.quantize_schema in conf: + option.quantize_schema = conf[ModelKeys.quantize_schema] if ModelKeys.quantize_large_weights in conf: option.quantize_large_weights = conf[ModelKeys.quantize_large_weights] if ModelKeys.quantize_range_file in conf: diff --git a/tools/python/quantize/quantize_util.py b/tools/python/quantize/quantize_util.py index bb80e0dc41ddecb6f424cc3e55d112397d67c7c6..410c049300605718b35eccb5b9ff25a78a4efb6d 100644 --- a/tools/python/quantize/quantize_util.py +++ b/tools/python/quantize/quantize_util.py @@ -171,6 +171,24 @@ def quantize(data, device, non_zero): return quantized_data +# only support int16 symmetric quantization. 
+def quantize_int16(data): + np_data = np.array(data).astype(float) + max_val = max(abs(np_data.min()), abs(np_data.max())) + scale = max_val / 2**15 + zero = 0 + output = np.clip((np.round(zero + np_data / scale).astype(np.int32)), + -2**15, 2**15 - 1) + + quantized_data = QuantizedData() + quantized_data.data = output + quantized_data.scale = scale + quantized_data.zero = zero + quantized_data.minval = -max_val + quantized_data.maxval = max_val + return quantized_data + + def quantize_bias_for_hexagon(data): np_data = np.array(data).astype(float) max_val = max(abs(np_data.min()), abs(np_data.max())) diff --git a/tools/python/template/model_header.jinja2 b/tools/python/template/model_header.jinja2 index ea1c5f6ce3fbf1290e3268793a8b083cdf3e1bc1..cd8d2035902b2b022b77c8e07ee464522f3dbe84 100644 --- a/tools/python/template/model_header.jinja2 +++ b/tools/python/template/model_header.jinja2 @@ -26,6 +26,7 @@ namespace {{tag}} { MACE_API extern const unsigned char *LoadModelData(); +MACE_API extern const int64_t GetModelSize(); MACE_API extern const std::shared_ptr<NetDef> CreateNet(); diff --git a/tools/python/transform/apu_converter.py b/tools/python/transform/apu_converter.py index 12f302ec1752a53117f4855e1bc21e228452cdb1..faeb0be688010cbbe776f635f4b9545b4444e931 100644 --- a/tools/python/transform/apu_converter.py +++ b/tools/python/transform/apu_converter.py @@ -24,6 +24,7 @@ from transform.base_converter import EltwiseType from transform.base_converter import MaceKeyword from transform.base_converter import MaceOp from transform.base_converter import PaddingMode +from transform.base_converter import PadType from transform.base_converter import PoolingType from transform.base_converter import ReduceType from transform.base_converter import DataFormat @@ -32,16 +33,17 @@ from utils.util import mace_check ApuSupportedOps = [ + 'Activation', 'Concat', 'Conv2D', 'DepthwiseConv2d', 'Eltwise', + 'Pad', 'Pooling', 'Reduce', 'ResizeBilinear', 'Reshape', 'Softmax', - 'Squeeze', ] ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str) @@ -50,16 +52,18 @@ ApuOp = Enum('ApuOp', [(op, op) for op in ApuSupportedOps], type=str) class ApuOps(object): def __init__(self): self.apu_ops = { + MaceOp.Activation.name: ApuOp.Activation.name, MaceOp.Concat.name: ApuOp.Concat.name, MaceOp.Conv2D.name: ApuOp.Conv2D.name, MaceOp.DepthwiseConv2d.name: ApuOp.DepthwiseConv2d.name, MaceOp.Eltwise.name: ApuOp.Eltwise.name, + MaceOp.Pad.name: ApuOp.Pad.name, MaceOp.Pooling.name: ApuOp.Pooling.name, MaceOp.Reduce.name: ApuOp.Reduce.name, MaceOp.ResizeBilinear.name: ApuOp.ResizeBilinear.name, MaceOp.Reshape.name: ApuOp.Reshape.name, MaceOp.Softmax.name: ApuOp.Softmax.name, - MaceOp.Squeeze.name: ApuOp.Squeeze.name, + MaceOp.Squeeze.name: ApuOp.Reshape.name, } def has_op(self, op_name): @@ -78,17 +82,30 @@ class ApuConverter(base_converter.ConverterInterface): self._apu_ops = ApuOps() def run(self): - self.use_uint8_in_out() + if self._option.quantize: + self.use_quant_in_out() self.add_op_output_type() self.ensure_bias_vector() + self.ensure_binary_input() self.common_check() if ConverterUtil.get_arg(self._model.op[0], MaceKeyword.mace_framework_type_str).i == \ FrameworkType.TENSORFLOW.value: self.add_tensorflow_padding_value() + # Calculate the number of apu constant tensors + # Any tensors which will be apu constant tensors should be added + # above this line const_data_num_arg = self._model.arg.add() const_data_num_arg.name = MaceKeyword.mace_const_data_num_arg_str const_data_num_arg.i = len(self._model.tensors) + 
apu_data_type_arg = self._model.arg.add() + apu_data_type_arg.name = MaceKeyword.mace_apu_data_type_arg_str + if self._option.quantize_schema == 'mace_apu_16bit_per_tensor': + apu_data_type_arg.i = mace_pb2.DT_INT16 + elif self._option.quantize: + apu_data_type_arg.i = mace_pb2.DT_UINT8 + else: + apu_data_type_arg.i = mace_pb2.DT_FLOAT self.convert_ops() self.add_node_id() return self._model @@ -104,9 +121,11 @@ class ApuConverter(base_converter.ConverterInterface): ' match') mace_check(len(op.output_shape[0].dims) <= 4, op.name + ': apu only support 1D~4D tensor') - mace_check(len(op.output) == len(op.quantize_info), - op.name + ': length of output and quantize_info not' - ' match') + if op.output_type[0] == mace_pb2.DT_UINT8 \ + or op.output_type[0] == mace_pb2.DT_INT16: + mace_check(len(op.output) == len(op.quantize_info), + op.name + ': length of output and quantize_info not' + ' match') data_format = ConverterUtil.data_format(op) if data_format is not None and len(op.output_shape[0].dims) == 4: mace_check((data_format == DataFormat.NHWC) @@ -117,9 +136,11 @@ class ApuConverter(base_converter.ConverterInterface): op, MaceKeyword.mace_activation_type_str) if act_mode_arg is not None: mace_check(act_mode_arg.s == b'RELU' - or act_mode_arg.s == b'RELUX', - op.name + ': apu only support activation RELU and' - ' RELUX') + or act_mode_arg.s == b'RELUX' + or act_mode_arg.s == b'TANH' + or act_mode_arg.s == b'SIGMOID', + op.name + ': apu only support activation RELU,' + ' RELUX, TANH and SIGMOID') for tensor in self._model.tensors: mace_check(len(tensor.dims) <= 4, tensor.name + ': apu only support 1D~4D tensor') @@ -138,7 +159,6 @@ class ApuConverter(base_converter.ConverterInterface): for op in self._model.op: if not self._apu_ops.has_op(op.type): raise Exception('Unsupported op: ', op) - if op.type == MaceOp.Conv2D.name \ or op.type == MaceOp.DepthwiseConv2d.name: mace_check(len(op.input) == 3, @@ -146,7 +166,7 @@ class ApuConverter(base_converter.ConverterInterface): ' with 3 input') self.add_size_tensor_from_arg( op, MaceKeyword.mace_strides_str) - self.add_padding_tensor_from_arg(op) + self.add_padding_value_tensor_from_arg(op) self.add_size_tensor_from_arg( op, MaceKeyword.mace_dilations_str) if op.type == MaceOp.DepthwiseConv2d.name: @@ -160,22 +180,64 @@ class ApuConverter(base_converter.ConverterInterface): break op.input.extend([multiplier.name]) elif op.type == MaceOp.Eltwise.name: + eltwise_type = ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i + # We only handle SUM and PROD operators now which are + # commutative mace_check(len(op.input) == 2, op.name + ': apu only support eltwise op with 2' ' input') - eltwise_type = ConverterUtil.get_arg( - op, MaceKeyword.mace_element_type_str).i - mace_check(eltwise_type == EltwiseType.SUM.value, - op.name + ': apu only support eltwise type SUM') + mace_check(eltwise_type == EltwiseType.SUM.value + or eltwise_type == EltwiseType.PROD.value, + op.name + + ': apu only support eltwise type SUM or PROD') + elif op.type == MaceOp.Pad.name: + mace_check(len(op.input) == 1, + op.name + ': apu only support Pad op with 1' + ' input') + pad_type_arg = \ + ConverterUtil.get_arg(op, MaceKeyword.mace_pad_type_str) + if pad_type_arg is not None: + mace_check(PadType(pad_type_arg.i) == + PadType.CONSTANT, op.name + + ': apu only support Pad type CONSTANT') + + padding_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_paddings_str) + mace_check(len(padding_arg.ints) == 8, + op.name + ': paddings does not have size 8') + mace_check({0} == 
+ {padding_arg.ints[0], padding_arg.ints[1], + padding_arg.ints[6], padding_arg.ints[7]}, + op.name + ': apu only support Pad op with padding' + ' in H/W dimensions') + data_type = ConverterUtil.get_arg(op, 'T').i + constant_value_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_constant_value_str) + if constant_value_arg is not None: + if data_type in [mace_pb2.DT_FLOAT, mace_pb2.DT_HALF]: + constant_value = constant_value_arg.f + elif data_type == mace_pb2.DT_INT32: + constant_value = constant_value_arg.i + else: + mace_check(False, "Not supported data type") + + mace_check(constant_value == 0, + op.name + ': apu only support Pad op with zero' + ' padding') + self.add_paddings_tensor_from_arg(op) + elif op.type == MaceOp.Pooling.name: mace_check(len(op.input) == 1, op.name + ': apu only support pooling op with 1' ' input') pooling_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_pooling_type_str) - mace_check(PoolingType(pooling_type_arg.i) == PoolingType.AVG, - op.name + ': apu only support pooling type AVG') - self.add_padding_tensor_from_arg(op) + mace_check(PoolingType(pooling_type_arg.i) in + [PoolingType.AVG, PoolingType.MAX], + op.name + ': apu only support pooling type AVG,' + ' MAX') + self.add_padding_value_tensor_from_arg(op) self.add_size_tensor_from_arg( op, MaceKeyword.mace_strides_str) self.add_size_tensor_from_arg(op, MaceKeyword.mace_kernel_str) @@ -213,8 +275,7 @@ class ApuConverter(base_converter.ConverterInterface): mace_check(len(op.input) == 1, op.name + ': apu only support squeeze op with 1' ' input') - self.add_int_list_tensor_from_arg( - op, MaceKeyword.mace_axis_str) + self.add_shape_tensor_from_axis_arg(op) op.type = self._apu_ops.map_nn_op(op.type) @@ -222,7 +283,12 @@ class ApuConverter(base_converter.ConverterInterface): type_map = {} for input_info in self._model.input_info: # will do input quantize in wrapper - type_map[input_info.name] = mace_pb2.DT_UINT8 + if self._option.quantize_schema == 'mace_apu_16bit_per_tensor': + type_map[input_info.name] = mace_pb2.DT_INT16 + elif self._option.quantize: + type_map[input_info.name] = mace_pb2.DT_UINT8 + else: + type_map[input_info.name] = mace_pb2.DT_FLOAT for op in self._model.op: if len(op.output_type) >= 1: @@ -239,8 +305,11 @@ class ApuConverter(base_converter.ConverterInterface): op.name + ': length of output and output_type not' ' match') mace_check(op.output_type[0] == mace_pb2.DT_UINT8 - or op.output_type[0] == mace_pb2.DT_INT32, - op.name + ': apu only support quantized node') + or op.output_type[0] == mace_pb2.DT_INT16 + or op.output_type[0] == mace_pb2.DT_INT32 + or op.output_type[0] == mace_pb2.DT_FLOAT, + op.name + ': apu only support quantized or float16' + ' node') def add_node_id(self): node_id_counter = 0 @@ -266,7 +335,7 @@ class ApuConverter(base_converter.ConverterInterface): for output_info in self._model.output_info: output_info.node_id = node_id_map[output_info.name] - def add_padding_tensor_from_arg(self, op): + def add_padding_value_tensor_from_arg(self, op): padding_value_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_padding_values_str) mace_check(len(padding_value_arg.ints) == 4, @@ -278,6 +347,19 @@ class ApuConverter(base_converter.ConverterInterface): padding_value_tensor.int32_data.extend(padding_value_arg.ints) op.input.extend([padding_value_tensor.name]) + def add_paddings_tensor_from_arg(self, op): + padding_value_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_paddings_str) + padding_value_tensor = self._model.tensors.add() + padding_value_tensor.name = op.name 
+ '/padding:0' + padding_value_tensor.data_type = mace_pb2.DT_INT32 + mace_check(len(padding_value_arg.ints) % 2 == 0, + op.name + ': the rank of paddings should be even') + padding_value_tensor.dims.extend( + [int(len(padding_value_arg.ints) / 2), 2]) + padding_value_tensor.int32_data.extend(padding_value_arg.ints) + op.input.extend([padding_value_tensor.name]) + def add_size_tensor_from_arg(self, op, keyword): size_value_arg = ConverterUtil.get_arg(op, keyword) mace_check(len(size_value_arg.ints) == 2, @@ -311,6 +393,27 @@ class ApuConverter(base_converter.ConverterInterface): list_value_tensor.int32_data.extend(list_value_arg.ints) op.input.extend([list_value_tensor.name]) + def add_shape_tensor_from_axis_arg(self, op): + list_value_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str) + mace_check(list_value_arg.ints is not None, + op.name + ': ' + MaceKeyword.mace_axis_str + + ' value ints should not be None') + axes = list_value_arg.ints + for producer in self._model.op: + if producer.output[0] == op.input[0]: + input_tensor_shape = producer.output_shape[0].dims + break + + shape_tensor = self._model.tensors.add() + shape_tensor.name = op.name + '/' + MaceKeyword.mace_axis_str + ':0' + shape_tensor.data_type = mace_pb2.DT_INT32 + shape_tensor.dims.extend([len(input_tensor_shape) - len(axes)]) + shape_tensor.int32_data.extend(input_tensor_shape) + for axis in sorted(axes, reverse=True): + del shape_tensor.int32_data[axis] + op.input.extend([shape_tensor.name]) + ConverterUtil.del_arg(op, MaceKeyword.mace_axis_str) + def add_tensorflow_padding_value(self): for op in self._model.op: padding_type = ConverterUtil.get_arg( @@ -374,7 +477,8 @@ class ApuConverter(base_converter.ConverterInterface): tensor = self._model.tensors.add() tensor.name = _op.name + '/add/bias_add' tensor.dims.extend([_op.output_shape[0].dims[-1]]) - if _op.output_type[0] == mace_pb2.DT_UINT8: + if _op.output_type[0] == mace_pb2.DT_UINT8 or \ + _op.output_type[0] == mace_pb2.DT_INT16: tensor.data_type = mace_pb2.DT_INT32 input_name = _op.input[0] for input_op in self._model.op: @@ -395,7 +499,46 @@ class ApuConverter(base_converter.ConverterInterface): tensor.float_data.extend([0.0] * tensor.dims[0]) _op.input.extend([tensor.name]) - def use_uint8_in_out(self): + def ensure_binary_input(self): + for _op in self._model.op: + if _op.type != MaceOp.Eltwise.name: + continue + if len(_op.input) != 1: + continue + eltwise_type = ConverterUtil.get_arg( + _op, MaceKeyword.mace_element_type_str).i + if eltwise_type != EltwiseType.SUM.value and \ + eltwise_type != EltwiseType.PROD.value: + continue + + float_value_arg = ConverterUtil.get_arg( + _op, MaceKeyword.mace_scalar_input_str) + mace_check(float_value_arg.f is not None, + _op.name + ': ' + + MaceKeyword.mace_scalar_input_str + + ' value float should not be None') + scalar = float_value_arg.f + const_tensor = self._model.tensors.add() + const_tensor.name = _op.name + '/' + \ + MaceKeyword.mace_scalar_input_str + ':0' + const_tensor.dims.extend([1]) + if _op.output_type[0] == mace_pb2.DT_UINT8 or \ + _op.output_type[0] == mace_pb2.DT_INT16: + const_tensor.data_type = _op.output_type[0] + const_tensor.scale = scalar + const_tensor.zero_point = 0 + const_tensor.quantized = True + const_tensor.int32_data.extend([1]) + elif _op.output_type[0] == mace_pb2.DT_FLOAT: + const_tensor.data_type = mace_pb2.DT_FLOAT + const_tensor.float_data.extend([scalar]) + _op.input.extend([const_tensor.name]) + ConverterUtil.del_arg( + _op, MaceKeyword.mace_scalar_input_str) + 
ConverterUtil.del_arg( + _op, MaceKeyword.mace_scalar_input_index_str) + + def use_quant_in_out(self): replace_dict = {} for input_info in self._model.input_info: if input_info.data_type == mace_pb2.DT_FLOAT: diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py index 4c23e65fbf9efa190c6f8e87e86a507f92b3af8e..38edafb3879d48a177bb64b9c4d28d81c637936b 100644 --- a/tools/python/transform/base_converter.py +++ b/tools/python/transform/base_converter.py @@ -288,6 +288,8 @@ class MaceKeyword(object): mace_p_str = 'p' mace_nor_var_str = 'normalize_variance' mace_across_ch_str = 'across_channels' + mace_apu_16bit_per_tensor = 'mace_apu_16bit_per_tensor' + mace_apu_data_type_arg_str = 'apu_data_type' class TransformerRule(Enum): @@ -335,6 +337,7 @@ class TransformerRule(Enum): FP16_GATHER_WEIGHT = 42 QUANTIZE_LARGE_WEIGHTS = 43 TRANSPOSE_SHAPE_TENSOR_TO_PARAM = 44 + TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV = 45 class ConverterInterface(object): @@ -409,6 +412,7 @@ class ConverterOption(object): self._device = DeviceType.CPU.value self._winograd = 0 self._quantize = False + self._quantize_schema = "" self._quantize_large_weights = False self._quantize_range_file = "" self._change_concat_ranges = False @@ -444,6 +448,10 @@ class ConverterOption(object): def quantize(self): return self._quantize + @property + def quantize_schema(self): + return self._quantize_schema + @property def quantize_large_weights(self): return self._quantize_large_weights @@ -508,6 +516,10 @@ class ConverterOption(object): def quantize(self, quantize): self._quantize = quantize + @quantize_schema.setter + def quantize_schema(self, quantize_schema): + self._quantize_schema = quantize_schema + @quantize_large_weights.setter def quantize_large_weights(self, quantize_large_weights): self._quantize_large_weights = quantize_large_weights @@ -593,6 +605,10 @@ class ConverterOption(object): # Need to be put after SORT_BY_EXECUTION TransformerRule.ADD_QUANTIZE_TENSOR_RANGE, ] + if self._device == DeviceType.APU.value: + self._transformer_option = self._transformer_option + [ + TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV, + ] if self.quantize_large_weights: self._transformer_option = self._transformer_option + [ TransformerRule.QUANTIZE_LARGE_WEIGHTS diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py index c35973a0622ad84f6407bfb4c6a8adde8bcb42a6..136bc9cd5d1051357b931ee22ed8d19edc0e3528 100644 --- a/tools/python/transform/transformer.py +++ b/tools/python/transform/transformer.py @@ -115,6 +115,8 @@ class Transformer(base_converter.ConverterInterface): self.fp16_gather_weight, TransformerRule.QUANTIZE_LARGE_WEIGHTS: self.quantize_large_weights, + TransformerRule.TRANSFORM_SINGLE_BN_TO_DEPTHWISE_CONV: + self.transform_single_bn_to_depthwise_conv, } self._option = option @@ -736,7 +738,6 @@ class Transformer(base_converter.ConverterInterface): net.tensors.remove(scale) self.replace_quantize_info(op, consumer_op) self.safe_remove_node(consumer_op, op) - return True return False @@ -1099,9 +1100,9 @@ class Transformer(base_converter.ConverterInterface): transposed_filter = set() transposed_deconv_filter = set() - if self._option.quantize and \ - (self._option.device == DeviceType.CPU.value or - self._option.device == DeviceType.APU.value): + if ((self._option.quantize and + self._option.device == DeviceType.CPU.value) or + self._option.device == DeviceType.APU.value): print("Transpose filters to OHWI") if filter_format == DataFormat.HWIO: 
transpose_order = [3, 0, 1, 2] @@ -1620,12 +1621,23 @@ class Transformer(base_converter.ConverterInterface): mace_check(data_type_arg, "Data type does not exist for %s(%s)" % (op.name, op.type)) if data_type_arg.i == mace_pb2.DT_FLOAT: - data_type_arg.i = mace_pb2.DT_UINT8 + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + data_type_arg.i = mace_pb2.DT_INT16 + else: + data_type_arg.i = mace_pb2.DT_UINT8 elif data_type_arg.i == mace_pb2.DT_UINT8: mace_check(op.type == MaceOp.Quantize.name or op.type == MaceOp.Dequantize.name, "Only Quantization ops support uint8, " "but got %s(%s)" % (op.name, op.type)) + elif data_type_arg.i == mace_pb2.DT_INT16 \ + and self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + mace_check(op.type == MaceOp.Quantize.name + or op.type == MaceOp.Dequantize.name, + "Only Quantization ops support int16, " + "but got %s(%s)" % (op.name, op.type)) else: mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " @@ -1647,7 +1659,11 @@ class Transformer(base_converter.ConverterInterface): self._model.input_info[i].scale = quantize_info.scale self._model.input_info[i].zero_point = quantize_info.zero_point - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16) + else: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_format_arg(op_def, input_node.data_format) # use actual ranges for model input quantize find_range_every_time_arg = op_def.arg.add() @@ -1670,7 +1686,11 @@ class Transformer(base_converter.ConverterInterface): self._model.output_info[i].scale = quantize_info.scale self._model.output_info[i].zero_point = quantize_info.zero_point - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) + if self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_INT16) + else: + ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_format_arg(op_def, output_node.data_format) quantize_flag_arg = self._model.arg.add() @@ -1725,6 +1745,11 @@ class Transformer(base_converter.ConverterInterface): else: mace_check(False, "wrong device.") tensor.data_type = mace_pb2.DT_INT32 + elif self._option.quantize_schema == \ + MaceKeyword.mace_apu_16bit_per_tensor: + quantized_tensor = \ + quantize_util.quantize_int16(tensor.float_data) + tensor.data_type = mace_pb2.DT_INT16 else: non_zero = self._option.device == DeviceType.CPU.value quantized_tensor = quantize_util.quantize(tensor.float_data, @@ -1781,9 +1806,16 @@ class Transformer(base_converter.ConverterInterface): return False def add_quantize_info(self, op, minval, maxval): - scale, zero, minval, maxval = \ - quantize_util.adjust_range(minval, maxval, self._option.device, - non_zero=False) + quantize_schema = self._option.quantize_schema + if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor: + maxval = max(abs(minval), abs(maxval)) + minval = -maxval + scale = maxval / 2**15 + zero = 0 + else: + scale, zero, minval, maxval = \ + quantize_util.adjust_range(minval, maxval, self._option.device, + non_zero=False) quantize_info = op.quantize_info.add() quantize_info.minval = minval quantize_info.maxval = maxval @@ -1876,6 +1908,7 @@ class Transformer(base_converter.ConverterInterface): def add_quantize_tensor_range(self): # Quantize info from range statistics range_file = 
self._option.quantize_range_file + quantize_schema = self._option.quantize_schema if range_file: print("Add quantize tensor range") post_quantize_info = {} @@ -1884,10 +1917,17 @@ tensor_name, minmax = line.split("@@")[:2] min_val, max_val = [float(i) for i in minmax.strip().split(",")] - scale, zero, min_val, max_val = \ - quantize_util.adjust_range(min_val, max_val, - self._option.device, - non_zero=False) + if (quantize_schema == + MaceKeyword.mace_apu_16bit_per_tensor): + max_val = max(abs(min_val), abs(max_val)) + min_val = -max_val + scale = max_val / 2**15 + zero = 0 + else: + scale, zero, min_val, max_val = \ + quantize_util.adjust_range(min_val, max_val, + self._option.device, + non_zero=False) activation_info = mace_pb2.QuantizeActivationInfo() activation_info.minval = min_val activation_info.maxval = max_val @@ -1918,11 +1958,18 @@ print("Input range %s: %s" % (input_node.name, str(input_node.range))) new_input_name = self.input_name_map[input_node.name] - scale, zero, minval, maxval = \ - quantize_util.adjust_range(input_node.range[0], - input_node.range[1], - self._option.device, - non_zero=False) + if quantize_schema == MaceKeyword.mace_apu_16bit_per_tensor: + maxval = max(abs(input_node.range[0]), + abs(input_node.range[1])) + minval = -maxval + scale = maxval / 2**15 + zero = 0 + else: + scale, zero, minval, maxval = \ + quantize_util.adjust_range(input_node.range[0], + input_node.range[1], + self._option.device, + non_zero=False) quantize_info = \ mace_pb2.QuantizeActivationInfo() quantize_info.minval = minval @@ -2396,3 +2443,37 @@ return True return False + + def transform_single_bn_to_depthwise_conv(self): + for op in self._model.op: + if op.type != MaceOp.BatchNorm.name: + continue + + if len(op.input) != 3: + continue + + producer = self._producer[op.input[0]] + if producer.type in [MaceOp.Conv2D.name, + MaceOp.Deconv2D.name, + MaceOp.DepthwiseDeconv2d.name, + MaceOp.DepthwiseConv2d.name, + MaceOp.BatchToSpaceND.name]: + continue + + op.type = MaceOp.DepthwiseConv2d.name + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = PaddingMode.VALID.value + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend([1, 1]) + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + dilation_arg.ints.extend([1, 1]) + for tensor in self._model.tensors: + if tensor.name == op.input[1]: + tensor.dims[:] = [1, 1, 1, tensor.dims[0]] + break + return True + + return False diff --git a/tools/python/utils/config_parser.py b/tools/python/utils/config_parser.py index c3805da22ed63a8f3935382d5ad372d0d04eb2f4..5d78de49bd48a1e6e3f364af456fa6175d8f4166 100644 --- a/tools/python/utils/config_parser.py +++ b/tools/python/utils/config_parser.py @@ -92,6 +92,7 @@ class ModelKeys(object): weight_sha256_checksum = "weight_sha256_checksum" quantize_range_file = "quantize_range_file" quantize = "quantize" + quantize_schema = "quantize_schema" quantize_large_weights = "quantize_large_weights" quantize_stat = "quantize_stat" change_concat_ranges = "change_concat_ranges" diff --git a/tools/python/utils/device.py b/tools/python/utils/device.py index 9037d0c882f26d5962df68f44e2cf97194237ea3..53d2535804857be857de0a6ae1ae7dabe350c87d 100644 --- a/tools/python/utils/device.py +++ b/tools/python/utils/device.py @@ -36,10 +36,11 @@ def 
execute(cmd, verbose=True): universal_newlines=True) if not verbose: - # use p.communicate instead of p.wait to avoid such situation: pipe is filled and the child process is blocked. + # use p.communicate instead of p.wait to avoid such situation: + # pipe is filled and the child process is blocked. out, err = p.communicate() if p.returncode != 0: - raise Exception("errorcode: {}".format(p.returncode) ) + raise Exception("errorcode: {}".format(p.returncode)) return out buf = []
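
Usage note: the sketch below shows how a client might drive the new cache API from include/mace/public/mace.h, mirroring the store-then-load flow that mace_run.cc and tools/device.py implement in this patch. It is a minimal sketch, not part of the patch itself; the cache path is illustrative and BuildApuConfig is a hypothetical helper.

#include <string>
#include "mace/public/mace.h"

// Hypothetical helper: pick the APU cache policy for a given run.
// On the first run, APU_CACHE_STORE compiles the model from
// net_def/model_data and persists the compiled blob; on later runs,
// APU_CACHE_LOAD skips compilation. Per mace.cc above, MaceEngine falls
// back to a plain Init() if loading the cache fails.
mace::MaceEngineConfig BuildApuConfig(bool first_run) {
  mace::MaceEngineConfig config(mace::DeviceType::APU);
  const std::string cache_path = "/data/local/tmp/model.apu.bin";  // illustrative
  if (first_run) {
    config.SetAPUCache(mace::APU_CACHE_STORE,
                       /*binary_file=*/"",
                       /*storage_file=*/cache_path);
  } else {
    config.SetAPUCache(mace::APU_CACHE_LOAD,
                       /*binary_file=*/cache_path,
                       /*storage_file=*/"");
  }
  return config;
}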
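
For reference, the int16 quantization added in quantize_util.py and transformer.py above is symmetric per-tensor: the zero point is pinned to 0 and scale = max|x| / 2^15, with values rounded and clipped to [-2^15, 2^15 - 1]. Below is a minimal C++ restatement of that arithmetic; the zero-scale guard for all-zero tensors is an addition not present in the Python helper.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedInt16 {
  std::vector<int16_t> data;
  float scale;     // max|x| / 2^15
  int zero_point;  // always 0 in this schema
};

QuantizedInt16 QuantizeInt16(const std::vector<float> &input) {
  float max_abs = 0.0f;
  for (float v : input) max_abs = std::max(max_abs, std::fabs(v));
  QuantizedInt16 q;
  q.scale = max_abs / 32768.0f;  // 2^15, as in quantize_int16 above
  q.zero_point = 0;
  q.data.reserve(input.size());
  for (float v : input) {
    // Round, then clip to [-2^15, 2^15 - 1]; dequantization is v ~= scale * q.
    float r = (q.scale == 0.0f) ? 0.0f : std::round(v / q.scale);
    r = std::min(std::max(r, -32768.0f), 32767.0f);
    q.data.push_back(static_cast<int16_t>(r));
  }
  return q;
}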