From 2f6595282b9d7a40b2744e05ef1a6889e5daf05d Mon Sep 17 00:00:00 2001
From: Liangliang He
Date: Mon, 11 Jun 2018 14:30:38 +0800
Subject: [PATCH] Update MACE docs

---
 docs/development/adding_a_new_op.md      |   4 +-
 docs/development/contributing.md         |   4 +-
 docs/faq.md                              |  41 ++--
 docs/getting_started/how_to_build.rst    | 242 ++++++++++++-----------
 docs/getting_started/how_to_build_zh.rst |  10 +-
 docs/getting_started/introduction.rst    |   4 +-
 tools/converter.py                       |   4 +-
 7 files changed, 152 insertions(+), 157 deletions(-)

diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md
index be0c82b3..299c65ef 100644
--- a/docs/development/adding_a_new_op.md
+++ b/docs/development/adding_a_new_op.md
@@ -3,7 +3,7 @@ Adding a new Op
 
 You can create a custom op if it is not supported yet.
 
-To add a custom op, you need to finish the following steps:
+To add a custom op, you need to follow these steps:
 
 Define the Op class
 --------------------
@@ -93,7 +93,7 @@ kernel with NEON.
 Add test and benchmark
 ----------------------
 
-It's strongly recommended to add unit test and micro benchmark for your
+It's strongly recommended to add unit tests and micro benchmarks for your
 new Op. If you wish to contribute back, it's required.
 
 Document the new Op
diff --git a/docs/development/contributing.md b/docs/development/contributing.md
index aac0e9f1..5dfd9e78 100644
--- a/docs/development/contributing.md
+++ b/docs/development/contributing.md
@@ -5,7 +5,7 @@ License
 -------
 
 The source file should contains a license header. See the existing files
-as an example.
+as examples.
 
 Python coding style
 -------------------
@@ -13,7 +13,7 @@ Python coding style
 Changes to Python code should conform to
 [PEP8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/).
 
-You can use pycodestyle to check the style.
+You can use [pycodestyle](https://github.com/PyCQA/pycodestyle) to check the style.
 
 C++ coding style
 ----------------
diff --git a/docs/faq.md b/docs/faq.md
index c8ee4a90..8c321816 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -3,29 +3,31 @@ Frequently asked questions
 Does the tensor data consume extra memory when compiled into C++ code?
 ----------------------------------------------------------------------
-When compiled into C++ code, the data will be mmaped by the system loader.
-For CPU runtime, the tensor data are used without memory copy.
-For GPU and DSP runtime, the tensor data is used once during model
+When compiled into C++ code, the tensor data will be mmaped by the system
+loader. For the CPU runtime, the tensor data are used without memory copy.
+For the GPU and DSP runtimes, the tensor data are used once during model
 initialization. The operating system is free to swap the pages out, however,
-it still consumes virtual memory space. So generally speaking, it takes
+it still consumes virtual memory address space. So generally speaking, it takes
 no extra physical memory. If you are short of virtual memory space (this
-should be very rare), you can choose load the tensor data from a file, which
-can be unmapped after initialization.
+should be very rare), you can use the option to load the tensor data from a
+data file (which can be manually unmapped after initialization) instead of
+compiled code.
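+For example, a minimal sketch of the file-based path (paths are placeholders;
+see the full example in the build guide): build with `build_type: proto` and
+create the engine with `CreateMaceEngineFromProto`, so the weights stay in a
+separate data file instead of the compiled binary.
+
+```cpp
+// Sketch only: model_pb_data holds the bytes of ${MODEL_TAG}.pb read by the
+// caller, and the second argument points at the ${MODEL_TAG}.data weights file.
+std::shared_ptr<mace::MaceEngine> engine;
+MaceStatus status = CreateMaceEngineFromProto(model_pb_data,
+                                              "/path/to/${MODEL_TAG}.data",
+                                              input_names,
+                                              output_names,
+                                              device_type,
+                                              &engine);
+```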
 
 Why is the generated static library file size so huge?
 -------------------------------------------------------
 The static library is simply an archive of a set of object files which are
-intermediate and contains many extra information, please check whether the
+intermediate and contain much extra information, so please check whether the
 final binary file size is as expected.
 
 Why is the generated binary file (including shared library) size so huge?
 --------------------------------------------------------------------------
 When compiling the model into C++ code, the final binary may contains extra
 debug symbols, they usually takes a lot of space. Try to strip the shared
-library or binary. The common overhead of the file size including the compiled
-model (excluding the model weights) after the strip should be less than 2MB.
-If the model weights is embedded into the binary, the extra overhead should be
-around {model weights size in float32}/2.
+library or binary and make sure you are following best practices to reduce
+the size of an ELF binary, including disabling C++ exceptions, disabling RTTI,
+avoiding C++ iostream, hiding internal functions, etc.
+In most cases, the expected overhead should be less than
+{model weights size in float32}/2 + 3MB.
 
 OpenCL allocator failed with CL_OUT_OF_RESOURCES
 ------------------------------------------------
@@ -35,27 +37,16 @@ due to high memory usage or fragmentation. Several solutions can be tried:
 
 * Change the model by reducing its memory usage
 * Split the Op with the biggest single memory buffer
-* Changed from armeabi-v7a to arm64-v8a to expand the virtual address space
+* Change from armeabi-v7a to arm64-v8a to expand the virtual address space
 * Reduce the memory consumption of other modules of the same process
 
-Why the performance is worce than the official result for the same model?
+Why is the performance worse than the official result for the same model?
 --------------------------------------------------------------------------
 The power options may not set properly, see `mace/public/mace_runtime.h` for
 details.
 
-Why the UI is getting poor responsiveness when running model with GPU runtime?
+Why is the UI getting poor responsiveness when running model with GPU runtime?
 -------------------------------------------------------------------------------
 Try to set `limit_opencl_kernel_time` to `1`. If still not resolved, try to
 modify the source code to use even smaller time intervals or changed to CPU
 or DSP runtime.
-
-How to include more than one deployment files in one application(process)?
--------------------------------------------------------------------------------
-This case may happen when an application is developed by multiple teams as
-submodules. If the all the submodules are linked into a single shared library,
-then use the same version of MiAI Compute Engine will resolve this issue.
-Ortherwise, different deployment models are contained in different shared
-libraries, it's not required to use the same MiAI version but you should
-controls the exported symbols from the shared library. This is actually a
-best practice for all shared library, please read about GNU loader
-version script for more details.
diff --git a/docs/getting_started/how_to_build.rst b/docs/getting_started/how_to_build.rst
index 51c4b0ac..552269a0 100644
--- a/docs/getting_started/how_to_build.rst
+++ b/docs/getting_started/how_to_build.rst
@@ -11,23 +11,15 @@ Supported Platforms
 
    * - Platform
      - Explanation
-   * - Tensorflow
-     - >= 1.6.0. (first choice, convenient for Android NN API in the future)
+   * - TensorFlow
+     - >= 1.6.0.
    * - Caffe
      - >= 1.0.
 
 Environment Requirement
 -------------------------
 
-``mace``\ supply a docker image which contains all required environment. ``Dockerfile`` under the ``./docker`` directory.
-the followings are start commands:
-
-.. code:: sh
-
-    sudo docker pull cr.d.xiaomi.net/mace/mace-dev
-    sudo docker run -it --rm --privileged -v /dev/bus/usb:/dev/bus/usb --net=host -v /local/path:/container/path cr.d.xiaomi.net/mace/mace-dev /bin/bash
-
-if you want to run on your local computer, you have to install the following softwares.
+MiAI Compute Engine requires the following dependencies:
 
 .. list-table::
     :widths: auto
@@ -71,6 +63,14 @@ if you want to run on your local computer, you have to install the following sof
      - >= 17.09.0-ce
      - `install doc `__
 
+MiAI Compute Engine provides a Dockerfile with these dependencies installed,
+and a pre-built image is also available:
+
+.. code:: sh
+
+    sudo docker pull cr.d.xiaomi.net/mace/mace-dev
+    sudo docker run -it --rm --privileged -v /dev/bus/usb:/dev/bus/usb --net=host -v /local/path:/container/path cr.d.xiaomi.net/mace/mace-dev /bin/bash
+
 Docker Images
 ----------------
@@ -105,40 +105,37 @@ Docker Images
 Usage
 --------
 
-============================
-1. Pull code with latest tag
-============================
-
-.. warning::
-
-    please do not use master branch for deployment.
+=======================================
+1. Pull MiAI Compute Engine source code
+=======================================
 
 .. code:: sh
 
    git clone git@v9.git.n.xiaomi.com:deep-computing/mace.git
-
-   # update
   git fetch --all --tags --prune
-   # get latest tag version
+   # Check out the latest tag (i.e. the release version)
   tag_name=`git describe --abbrev=0 --tags`
+   git checkout tags/${tag_name}
 
-   # checkout to latest tag branch
-   git checkout -b ${tag_name} tags/${tag_name}
+.. note::
+
+    It's highly recommended to use a release version instead of the master branch.
 
 ============================
 2. Model Optimization
 ============================
 
-- Tensorflow
+- TensorFlow
 
-Tensorflow supply a
-`model optimization tool `__
-for speed up inference. The docker image contain the tool,
-by the way you can download from `transform_graph `__
-or compile from tensorflow source code.
+TensorFlow provides a
+`Graph Transform Tool `__
+to improve inference efficiency. You can build it from TensorFlow source,
+or download `a pre-compiled x86-64 binary `__.
+The MiAI Compute Engine docker image has this tool pre-installed.
 
-The following commands are optimization for CPU, GPU and DSP.
+The following commands show the suggested graph transformations and
+optimizations for the CPU, GPU and DSP runtimes.
 
 .. code:: sh
 
    # CPU/GPU:
   ./transform_graph \
       --in_graph=tf_model.pb \
       --out_graph=tf_model_opt.pb \
       --inputs='input' \
       --outputs='output' \
       --transforms='strip_unused_nodes(type=float, shape="1,64,64,3")
           strip_unused_nodes(type=float, shape="1,64,64,3")
           remove_nodes(op=Identity, op=CheckNumerics)
           fold_constants(ignore_errors=true)
           fold_batch_norms
           fold_old_batch_norms
           strip_unused_nodes
           sort_by_execution_order'
 
+.. code:: sh
+
   # DSP:
   ./transform_graph \
       --in_graph=tf_model.pb \
       --out_graph=tf_model_opt.pb \
       --inputs='input' \
       --outputs='output' \
       --transforms='strip_unused_nodes(type=float, shape="1,64,64,3")
           strip_unused_nodes(type=float, shape="1,64,64,3")
@@ -178,7 +177,8 @@
 
 - Caffe
 
-Only support versions greater then 1.0, please use the tools caffe supplied to upgrade the models.
+The converter only supports Caffe 1.0+; please upgrade your models with Caffe's
+built-in tools when necessary.
 
 .. code:: bash
 
@@ -195,34 +195,38 @@ Only support versions greater then 1.0, please use the tools caffe supplied to u
 -----------------
 3.1 Overview
 -----------------
-Mace only build static library. the followings are two use cases.
+MiAI Compute Engine only builds static libraries. The following are two use cases.
 
-* **build for specified SOC**
+* **Build well tuned library for specific SoCs**
 
-  You must assign ``target_socs`` in yaml configuration file.
-  if you want to use gpu for the soc, mace will tuning the parameters for better performance automatically.
+  When ``target_socs`` is specified in the YAML model deployment file (see the
+  sketch after this list), the build tool will enable automatic tuning for GPU
+  kernels, which usually takes some time depending on the model complexity.
 
-  .. warning::
+  .. note::
 
-      you should plug in a phone with that soc.
+      You should plug in device(s) with the corresponding SoC(s).
 
-* **build for all SOC**
+* **Build generic library for all SoCs**
 
-  When no ``target_soc`` specified, the library is suitable for all soc.
+  When ``target_soc`` is not specified, the generated library is compatible
+  with general devices.
 
-  .. warning::
+  .. note::
 
-      The performance will be a little poorer than the first case.
+      There will be around a 1~10% performance drop for the GPU
+      runtime compared to the well tuned library.
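+
+For reference, a schematic deployment file is sketched below. The field names
+are illustrative only (consult the deployment file documentation for the
+authoritative schema); the values mirror the deployment example shown later in
+this guide.
+
+.. code:: yaml
+
+    # Schematic sketch of a model deployment file -- field names are illustrative.
+    library_name: mobilenet-v2-gpu
+    target_abis: [armeabi-v7a, arm64-v8a]
+    target_socs: [msm8998]        # omit to build a generic, untuned library
+    build_type: proto             # or "code" to compile the model into C++ code
+    models:
+      mobilenet_v2:               # the MODEL_TAG
+        runtime: gpu              # cpu/gpu/dsp, must match the code using it
+        limit_opencl_kernel_time: 0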
 
-We supply a python script ``tools/converter.py`` to build the library and run the model on the command line.
+MiAI Compute Engine provides a command line tool (``tools/converter.py``) for
+model conversion, compiling, test run, benchmark and correctness validation.
 
-.. warning::
+.. note::
 
-    must run the script on the root directory of the mace code.
+    ``tools/converter.py`` should be run from the root directory of this project.
 
 ------------------------------------------
-3.2 \ ``tools/converter.py``\ explanation
+3.2 \ ``tools/converter.py``\ usage
 ------------------------------------------
 
 **Commands**
 
@@ -231,24 +235,24 @@ We supply a python script ``tools/converter.py`` to build the library and run th
    * **build**
 
      .. note::
 
-        build static library and test tools.
+        build static library and test tools.
 
      * *--config* (type=str, default="", required): the path of model yaml configuration file.
-     * *--tuning* (default=false, optional): whether tuning the parameters for the GPU of specified SOC.
+     * *--tuning* (default=false, optional): whether to tune the parameters for the GPU of the specified SoC.
      * *--enable_openmp* (default=true, optional): whether use openmp.
 
    * **run**
 
      .. note::
 
-        run the models in command line
+        run the model(s).
 
     * *--config* (type=str, default="", required): the path of model yaml configuration file.
     * *--round* (type=int, default=1, optional): times for run.
-     * *--validate* (default=false, optional): whether to verify the results of mace are consistent with the frameworks。
+     * *--validate* (default=false, optional): whether to verify that the results are consistent with those of the original frameworks.
     * *--caffe_env* (type=local/docker, default=docker, optional): you can specific caffe environment for validation. local environment or caffe docker image.
     * *--restart_round* (type=int, default=1, optional): restart round between run.
-     * *--check_gpu_out_of_memory* (default=false, optional): whether check out of memory for gpu.
+     * *--gpu_out_of_range_check* (default=false, optional): whether to enable the out-of-memory check for GPU.
     * *--vlog_level* (type=int[0-5], default=0, optional): verbose log level for debug.
 
     .. warning::
 
@@ -306,120 +310,120 @@ We supply a python script ``tools/converter.py`` to build the library and run th
      - ``run``/``benchmark``
      - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
 
----------------------------------------------
-3.3 \ ``tools/converter.py``\ usage examples
----------------------------------------------
+Use ``-h`` to get detailed help:
 
 .. code:: sh
 
-    # print help message
    python tools/converter.py -h
    python tools/converter.py build -h
    python tools/converter.py run -h
    python tools/converter.py benchmark -h
+
+---------------------------------------------
+3.3 \ ``tools/converter.py``\ usage examples
+---------------------------------------------
+
+.. code:: sh
+
    # Build the static library
    python tools/converter.py build --config=models/config.yaml
 
    # Test model run time
    python tools/converter.py run --config=models/config.yaml --round=100
 
-    # Compare the results of mace and platform. use the **cosine distance** to represent similarity.
+    # Validate the correctness by comparing the results against the
+    # original model and framework, measured with cosine distance for similarity.
    python tools/converter.py run --config=models/config.yaml --validate
 
-    # Benchmark Model: check the execution time of each Op.
+    # Benchmark and profile the model, and get detailed statistics for each Op.
    python tools/converter.py benchmark --config=models/config.yaml
 
    # Check the memory usage of the model(**Just keep only one model in configuration file**)
    python tools/converter.py run --config=models/config.yaml --round=10000 &
+    sleep 5
    adb shell dumpsys meminfo | grep mace_run
-    sleep 10
    kill %1
 
 =============
 4. Deployment
 =============
 
-``build`` command will generate a package which contains the static library, model files and header files.
-the package is at ``./build/${library_name}/libmace_${library_name}.tar.gz``.
-The followings list the details.
-
-**header files**
-   * ``include/mace/public/*.h``
-
-**static libraries**
-   * ``library/${target_abi}/*.a``
-
-**dynamic libraries**
-   * ``library/libhexagon_controller.so``
-
-   .. note::
-
-       only use for DSP
-
-**model files**
-   * ``model/${MODEL_TAG}.pb``
-   * ``model/${MODEL_TAG}.data``
-
-   .. note::
-
-       ``.pb`` file will be generated only when build_type is ``proto``.
-
-**OpenCL compiled kernel binary file**
-   * ``opencl/${target_abi}/${library_name}_compiled_opencl_kernel.${device_name}.${target_soc}.bin``
-
-   .. note::
-
-       This file will be generated only when specify ``target_soc`` and runtime is ``gpu``.
+The ``build`` command generates the static library, model files and header
+files, packaged as ``build/${library_name}/libmace_${library_name}.tar.gz``.
+They are organized as follows:
+
+.. code::
+
+    build/
+    └── mobilenet-v2-gpu
+        ├── include
+        │   └── mace
+        │       └── public
+        │           ├── mace.h
+        │           └── mace_runtime.h
+        ├── libmace_mobilenet-v2-gpu.tar.gz
+        ├── library
+        │   ├── arm64-v8a
+        │   │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a
+        │   └── armeabi-v7a
+        │       └── libmace_mobilenet-v2-gpu.MI6.msm8998.a
+        ├── model
+        │   ├── mobilenet_v2.data
+        │   └── mobilenet_v2.pb
+        └── opencl
+            └── compiled_opencl_kernel.bin
+
+.. note::
+
+    1. The DSP runtime depends on ``libhexagon_controller.so``.
+    2. The ``${MODEL_TAG}.pb`` file will be generated only when ``build_type`` is ``proto``.
+    3. ``compiled_opencl_kernel.bin`` will be generated only when ``target_soc`` and the ``gpu`` runtime are specified.
 
-   .. warning::
-
-       This file rely on the OpenCL driver on the phone, you should update the file when OpenCL driver changed.
-
-**tar package**
-   * ``./build/${library_name}/libmace_${library_name}.tar.gz``
-
-   .. note::
-
-       This file package all the above files which used for deployment.
+.. warning::
+
+    ``compiled_opencl_kernel.bin`` depends on the OpenCL version of the device; you should
+    maintain the compatibility, or configure a compiling cache store with ``ConfigKVStorageFactory``.
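+
+To consume the package in your own project, extract the archive and link the
+static library for your target ABI; for example (the destination path is
+hypothetical, and the library name follows the tree above):
+
+.. code:: sh
+
+    tar -xzvf build/mobilenet-v2-gpu/libmace_mobilenet-v2-gpu.tar.gz \
+        -C /path/to/your/project/third_party/mace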
 
-=============
-5. how to use
-=============
+=========================================
+5. How to use the library in your project
+=========================================
 
-Please refer to \ ``mace/examples/example.cc``\ for full usage. the following list the key steps.
+Please refer to \ ``mace/examples/example.cc``\ for full usage. The following lists the key steps.
 
 .. code:: cpp
 
-  // include the header files
+  // Include the headers
   #include "mace/public/mace.h"
   #include "mace/public/mace_runtime.h"
   #include "mace/public/mace_engine_factory.h"
 
-  // 0. set internal storage factory(**Call once**)
-  const std::string file_path ="/path/to/store/internel/files";
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(file_path));
-  ConfigKVStorageFactory(storage_factory);
-
-  // 1. set precompiled OpenCL binary file paths if you use gpu of specified SOC,
-  //    Besides the binary rely on the OpenCL driver of the SOC,
-  //    if OpenCL driver changed, you should recompiled the binary file.
+  // 0. Set pre-compiled OpenCL binary program file paths when available
  if (device_type == DeviceType::GPU) {
    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
  }
 
-  // 2. Declare the device type(must be same with ``runtime`` in configuration file)
+  // 1. Configure the compiled OpenCL kernel cache; this reduces the
+  //    initialization time since kernel compiling is slow. It's suggested
+  //    to set this even when a pre-compiled OpenCL program file is provided,
+  //    because an OpenCL version upgrade may also lead to kernel
+  //    recompilation.
+  const std::string file_path = "path/to/opencl_cache_file";
+  std::shared_ptr<KVStorageFactory> storage_factory(
+      new FileStorageFactory(file_path));
+  ConfigKVStorageFactory(storage_factory);
+
+  // 2. Declare the device type (must match ``runtime`` in the configuration file)
  DeviceType device_type = DeviceType::GPU;
 
  // 3. Define the input and output tensor names.
  std::vector<std::string> input_names = {...};
  std::vector<std::string> output_names = {...};
 
-  // 4. Create MaceEngine object
+  // 4. Create MaceEngine instance
  std::shared_ptr<mace::MaceEngine> engine;
  MaceStatus create_engine_status;
-  // Create Engine from code
+  // Create Engine from compiled code
  create_engine_status =
      CreateMaceEngineFromCode(model_name.c_str(),
                               nullptr,
@@ -427,7 +431,7 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
                               output_names,
                               device_type,
                               &engine);
-  // Create Engine from proto file
+  // Create Engine from model file
  create_engine_status =
      CreateMaceEngineFromProto(model_pb_data,
                                model_data_file.c_str(),
@@ -436,10 +440,10 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
                                input_names,
                                output_names,
                                device_type,
                                &engine);
  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    // do something
+    // Report error
  }
 
-  // 5. Create Input and Output objects
+  // 5. Create Input and Output tensor buffers
  std::map<std::string, mace::MaceTensor> inputs;
  std::map<std::string, mace::MaceTensor> outputs;
  for (size_t i = 0; i < input_count; ++i) {
@@ -449,8 +453,8 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. the following li
                        std::multiplies<int64_t>());
    auto buffer_in = std::shared_ptr<float>(new float[input_size],
                                            std::default_delete<float[]>());
-    // load input
-    ...
+    // Load input here
+    // ...
    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
  }
 
diff --git a/docs/getting_started/how_to_build_zh.rst b/docs/getting_started/how_to_build_zh.rst
index 0695442e..11897f4b 100644
--- a/docs/getting_started/how_to_build_zh.rst
+++ b/docs/getting_started/how_to_build_zh.rst
@@ -12,14 +12,14 @@
    * - Platform
      - Explanation
    * - Tensorflow
-     - >= 1.6.0. (first choice, convenient for Android NN API in the future)
+     - >= 1.6.0.
    * - Caffe
      - >= 1.0.
 
 环境要求
 ---------
 
-``mace``\ 提供了包含开发运行所需环境的docker镜像,镜像文件可以参考\ ``./docker/``\ 。启动命令:
+MiAI计算引擎提供了包含开发运行所需环境的docker镜像,镜像文件可以参考\ ``./docker/``\ 。启动命令:
 
 .. code:: sh
 
@@ -218,7 +218,7 @@ Mace目前只提供静态库,有以下两种使用场景。
 
 .. warning::
 
-    必须在mace项目的根目录下运行\ ``tools/converter.py``\ 脚本。
+    必须在项目的根目录下运行\ ``tools/converter.py``\ 脚本。
 
 ---------------------------------------
 
@@ -248,7 +248,7 @@ Mace目前只提供静态库,有以下两种使用场景。
    * *--validate* (default=false, optional): 是否需要验证运行结果与框架运行结果是否一致。
    * *--caffe_env* (type=local/docker, default=docker, optional):当vaildate时,可以选择指定caffe环境,local表示本地,docker表示使用docker容器.
    * *--restart_round* (type=int, default=1, optional):模型重启次数。
-   * *--check_gpu_out_of_memory* (default=false, optional): 是否需要检查gpu内存越界。
+   * *--gpu_out_of_range_check* (default=false, optional): 是否需要检查gpu内存越界。
   * *--vlog_level* (type=int[0-5], default=0, optional):详细日志级别.
 
   .. warning::
 
@@ -323,7 +323,7 @@ Mace目前只提供静态库,有以下两种使用场景。
    # 测试模型的运行时间
    python tools/converter.py run --config=models/config.yaml --round=100
 
-    # 对比编译好的模型在mace上与直接使用tensorflow或者caffe运行的结果,相似度使用`余弦距离表示`
+    # 对比编译好的模型在MiAI计算引擎上与直接使用tensorflow或者caffe运行的结果,相似度使用`余弦距离表示`
    # 其中使用OpenCL设备,默认相似度大于等于`0.995`为通过;DSP设备下,相似度需要达到`0.930`。
    python tools/converter.py run --config=models/config.yaml --validate
 
diff --git a/docs/getting_started/introduction.rst b/docs/getting_started/introduction.rst
index 4f9d4c8a..ad114fbf 100644
--- a/docs/getting_started/introduction.rst
+++ b/docs/getting_started/introduction.rst
@@ -14,7 +14,7 @@ Model format
 
 MiAI Compute Engine defines a customized model format which is similar to
 Caffe2. The MiAI model can be converted from exported models by TensorFlow
-and Caffe. We define a YAML schema to describe the model deployment. In the
+and Caffe. A YAML file is used to describe the model deployment details. In the
 next chapter, there is a detailed guide showing how to create this YAML file.
 
 Model conversion
@@ -29,7 +29,7 @@ Model loading
 The MiAI model format contains two parts: the model graph definition and
 the model parameter tensors. The graph part utilizes Protocol Buffers for
 serialization. All the model parameter tensors are concatenated
-together into a continuous array, and we call this array tensor data in
+together into a continuous byte array, and we call this array tensor data in
 the following paragraphs. In the model graph, the tensor data offsets and
 lengths are recorded.
 
diff --git a/tools/converter.py b/tools/converter.py
index 0230f144..0d2b19c3 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -941,7 +941,7 @@ def run_specific_target(flags, configs, target_abi,
                restart_round=flags.restart_round,
                limit_opencl_kernel_time=model_config[YAMLKeyword.limit_opencl_kernel_time],  # noqa
                tuning=False,
-                out_of_range_check=flags.check_gpu_out_of_memory,
+                out_of_range_check=flags.gpu_out_of_range_check,
                phone_data_dir=PHONE_DATA_DIR,
                build_type=build_type,
                omp_num_threads=flags.omp_num_threads,
@@ -1211,7 +1211,7 @@ def parse_args():
        default=0,
        help="VLOG level: [1~5].")
    run.add_argument(
-        "--check_gpu_out_of_memory",
+        "--gpu_out_of_range_check",
        action="store_true",
        help="Enable out of memory check for gpu.")
    run.add_argument(
--
GitLab