From cd5270e7e8986caeb7db769f14c809536732e4d2 Mon Sep 17 00:00:00 2001 From: liutuo Date: Wed, 4 Jul 2018 14:10:24 +0800 Subject: [PATCH] update usage docs --- docs/installation/env_requirement.rst | 4 - docs/user_guide/advanced_usage.rst | 421 ++++++++++++++++++ docs/user_guide/basic_usage.rst | 233 ++++++---- docs/user_guide/create_a_model_deployment.rst | 4 +- docs/user_guide/how_to_build.rst | 406 +++++++++++++++++ docs/user_guide/models/demo_app_models.yml | 78 ++-- 6 files changed, 1013 insertions(+), 133 deletions(-) create mode 100644 docs/user_guide/how_to_build.rst diff --git a/docs/installation/env_requirement.rst b/docs/installation/env_requirement.rst index c0202de6..fa66ccff 100644 --- a/docs/installation/env_requirement.rst +++ b/docs/installation/env_requirement.rst @@ -7,9 +7,7 @@ Necessary Dependencies: ----------------------- .. list-table:: - :widths: auto :header-rows: 1 - :align: left * - software - version @@ -56,9 +54,7 @@ Optional Dependencies: --------------------- .. list-table:: - :widths: auto :header-rows: 1 - :align: left * - software - version diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 43c66c77..3d7e5503 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -1,3 +1,424 @@ Advanced usage ============== +This part contains the full usage of MACE. + + +How to build +-------------------------------- + + +==================== +Overview +==================== + +As mentioned in the previous part, a model deployment file defines a case of model deployment. +The whole building process is loading a deployment file, converting models, building MACE and packing generated files. + +==================== +Deployment file +==================== + + +One deployment file will generate one library normally, but if more than one ABIs are specified, +one library will be generated for each ABI. +A deployment file can also contain multiple models. For example, an AI camera application may +contain face recognition, object recognition, and voice recognition models, all of which can be defined +in one deployment file. + +* **Example** + + Here is an example deployment file used by an Android demo application. + + .. literalinclude:: models/demo_app_models.yml + :language: yaml + + +* **Configurations** + + +.. list-table:: + :header-rows: 1 + + * - Options + - Usage + * - library_name + - Library name. + * - target_abis + - The target ABI(s) to build, could be 'host', 'armeabi-v7a' or 'arm64-v8a'. + If more than one ABIs will be used, seperate them by comas. + * - target_socs + - [optional] Build for specific SoCs. + * - embed_model_data + - Whether embedding model weights into the code, default is 0. + * - build_type + - model build type, can be 'proto' or 'code'. 'proto' for converting model to ProtoBuf file and 'code' for converting model to c++ code. + * - linkshared + - [optional] 1 for building shared library, and 0 for static library, default to 0. + * - model_name + - model name, should be unique if there are more than one models. + **LIMIT: if build_type is code, model_name will be used in c++ code so that model_name must comply with c++ name specification.** + * - platform + - The source framework, tensorflow or caffe. + * - model_file_path + - The path of your model file, can be local path or remote url. + * - model_sha256_checksum + - The SHA256 checksum of the model file. + * - weight_file_path + - [optional] The path of Caffe model weights file. 
+    * - weight_sha256_checksum
+      - [optional] The SHA256 checksum of the Caffe model weights file.
+    * - subgraphs
+      - subgraphs key. **DO NOT EDIT**
+    * - input_tensors
+      - The input tensor name(s) (tensorflow) or top name(s) of the inputs' layer (caffe).
+        If there is more than one tensor, put each one on its own line.
+    * - output_tensors
+      - The output tensor name(s) (tensorflow) or top name(s) of the outputs' layer (caffe).
+        If there is more than one tensor, put each one on its own line.
+    * - input_shapes
+      - The shapes of the input tensors, in NHWC order.
+    * - output_shapes
+      - The shapes of the output tensors, in NHWC order.
+    * - input_ranges
+      - The numerical range of the input tensors' data, default [-1, 1]. It is only used for testing.
+    * - validation_inputs_data
+      - [optional] Specify Numpy validation inputs. When not provided, random values in [-1, 1] will be used.
+    * - runtime
+      - The running device, one of [cpu, gpu, dsp, cpu_gpu]. cpu_gpu contains both the CPU and GPU model definitions so you can run the model on both CPU and GPU.
+    * - data_type
+      - [optional] The data type used for the specified runtime. [fp16_fp32, fp32_fp32] for GPU (default is fp16_fp32), [fp32] for CPU and [uint8] for DSP.
+    * - limit_opencl_kernel_time
+      - [optional] Whether to split OpenCL kernels so that each finishes within 1 ms, to keep the UI responsive; default is 0.
+    * - nnlib_graph_mode
+      - [optional] Controls the DSP precision and performance; the default of 0 usually works for most cases.
+    * - obfuscate
+      - [optional] Whether to obfuscate the model operator names, default is 0.
+    * - winograd
+      - [optional] Whether to enable Winograd convolution, **which will increase memory consumption**.
+
+
+.. note::
+
+    Some useful commands:
+
+    .. code:: bash
+
+        # Command for fetching an Android device's SoC info.
+        adb shell getprop | grep "model\|version.sdk\|manufacturer\|hardware\|platform\|brand"
+
+        # Command for generating the SHA256 checksum
+        sha256sum path/to/your/file
+
+
+====================
+Building
+====================
+
+* **Build static or shared library**
+
+    MACE can build either a static or a shared library (which is
+    specified by ``linkshared`` in the YAML model deployment file).
+    The following are two use cases.
+
+* **Build well tuned library for specific SoCs**
+
+    When ``target_socs`` is specified in the YAML model deployment file, the build
+    tool will enable automatic tuning for GPU kernels. This usually takes some
+    time to finish, depending on the complexity of your model.
+
+    .. note::
+
+        1. You should plug in device(s) with the specified SoC(s).
+
+* **Build generic library for all SoCs**
+
+    When ``target_socs`` is not specified, the generated library is compatible
+    with general devices.
+
+    .. note::
+
+        1. There will be a performance drop of around 1% ~ 10% for the GPU
+           runtime compared to the well tuned library.
+
+* **Build models into file or code**
+
+    When ``build_type`` is set to ``code``, the model's graph and weights data will be embedded into the generated code.
+    This is used for model protection.
+
+    .. note::
+
+        1. When ``linkshared`` is set to ``1``, ``build_type`` should be ``proto``.
+           Currently only Android devices are supported.
+        2. Another model protection method is using ``obfuscate`` to obfuscate the model operator names.
+
+
+**Commands**
+
+    * **build library and test tools**
+
+    .. code:: sh
+
+        # Build library
+        python tools/converter.py build --config=/path/to/model_deployment_file.yml
+
+
+
+    * **run the model**
+
+    ..
code:: sh + + # Test model run time + python tools/converter.py run --config=/path/to/model_deployment_file.yml --round=100 + + # Validate the correctness by comparing the results against the + # original model and framework, measured with cosine distance for similarity. + python tools/converter.py run --config=/path/to/model_deployment_file.yml --validate + + # Check the memory usage of the model(**Just keep only one model in configuration file**) + python tools/converter.py run --config=/path/to/model_deployment_file.yml --round=10000 & + sleep 5 + adb shell dumpsys meminfo | grep mace_run + kill %1 + + + .. warning:: + + ``run`` rely on ``build`` command, you should ``run`` after ``build``. + + * **benchmark and profiling model** + + .. code:: sh + + # Benchmark model, get detailed statistics of each Op. + python tools/converter.py benchmark --config=/path/to/model_deployment_file.yml + + + .. warning:: + + ``benchmark`` rely on ``build`` command, you should ``benchmark`` after ``build``. + +**Common arguments** + + .. list-table:: + :header-rows: 1 + + * - option + - type + - default + - commands + - explanation + * - --omp_num_threads + - int + - -1 + - ``run``/``benchmark`` + - number of threads + * - --cpu_affinity_policy + - int + - 1 + - ``run``/``benchmark`` + - 0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY + * - --gpu_perf_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + * - --gpu_perf_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + * - --gpu_priority_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + +Use ``-h`` to get detailed help. + +.. code:: sh + + python tools/converter.py -h + python tools/converter.py build -h + python tools/converter.py run -h + python tools/converter.py benchmark -h + + + +How to deploy +-------------------------------- + + +======================== +Overview +======================== + +``build`` command will generate the static/shared library, model files and +header files and package them as +``build/${library_name}/libmace_${library_name}.tar.gz``. + +- The generated ``static`` libraries are organized as follows, + +.. code:: + + build/ + └── mobilenet-v2-gpu + ├── include + │   └── mace + │   └── public + │   ├── mace.h + │   └── mace_runtime.h + | └── mace_engine_factory.h (Only exists if ``build_type`` set to ``code``)) + ├── libmace_mobilenet-v2-gpu.tar.gz + ├── lib + │   ├── arm64-v8a + │   │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + │   └── armeabi-v7a + │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + ├── model + │   ├── mobilenet_v2.data + │   └── mobilenet_v2.pb + └── opencl + ├── arm64-v8a + │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + └── armeabi-v7a + └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + + +- The generated ``shared`` libraries are organized as follows, + +.. code:: + + build + └── mobilenet-v2-gpu + ├── include + │   └── mace + │   └── public + │   ├── mace.h + │   └── mace_runtime.h + | └── mace_engine_factory.h (Only exists if ``build_type`` set to ``code``) + ├── lib + │   ├── arm64-v8a + │   │   ├── libgnustl_shared.so + │   │   └── libmace.so + │   └── armeabi-v7a + │   ├── libgnustl_shared.so + │   └── libmace.so + ├── model + │   ├── mobilenet_v2.data + │   └── mobilenet_v2.pb + └── opencl + ├── arm64-v8a + │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + └── armeabi-v7a + └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + +.. 
+.. note::
+
+    1. DSP runtime depends on ``libhexagon_controller.so``.
+    2. The ``${MODEL_TAG}.pb`` file will be generated only when ``build_type`` is ``proto``.
+    3. ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` will
+       be generated only when ``target_socs`` and the ``gpu`` runtime are specified.
+    4. The generated shared library depends on ``libgnustl_shared.so``.
+    5. Files in the opencl folder will be generated only if
+       ``target_socs`` is set and ``runtime`` contains ``gpu`` in the deployment file.
+    6. When ``build_type`` is set to ``code``, ${library_name}.h and mace_engine_factory.h
+       will be generated in the ``include`` folder. These headers are used to create the MaceEngine for your model.
+
+
+.. warning::
+
+    ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` depends
+    on the OpenCL version of the device; you should maintain compatibility or
+    configure the compiling cache store with ``ConfigKVStorageFactory``.
+
+
+===========
+Deployment
+===========
+
+Unpack the generated ``libmace_${library_name}.tar.gz`` file and copy all of the uncompressed files into your project.
+
+Please refer to \ ``mace/examples/example.cc``\ for full usage. The following lists the key steps.
+
+.. code:: cpp
+
+    // Include the headers
+    #include "mace/public/mace.h"
+    #include "mace/public/mace_runtime.h"
+    // If the build_type is code
+    #include "mace/public/mace_engine_factory.h"
+
+    // 0. Set pre-compiled OpenCL binary program file paths when available
+    if (device_type == DeviceType::GPU) {
+      mace::SetOpenCLBinaryPaths(opencl_binary_paths);
+    }
+
+    // 1. Set the compiled OpenCL kernel cache. This is used to reduce the
+    // initialization time, since compiling the kernels is slow. It's suggested
+    // to set this even when a pre-compiled OpenCL program file is provided,
+    // because an OpenCL version upgrade may also lead to kernel
+    // recompilations.
+    const std::string file_path = "path/to/opencl_cache_file";
+    std::shared_ptr<KVStorageFactory> storage_factory(
+        new FileStorageFactory(file_path));
+    ConfigKVStorageFactory(storage_factory);
+
+    // 2. Declare the device type (must be consistent with ``runtime`` in the configuration file)
+    DeviceType device_type = DeviceType::GPU;
+
+    // 3. Define the input and output tensor names.
+    std::vector<std::string> input_names = {...};
+    std::vector<std::string> output_names = {...};
+
+    // 4. Create MaceEngine instance
+    std::shared_ptr<mace::MaceEngine> engine;
+    MaceStatus create_engine_status;
+    // Create Engine from compiled code
+    create_engine_status =
+        CreateMaceEngineFromCode(model_name.c_str(),
+                                 nullptr,
+                                 input_names,
+                                 output_names,
+                                 device_type,
+                                 &engine);
+    // Create Engine from model file
+    create_engine_status =
+        CreateMaceEngineFromProto(model_pb_data,
+                                  model_data_file.c_str(),
+                                  input_names,
+                                  output_names,
+                                  device_type,
+                                  &engine);
+    if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+      // Report error
+    }
+
+    // 5. Create Input and Output tensor buffers
+    std::map<std::string, mace::MaceTensor> inputs;
+    std::map<std::string, mace::MaceTensor> outputs;
+    for (size_t i = 0; i < input_count; ++i) {
+      // Allocate input and output
+      int64_t input_size =
+          std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
+                          std::multiplies<int64_t>());
+      auto buffer_in = std::shared_ptr<float>(new float[input_size],
+                                              std::default_delete<float[]>());
+      // Load input here
+      // ...
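+
+      // Illustrative sketch, not part of the original example: one simple way
+      // to feed the engine during testing is to read raw float data from a
+      // binary file (hypothetical path) whose length matches input_size:
+      //   std::ifstream in_file("path/to/input_data_file", std::ios::binary);
+      //   in_file.read(reinterpret_cast<char *>(buffer_in.get()),
+      //                input_size * sizeof(float));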
+ + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + } + + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + } + + // 6. Run the model + MaceStatus status = engine.Run(inputs, &outputs); + diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index e90ef29c..de58ce6e 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -5,63 +5,97 @@ Basic usage Build and run an example model -------------------------------- -Make sure the environment has been set up correctly already (refer to `Installation`). +At first, make sure the environment has been set up correctly already (refer to :doc:`installation`). -Pull the mace model zoo project. +The followings are instructions about how to quickly build and run a provided model in *MACE Model Zoo*. -.. code:: sh +Here we use the mobilenet-v2 model as an example. - git clone https://github.com/XiaoMi/mace-models.git +**Commands** + + 1. Pull *MACE* project. + + .. code:: sh + + git clone https://github.com/XiaoMi/mace.git + git fetch --all --tags --prune + + # Checkout the latest tag (i.e. release version) + tag_name=`git describe --abbrev=0 --tags` + git checkout tags/${tag_name} + + .. note:: + + It's highly recommanded to use a release version instead of master branch. + + + 2. Pull *MACE Model Zoo* project. + + .. code:: sh + + git clone https://github.com/XiaoMi/mace-models.git + + + 3. Build MACE. + + .. code:: sh -Here we use the provided mobilenet-v2 model in mace model zoo as an example. -Plug an android device into your pc and enable Developer Mode for the device. + cd path/to/mace + # Build library + python tools/converter.py build --config=path/to/mace-models/mobilenet-v2/mobilenet-v2.yml -.. code:: sh - cd /path/to/mace - python tools/converter.py build --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml + 4. Convert the model to MACE format model. -Validate and benchmark the model. + .. code:: sh -.. code:: sh + cd path/to/mace + # Build library + python tools/converter.py build --config=path/to/mace-models/mobilenet-v2/mobilenet-v2.yml - # Validate the model. - python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --validate - # Benchmark - python tools/converter.py benchmark --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml -.. note:: + 5. Run the model. - If you want to build and run the model on pc, just use the mobilenet-v2-host.yml file instead for ``--config``. + .. code:: sh + + # Test model run time + python tools/converter.py run --config=path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --round=100 + + # Validate the correctness by comparing the results against the + # original model and framework, measured with cosine distance for similarity. + python tools/converter.py run --config=path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --validate Build your own model ---------------------------- + +This part will show you how to use your pre-trained model in MACE. + ================================== 1. Prepare your model ================================== -Mace now supports models from tensorflow and caffe. +Mace now supports models from Tensorflow and Caffe(more frameworks will be supported). 
- TensorFlow - Prepare your tensorflow model.pb file. + Prepare your pre-trained Tensorflow model.pb file. Use `Graph Transform Tool `__ to optimize your model for inference. - This tool will improve the efficiency of inference by making several optimizations like operations + This tool will improve the efficiency of inference by making several optimizations like operators folding, redundant node removal etc. We strongly recommend MACE users to use it before building. - The following command shows how to use the graph transform tool for CPU/GPU, + Usage for CPU/GPU, .. code:: bash # CPU/GPU: ./transform_graph \ - --in_graph=tf_model.pb \ - --out_graph=tf_model_opt.pb \ - --inputs='input' \ - --outputs='output' \ + --in_graph=/path/to/your/tf_model.pb \ + --out_graph=/path/to/your/output/tf_model_opt.pb \ + --inputs='input node name' \ + --outputs='output node name' \ --transforms='strip_unused_nodes(type=float, shape="1,64,64,3") strip_unused_nodes(type=float, shape="1,64,64,3") remove_nodes(op=Identity, op=CheckNumerics) @@ -74,8 +108,9 @@ Mace now supports models from tensorflow and caffe. - Caffe - MACE converter only supports Caffe 1.0+, you need to upgrade - your model by using the Caffe built-in tool if your model is from lower version caffe. + Caffe 1.0+ models are supported in MACE converter tool. + + If your model is from lower version Caffe, you need to upgrade it by using the Caffe built-in tool before converting. .. code:: bash @@ -85,12 +120,18 @@ Mace now supports models from tensorflow and caffe. # Upgrade caffemodel $CAFFE_ROOT/build/tools/upgrade_net_proto_binary MODEL.caffemodel MODEL.new.caffemodel + ============================================ 2. Create a deployment file for your model ============================================ -The followings are basic usage example deployment files for Tensorflow and Caffe models. -Modify one of them for your own case. +When converting a model or building a library, MACE needs to read a YAML file which is called model deployment file here. + +A model deployment file contains all the information of your model(s) and building options. There are several example +deployment files in *MACE Model Zoo* project. + +The following shows two basic usage of deployment files for Tensorflow and Caffe models. +Modify one of them and use it for your own case. - Tensorflow @@ -102,34 +143,62 @@ Modify one of them for your own case. .. literalinclude:: models/demo_app_models_caffe.yml :language: yaml -More details about model deployment file, refer to `Advanced_usage`. - +More details about model deployment file, please refer to :doc:`advanced_usage`. ====================================== -3. Build a library for your model +3. Convert your model ====================================== -MACE provides a python tool (``tools/converter.py``) for -model conversion, compiling, testing, benchmark and validation. +When the deployment file is ready for your model, you can use MACE converter tool to convert your model(s). -MACE can build either static or shared library (which is -specified by ``linkshared`` in YAML model deployment file). +To convert your pre-trained model to a MACE model, you need to set ``build_type:proto`` in your model deployment file. -**Commands** +And then run this command: - * **build** +.. code:: bash - build library. + python tools/converter.py convert --config=path/to/your/model_deployment.yml + +This command will download or load your pre-trained model and convert it to a MACE model proto file and weights file. 
+The generated model files will be stored in ``build/${library_name}/model`` folder. + +.. warning:: + + Please set ``build_type:proto`` in your deployment file before converting. + The usage of ``build_type:code`` will be demonstrated in :doc:`advanced_usage`. + +====================================== +4. Build MACE into a library +====================================== + +MACE can be built into either a static or a shared library (which is +specified by ``linkshared`` in YAML model deployment file). + +Use bazel to build MACE source code into a library. .. code:: sh cd path/to/mace # Build library - python tools/converter.py build --config=path/to/your/model_deployment_file.yml + bazel build --config=path/to/your/model_deployment_file.yml + +The above command will generate library files in the ``build/${library_name}/libs`` folder. + + .. warning:: + + 1. Please verify the target_abis params in the above command and the deployment file are the same. + 2. If you want to build a library for a specific soc, please refer to :doc:`advanced_usage`. - * **run** - run the model. +====================================== +5. Run your model +====================================== + +With the converted model, *.so or *.a library and header files, you can use the following commands to run and validate your model. + +* **run** + + run the model. .. code:: sh @@ -140,28 +209,23 @@ specified by ``linkshared`` in YAML model deployment file). # original model and framework, measured with cosine distance for similarity. python tools/converter.py run --config=path/to/your/model_deployment_file.yml --validate - * **benchmark** +* **benchmark** - benchmark and profile the model. + benchmark and profile the model. .. code:: sh # Benchmark model, get detailed statistics of each Op. python tools/converter.py benchmark --config=path/to/your/model_deployment_file.yml - .. warning:: - - 1. Plug an android device into your pc and enable Developer Mode before building. - 2. If you want to build the model for pc, set ``target_abis: [host]`` and ``runtime: cpu`` in your deployment YAML file. +======================================================== +6. Deploy your model into applications +======================================================== -============================================ -4. Deploy generated library in your project -============================================ - -``build`` command will generate the static/shared library, model files and -header files. All of these generated files will be packaged into -``path/to/mace/build/${library_name}/libmace_${library_name}.tar.gz``. +In the converting and building steps, you've got the static/shared library, model files and +header files. All of these generated files have been packaged into +``build/${library_name}/libmace_${library_name}.tar.gz`` when building. ``${library_name}`` is the name you defined in the first line of your deployment YAML file. @@ -170,33 +234,33 @@ header files. All of these generated files will be packaged into .. 
code:: build/ - └── mobilenet-v2-gpu + └── mobilenet-v2 ├── include │   └── mace │   └── public │   ├── mace.h │   └── mace_runtime.h - ├── libmace_mobilenet-v2-gpu.tar.gz + ├── libmace_mobilenet-v2.tar.gz ├── lib │   ├── arm64-v8a - │   │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + │   │   └── libmace_mobilenet-v2.MI6.msm8998.a │   └── armeabi-v7a - │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + │   └── libmace_mobilenet-v2.MI6.msm8998.a ├── model │   ├── mobilenet_v2.data │   └── mobilenet_v2.pb └── opencl ├── arm64-v8a - │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + │   └── mobilenet-v2_compiled_opencl_kernel.MI6.msm8998.bin └── armeabi-v7a - └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + └── mobilenet-v2_compiled_opencl_kernel.MI6.msm8998.bin - The generated ``shared`` library files are organized as follows, .. code:: build - └── mobilenet-v2-gpu + └── mobilenet-v2 ├── include │   └── mace │   └── public @@ -214,49 +278,37 @@ header files. All of these generated files will be packaged into │   └── mobilenet_v2.pb └── opencl ├── arm64-v8a - │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + │   └── mobilenet-v2_compiled_opencl_kernel.MI6.msm8998.bin └── armeabi-v7a - └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin - -.. note:: - - 1. ``${MODEL_TAG}.pb`` file will be generated only when ``build_type`` is ``proto``. - 2. ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` will - be generated only when ``target_socs`` and ``gpu`` runtime are specified. - 3. Generated shared library depends on ``libgnustl_shared.so``. - -.. warning:: - - ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` depends - on the OpenCL version of the device, you should maintan the compatibility or - configure compiling cache store with ``ConfigKVStorageFactory``. + └── mobilenet-v2_compiled_opencl_kernel.MI6.msm8998.bin Unpack the generated libmace_${library_name}.tar.gz file and copy all of the uncompressed files into your project. -Please refer to \ ``mace/examples/example.cc``\ for full usage. The following lists the key steps. +Please refer to \ ``mace/examples/example.cc``\ for full usage. The following list the key steps. .. code:: cpp // Include the headers #include "mace/public/mace.h" #include "mace/public/mace_runtime.h" - // If the build_type is code - #include "mace/public/mace_engine_factory.h" // 0. Set pre-compiled OpenCL binary program file paths when available if (device_type == DeviceType::GPU) { mace::SetOpenCLBinaryPaths(opencl_binary_paths); } - // 1. Set compiled OpenCL kernel cache to reduce the - // initialization time. + // 1. Set compiled OpenCL kernel cache, this is used to reduce the + // initialization time since the compiling is too slow. It's suggested + // to set this even when pre-compiled OpenCL program file is provided + // because the OpenCL version upgrade may also leads to kernel + // recompilations. const std::string file_path ="path/to/opencl_cache_file"; std::shared_ptr storage_factory( new FileStorageFactory(file_path)); ConfigKVStorageFactory(storage_factory); - // 2. Declare the device type (must be same with ``runtime`` in deployment file) + // 2. Declare the device type (must be same with ``runtime`` in configuration file) DeviceType device_type = DeviceType::GPU; // 3. Define the input and output tensor names. @@ -266,15 +318,8 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. The following li // 4. 
Create MaceEngine instance std::shared_ptr engine; MaceStatus create_engine_status; - // If the build_type is code, create Engine from compiled code - create_engine_status = - CreateMaceEngineFromCode(model_name.c_str(), - nullptr, - input_names, - output_names, - device_type, - &engine); - // If the build_type is proto, Create Engine from model file + + // Create Engine from model file create_engine_status = CreateMaceEngineFromProto(model_pb_data, model_data_file.c_str(), @@ -312,6 +357,6 @@ Please refer to \ ``mace/examples/example.cc``\ for full usage. The following li } // 6. Run the model - engine->Run(inputs, &outputs); + MaceStatus status = engine.Run(inputs, &outputs); -More details in `advanced_usage`. \ No newline at end of file +More details are in :doc:`advanced_usage`. \ No newline at end of file diff --git a/docs/user_guide/create_a_model_deployment.rst b/docs/user_guide/create_a_model_deployment.rst index 30090cb6..871d55a2 100644 --- a/docs/user_guide/create_a_model_deployment.rst +++ b/docs/user_guide/create_a_model_deployment.rst @@ -19,16 +19,14 @@ Here is an example deployment file used by an Android demo application. TODO: change this example file to the demo deployment file (reuse the same file) and rename to a reasonable name. -.. literalinclude:: models/demo_app_models.yaml +.. literalinclude:: models/demo_app_models.yml :language: yaml Configurations -------------------- .. list-table:: - :widths: auto :header-rows: 1 - :align: left * - library_name - library name. diff --git a/docs/user_guide/how_to_build.rst b/docs/user_guide/how_to_build.rst new file mode 100644 index 00000000..59d12a23 --- /dev/null +++ b/docs/user_guide/how_to_build.rst @@ -0,0 +1,406 @@ +How to build +============ + +Supported Platforms +------------------- + +.. list-table:: + :header-rows: 1 + + * - Platform + - Explanation + * - TensorFlow + - >= 1.6.0. + * - Caffe + - >= 1.0. + +Usage +-------- + +======================================= +1. Pull MACE source code +======================================= + +.. code:: sh + + git clone https://github.com/XiaoMi/mace.git + git fetch --all --tags --prune + + # Checkout the latest tag (i.e. release version) + tag_name=`git describe --abbrev=0 --tags` + git checkout tags/${tag_name} + +.. note:: + + It's highly recommended to use a release version instead of master branch. + +============================ +2. Model Preprocessing +============================ + +- TensorFlow + +TensorFlow provides +`Graph Transform Tool `__ +to improve inference efficiency by making various optimizations like Ops +folding, redundant node removal etc. It's strongly recommended to make these +optimizations before graph conversion step. + +The following commands show the suggested graph transformations and +optimizations for different runtimes, + +.. code:: sh + + # CPU/GPU: + ./transform_graph \ + --in_graph=tf_model.pb \ + --out_graph=tf_model_opt.pb \ + --inputs='input' \ + --outputs='output' \ + --transforms='strip_unused_nodes(type=float, shape="1,64,64,3") + strip_unused_nodes(type=float, shape="1,64,64,3") + remove_nodes(op=Identity, op=CheckNumerics) + fold_constants(ignore_errors=true) + flatten_atrous_conv + fold_batch_norms + fold_old_batch_norms + strip_unused_nodes + sort_by_execution_order' + +.. 
code:: sh + + # DSP: + ./transform_graph \ + --in_graph=tf_model.pb \ + --out_graph=tf_model_opt.pb \ + --inputs='input' \ + --outputs='output' \ + --transforms='strip_unused_nodes(type=float, shape="1,64,64,3") + strip_unused_nodes(type=float, shape="1,64,64,3") + remove_nodes(op=Identity, op=CheckNumerics) + fold_constants(ignore_errors=true) + fold_batch_norms + fold_old_batch_norms + backport_concatv2 + quantize_weights(minimum_size=2) + quantize_nodes + strip_unused_nodes + sort_by_execution_order' + +- Caffe + +MACE converter only supports Caffe 1.0+, you need to upgrade +your models with Caffe built-in tool when necessary, + +.. code:: bash + + # Upgrade prototxt + $CAFFE_ROOT/build/tools/upgrade_net_proto_text MODEL.prototxt MODEL.new.prototxt + + # Upgrade caffemodel + $CAFFE_ROOT/build/tools/upgrade_net_proto_binary MODEL.caffemodel MODEL.new.caffemodel + +============================== +3. Build static/shared library +============================== + +----------------- +3.1 Overview +----------------- +MACE can build either static or shared library (which is +specified by ``linkshared`` in YAML model deployment file). +The followings are two use cases. + +* **Build well tuned library for specific SoCs** + + When ``target_socs`` is specified in YAML model deployment file, the build + tool will enable automatic tuning for GPU kernels. This usually takes some + time to finish depending on the complexity of your model. + + .. note:: + + You should plug in device(s) with the correspoding SoC(s). + +* **Build generic library for all SoCs** + + When ``target_socs`` is not specified, the generated library is compatible + with general devices. + + .. note:: + + There will be around of 1 ~ 10% performance drop for GPU + runtime compared to the well tuned library. + +MACE provide command line tool (``tools/converter.py``) for +model conversion, compiling, test run, benchmark and correctness validation. + +.. note:: + + 1. ``tools/converter.py`` should be run at the root directory of this project. + 2. When ``linkshared`` is set to ``1``, ``build_type`` should be ``proto``. + And currently only android devices supported. + + +------------------------------------------ +3.2 \ ``tools/converter.py``\ usage +------------------------------------------ + +**Commands** + + * **build** + + build library and test tools. + + .. code:: sh + + # Build library + python tools/converter.py build --config=models/config.yaml + + + + * **run** + + run the model(s). + + .. code:: sh + + # Test model run time + python tools/converter.py run --config=models/config.yaml --round=100 + + # Validate the correctness by comparing the results against the + # original model and framework, measured with cosine distance for similarity. + python tools/converter.py run --config=models/config.yaml --validate + + # Check the memory usage of the model(**Just keep only one model in configuration file**) + python tools/converter.py run --config=models/config.yaml --round=10000 & + sleep 5 + adb shell dumpsys meminfo | grep mace_run + kill %1 + + + .. warning:: + + ``run`` rely on ``build`` command, you should ``run`` after ``build``. + + * **benchmark** + + benchmark and profiling model. + + .. code:: sh + + # Benchmark model, get detailed statistics of each Op. + python tools/converter.py benchmark --config=models/config.yaml + + + .. warning:: + + ``benchmark`` rely on ``build`` command, you should ``benchmark`` after ``build``. + +**Common arguments** + + .. 
list-table:: + :header-rows: 1 + + * - option + - type + - default + - commands + - explanation + * - --omp_num_threads + - int + - -1 + - ``run``/``benchmark`` + - number of threads + * - --cpu_affinity_policy + - int + - 1 + - ``run``/``benchmark`` + - 0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY + * - --gpu_perf_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + * - --gpu_perf_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + * - --gpu_priority_hint + - int + - 3 + - ``run``/``benchmark`` + - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH + +Using ``-h`` to get detailed help. + +.. code:: sh + + python tools/converter.py -h + python tools/converter.py build -h + python tools/converter.py run -h + python tools/converter.py benchmark -h + + +============= +4. Deployment +============= + +``build`` command will generate the static/shared library, model files and +header files and package them as +``build/${library_name}/libmace_${library_name}.tar.gz``. + +- The generated ``static`` libraries are organized as follows, + +.. code:: + + build/ + └── mobilenet-v2-gpu + ├── include + │   └── mace + │   └── public + │   ├── mace.h + │   └── mace_runtime.h + ├── libmace_mobilenet-v2-gpu.tar.gz + ├── lib + │   ├── arm64-v8a + │   │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + │   └── armeabi-v7a + │   └── libmace_mobilenet-v2-gpu.MI6.msm8998.a + ├── model + │   ├── mobilenet_v2.data + │   └── mobilenet_v2.pb + └── opencl + ├── arm64-v8a + │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + └── armeabi-v7a + └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + +- The generated ``shared`` libraries are organized as follows, + +.. code:: + + build + └── mobilenet-v2-gpu + ├── include + │   └── mace + │   └── public + │   ├── mace.h + │   └── mace_runtime.h + ├── lib + │   ├── arm64-v8a + │   │   ├── libgnustl_shared.so + │   │   └── libmace.so + │   └── armeabi-v7a + │   ├── libgnustl_shared.so + │   └── libmace.so + ├── model + │   ├── mobilenet_v2.data + │   └── mobilenet_v2.pb + └── opencl + ├── arm64-v8a + │   └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + └── armeabi-v7a + └── mobilenet-v2-gpu_compiled_opencl_kernel.MI6.msm8998.bin + +.. note:: + + 1. DSP runtime depends on ``libhexagon_controller.so``. + 2. ``${MODEL_TAG}.pb`` file will be generated only when ``build_type`` is ``proto``. + 3. ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` will + be generated only when ``target_socs`` and ``gpu`` runtime are specified. + 4. Generated shared library depends on ``libgnustl_shared.so``. + +.. warning:: + + ``${library_name}_compiled_opencl_kernel.${device_name}.${soc}.bin`` depends + on the OpenCL version of the device, you should maintan the compatibility or + configure compiling cache store with ``ConfigKVStorageFactory``. + +========================================= +5. How to use the library in your project +========================================= + +Please refer to \ ``mace/examples/example.cc``\ for full usage. The following list the key steps. + +.. code:: cpp + + // Include the headers + #include "mace/public/mace.h" + #include "mace/public/mace_runtime.h" + // If the build_type is code + #include "mace/public/mace_engine_factory.h" + + // 0. Set pre-compiled OpenCL binary program file paths when available + if (device_type == DeviceType::GPU) { + mace::SetOpenCLBinaryPaths(opencl_binary_paths); + } + + // 1. 
Set compiled OpenCL kernel cache, this is used to reduce the + // initialization time since the compiling is too slow. It's suggested + // to set this even when pre-compiled OpenCL program file is provided + // because the OpenCL version upgrade may also leads to kernel + // recompilations. + const std::string file_path ="path/to/opencl_cache_file"; + std::shared_ptr storage_factory( + new FileStorageFactory(file_path)); + ConfigKVStorageFactory(storage_factory); + + // 2. Declare the device type (must be same with ``runtime`` in configuration file) + DeviceType device_type = DeviceType::GPU; + + // 3. Define the input and output tensor names. + std::vector input_names = {...}; + std::vector output_names = {...}; + + // 4. Create MaceEngine instance + std::shared_ptr engine; + MaceStatus create_engine_status; + // Create Engine from compiled code + create_engine_status = + CreateMaceEngineFromCode(model_name.c_str(), + nullptr, + input_names, + output_names, + device_type, + &engine); + // Create Engine from model file + create_engine_status = + CreateMaceEngineFromProto(model_pb_data, + model_data_file.c_str(), + input_names, + output_names, + device_type, + &engine); + if (create_engine_status != MaceStatus::MACE_SUCCESS) { + // Report error + } + + // 5. Create Input and Output tensor buffers + std::map inputs; + std::map outputs; + for (size_t i = 0; i < input_count; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); + // Load input here + // ... + + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + } + + for (size_t i = 0; i < output_count; ++i) { + int64_t output_size = + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + } + + // 6. Run the model + MaceStatus status = engine.Run(inputs, &outputs); + diff --git a/docs/user_guide/models/demo_app_models.yml b/docs/user_guide/models/demo_app_models.yml index df130c4b..8b126b44 100644 --- a/docs/user_guide/models/demo_app_models.yml +++ b/docs/user_guide/models/demo_app_models.yml @@ -1,39 +1,53 @@ # The name of library -library_name: mobilenet +library_name: mobile_squeeze +# host, armeabi-v7a or arm64-v8a target_abis: [arm64-v8a] +# set 1 to embed model weights data into code. default is 0, keep weights in model.data file embed_model_data: 1 # The build mode for model(s). -# 'code' stand for transfer model(s) into cpp code, 'proto' for model(s) in protobuf file(s). +# 'code' for transferring model(s) into cpp code, 'proto' for keeping model(s) in protobuf file(s). build_type: code +# 0 for static library, 1 for shared library. linkshared: 0 -# One yaml config file can contain multi models' config message. +# One yaml config file can contain multi models' deployment info. models: - mobilenet_v1: # model tag, which will be used in model loading and must be specific. 
- platform: tensorflow - # support local path, http:// and https:// - model_file_path: https://cnbj1.fds.api.xiaomi.com/mace/miai-models/mobilenet-v1/mobilenet-v1-1.0.pb - model_sha256_checksum: 71b10f540ece33c49a7b51f5d4095fc9bd78ce46ebf0300487b2ee23d71294e6 - subgraphs: - - input_tensors: input - input_shapes: 1,224,224,3 - output_tensors: MobilenetV1/Predictions/Reshape_1 - output_shapes: 1,1001 - runtime: cpu+gpu - limit_opencl_kernel_time: 0 - nnlib_graph_mode: 0 - obfuscate: 0 - winograd: 0 - mobilenet_v2: - platform: tensorflow - model_file_path: https://cnbj1.fds.api.xiaomi.com/mace/miai-models/mobilenet-v2/mobilenet-v2-1.0.pb - model_sha256_checksum: 369f9a5f38f3c15b4311c1c84c032ce868da9f371b5f78c13d3ea3c537389bb4 - subgraphs: - - input_tensors: input - input_shapes: 1,224,224,3 - output_tensors: MobilenetV2/Predictions/Reshape_1 - output_shapes: 1,1001 - runtime: cpu+gpu - limit_opencl_kernel_time: 0 - nnlib_graph_mode: 0 - obfuscate: 0 - winograd: 0 + mobilenet_v1: + platform: tensorflow + model_file_path: https://cnbj1.fds.api.xiaomi.com/mace/miai-models/mobilenet-v1/mobilenet-v1-1.0.pb + model_sha256_checksum: 71b10f540ece33c49a7b51f5d4095fc9bd78ce46ebf0300487b2ee23d71294e6 + subgraphs: + - input_tensors: + - input + input_shapes: + - 1,224,224,3 + output_tensors: + - MobilenetV1/Predictions/Reshape_1 + output_shapes: + - 1,1001 + validation_inputs_data: + - https://cnbj1.fds.api.xiaomi.com/mace/inputs/dog.npy + runtime: cpu+gpu + limit_opencl_kernel_time: 0 + nnlib_graph_mode: 0 + obfuscate: 0 + winograd: 0 + squeezenet_v11: + platform: caffe + model_file_path: http://cnbj1-inner-fds.api.xiaomi.net/mace/mace-models/squeezenet/SqueezeNet_v1.1/model.prototxt + weight_file_path: http://cnbj1-inner-fds.api.xiaomi.net/mace/mace-models/squeezenet/SqueezeNet_v1.1/weight.caffemodel + model_sha256_checksum: 625c952063da1569e22d2f499dc454952244d42cd8feca61f05502566e70ae1c + weight_sha256_checksum: 72b912ace512e8621f8ff168a7d72af55910d3c7c9445af8dfbff4c2ee960142 + subgraphs: + - input_tensors: + - data + input_shapes: + - 1,227,227,3 + output_tensors: + - prob + output_shapes: + - 1,1,1,1000 + runtime: cpu+gpu + limit_opencl_kernel_time: 0 + nnlib_graph_mode: 0 + obfuscate: 0 + winograd: 0 \ No newline at end of file -- GitLab