From 31601a5e16f58afeac551341ff6ec27c8e4b024a Mon Sep 17 00:00:00 2001
From: Bin Li <libin11@xiaomi.com>
Date: Tue, 17 Mar 2020 17:05:02 +0800
Subject: [PATCH] Support Armv8.2+dotproduct

---
 WORKSPACE                              |  6 +++---
 docs/user_guide/quantization_usage.rst | 29 +++++++++++++++++---------
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index e07d66b9..2bbd89da 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -78,10 +78,10 @@ new_http_archive(
 
 http_archive(
     name = "gemmlowp",
-    sha256 = "f340384e7728cea605e83597593699dfe8d13ff333b834d24c256935e3dc1758",
-    strip_prefix = "gemmlowp-master-48c0547a046d49b466aa01e3a82a18028f288924",
+    sha256 = "d445e5a0ef6ae18dcb68adb3c38245708cb357c0e1e51cb752dd933b7c975314",
+    strip_prefix = "gemmlowp-76272a197495297154e97cdcb624a52581165497",
     urls = [
-        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-master-48c0547a046d49b466aa01e3a82a18028f288924.zip",
+        "http://cnbj1.fds.api.xiaomi.com/mace/third-party/gemmlowp/gemmlowp-76272a197495297154e97cdcb624a52581165497.zip",
     ],
 )
 
diff --git a/docs/user_guide/quantization_usage.rst b/docs/user_guide/quantization_usage.rst
index 205f854f..159af8c6 100644
--- a/docs/user_guide/quantization_usage.rst
+++ b/docs/user_guide/quantization_usage.rst
@@ -32,7 +32,7 @@ Post-training quantization
 ---------------------------
 MACE supports post-training quantization if you want to take a chance to quantize model directly without fine tuning.
 This method requires developer to calculate tensor range of each activation layer statistically using sample inputs.
-MACE provides tools to do statistics with following steps(using `inception-v3` from `MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__ as an example):
+MACE provides tools to collect these statistics with the following steps (using `inception-v3` from `MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__ as an example):
 
   1. Convert original model to run on CPU host without obfuscation (by setting `target_abis` to `host`, `runtime` to `cpu`,
   and `obfuscate` to `0`, appending `:0` to `output_tensors` if missing in yaml config).
@@ -101,11 +101,18 @@ ARM CPU is ubiquitous, which can speed up most of edge devices. However, AI spec
 than ARM CPU, and in the meantime consume much lower power. Headers and libraries of these devices can be found in `third_party`
 directory.
 
-* **To run models on Hexagon DSP, users should**
+* To run models on **ARM CPU**, users should
 
-  1. Make sure SOCs of the phone is manufactured by Qualcomm and has HVX supported.
+  1. Set `runtime` in yaml config to `cpu` (`Armv8.2+dotproduct` instructions will be used automatically
+     if detected by `getauxval`, which can greatly improve convolution/gemm performance).
+
+* To run models on **Hexagon DSP**, users should
 
-  2. Make sure the phone disables secure boot (once enabled, cannot be reversed, so you probably can only get that type
+  1. Set `runtime` in yaml config to `dsp`.
+
+  2. Make sure the SoC of the phone is manufactured by Qualcomm and supports HVX.
+
+  3. Make sure the phone disables secure boot (once enabled, it cannot be reversed, so you probably can only get that type of
      phones from manufacturers). This can be checked by executing the following command.
 
    .. code-block:: sh
@@ -114,12 +121,12 @@ directory.
 
    The return value should be 0.
 
-  3. Root the phone.
+  4. Root the phone.
 
-  4. Sign the phone by using testsig provided by Qualcomm. (Download Qualcomm Hexagon SDK first, plugin the phone to PC,
+  5. Sign the phone by using testsig provided by Qualcomm. (Download the Qualcomm Hexagon SDK first, plug the phone into the PC,
      run scripts/testsig.py)
 
-  5. Push `third_party/nnlib/v6x/libhexagon_nn_skel.so` to `/system/vendor/lib/rfsa/adsp/`. You can check
+  6. Push `third_party/nnlib/v6x/libhexagon_nn_skel.so` to `/system/vendor/lib/rfsa/adsp/`. You can check
      `docs/feature_matrix.html` in Hexagon SDK to make sure which version to use.
 
 Then, there you go, you can run Mace on Hexagon DSP. This indeed seems like a whole lot of work to do. Well, the good news
@@ -127,8 +134,10 @@ is that starting in the SM8150 family(some devices with old firmware may still n
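-module offload is enabled on cDSP. So, steps 2-4 can be skipped. This can be achieved by calling `SetHexagonToUnsignedPD()`
+module offload is enabled on cDSP. So, steps 3-5 can be skipped. This can be achieved by calling `SetHexagonToUnsignedPD()`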
 before creating MACE engine.
 
-* **To run models on MediaTek APU, users should**
+* To run models on **MediaTek APU**, users should
+
+  1. Set `runtime` in yaml config to `apu`.
 
-  1. Make sure SOCs of the phone is manufactured by MediaTek and has APU supported.
+  2. Make sure the SoC of the phone is manufactured by MediaTek and supports APU.
 
-  2. Push `third_party/apu/mtxxxx/libapu-platform.so` to `/vendor/lib64/`.
+  3. Push `third_party/apu/mtxxxx/libapu-platform.so` to `/vendor/lib64/`.
-- 
GitLab