Commit 0fec0882 authored by 张志敏

Merge branch 'master' into 'micro_cmsis'

# Conflicts:
#   tools/converter.py
#   tools/python/convert.py
#   tools/python/transform/base_converter.py
#   tools/python/utils/config_parser.py
......@@ -42,16 +42,16 @@ build_docs:
cmake_build_android-armeabi-v7a:
stage: build
script:
- RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
- RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh
- LIBMACE32_FULL_SIZE=`stat -c%s build/cmake-build/armeabi-v7a/install/lib/libmace.so`
- if (( LIBMACE32_FULL_SIZE > 2200000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
- if (( LIBMACE32_FULL_SIZE > 1400000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
cmake_build_android-arm64-v8:
stage: build
script:
- RUNTIME=GPU bash tools/cmake/cmake-build-arm64-v8a.sh
- RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-arm64-v8a.sh
- LIBMACE64_FULL_SIZE=`stat -c%s build/cmake-build/arm64-v8a/install/lib/libmace.so`
- if (( LIBMACE64_FULL_SIZE > 3100000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
- if (( LIBMACE64_FULL_SIZE > 2300000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
bazel_build:
stage: build
......@@ -59,10 +59,14 @@ bazel_build:
- bash tools/bazel_build_standalone_lib.sh
- bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu
- bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu
- LIBMACE32_FULL_SIZE=`stat -c%s build/lib/armeabi-v7a/libmace.so`
- if (( LIBMACE32_FULL_SIZE > 1400000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
- bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu,dsp
- bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu,apu
- bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu
- bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu
- LIBMACE64_FULL_SIZE=`stat -c%s build/lib/arm64-v8a/libmace.so`
- if (( LIBMACE64_FULL_SIZE > 2300000 )) ; then echo "The libmace.so size is too large"; exit 1; fi
- bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu,dsp
- bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu,apu
- bash tools/bazel_build_standalone_lib.sh --abi=arm_linux_gnueabihf --runtimes=cpu
......
......@@ -175,7 +175,7 @@ After that you can rebuild the engine.
.. code-block:: bash
RUNTIME=GPU RUNMODE=code bash tools/cmake/cmake-build-armeabi-v7a.sh
RUNTIME=GPU RUNMODE=code QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh
``RUNMODE=code`` means you compile and link the model library together with the MACE engine.
......
......@@ -45,7 +45,7 @@ Here we use the mobilenet-v2 model as an example.
cd path/to/mace
# Build library
# output lib path: build/lib
bash tools/bazel_build_standalone_lib.sh [-abi=abi][-runtimes=rt1,rt2,...][-static]
bash tools/bazel_build_standalone_lib.sh [-abi=abi][-runtimes=rt1,rt2,...][-quantize][-static]
.. note::
......
......@@ -20,7 +20,7 @@ Please make sure you have CMake installed.
.. code-block:: sh
RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh
which generates libraries in ``build/cmake-build/armeabi-v7a``; you can use either the static libraries or the ``libmace.so`` shared library.
......
......@@ -20,7 +20,7 @@
.. code-block:: sh
RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh
The build output is installed to ``build/cmake-build/armeabi-v7a``; you can use either the static or the shared libmace library.
......
......@@ -37,7 +37,7 @@
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif // MACE_OS_MAC
#include "CL/cl2.hpp"
#include <CL/cl2.hpp>
#ifdef MACE_OS_MAC
#pragma GCC diagnostic pop
......
......@@ -69,6 +69,8 @@ enum FrameworkType {
TENSORFLOW = 0,
CAFFE = 1,
ONNX = 2,
MEGENGINE = 3,
PYTORCH = 4
};
template <typename T>
......
......@@ -144,7 +144,7 @@ inline void vst3q(float *ptr, float32x4x3_t v) {
}
inline float32x8_t vld1o(float *ptr) {
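// float32x8_t (like the NEON float32x4xN_t types) wraps an inner array of
// float32x4_t, so aggregate initialization needs nested braces; the
// single-brace form triggers clang's -Wmissing-braces.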
return {vld1q_f32(ptr), vld1q_f32(ptr + 4)};
return {{vld1q_f32(ptr), vld1q_f32(ptr + 4)}};
}
inline void vst1o(float *ptr, float32x8_t v) {
......@@ -209,8 +209,8 @@ inline float32x4_t vld1q(const BFloat16 *ptr) {
// load of 2 4D vectors and perform de-interleaving
inline float32x4x2_t vld2q_bf16(const uint16_t *ptr) {
uint16x4x2_t u = vld2_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16))};
return {{vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16))}};
}
inline float32x4x2_t vld2q_bf16(const BFloat16 *ptr) {
......@@ -228,9 +228,9 @@ inline float32x4x2_t vld2q(const BFloat16 *ptr) {
// load of 3 4D vectors and perform de-interleaving
inline float32x4x3_t vld3q_bf16(const uint16_t *ptr) {
uint16x4x3_t u = vld3_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[2], 16))};
return {{vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[2], 16))}};
}
inline float32x4x3_t vld3q_bf16(const BFloat16 *ptr) {
......@@ -264,8 +264,8 @@ inline void vst1q(BFloat16 *ptr, const float32x4_t v) {
// store of 2 4D vectors and perform interleaving
inline void vst2q_bf16(uint16_t *ptr, const float32x4x2_t v) {
uint16x4x2_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)};
uint16x4x2_t u = {{vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)}};
vst2_u16(ptr, u);
}
......@@ -283,9 +283,9 @@ inline void vst2q(BFloat16 *ptr, const float32x4x2_t v) {
// store of 3 4D vectors and perform interleaving
inline void vst3q_bf16(uint16_t *ptr, const float32x4x3_t v) {
uint16x4x3_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
uint16x4x3_t u = {{vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[2]), 16)};
vshrn_n_u32(vreinterpretq_u32_f32(v.val[2]), 16)}};
vst3_u16(ptr, u);
}
......@@ -304,8 +304,8 @@ inline void vst3q(BFloat16 *ptr, const float32x4x3_t v) {
// load of 8D vector
inline float32x8_t vld1o_bf16(const uint16_t *ptr) {
uint16x8_t u = vld1q_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(u), 16)),
vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(u), 16))};
return {{vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(u), 16)),
vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(u), 16))}};
}
inline float32x8_t vld1o_bf16(const BFloat16 *ptr) {
......
......@@ -169,7 +169,7 @@ void RegisterReshape(OpRegistry *op_registry) {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (has_data_format) {
if (has_data_format && op->input_size() == 1) {
return {DeviceType::CPU, DeviceType::GPU};
}
......@@ -183,7 +183,8 @@ void RegisterReshape(OpRegistry *op_registry) {
op->output_shape(0).dims_size();
if (op_data_format == DataFormat::NHWC &&
4 == tensor_shape_info->at(input_0).size() &&
(out_dims_size == 4 || out_dims_size == 2)) {
(out_dims_size == 4 || out_dims_size == 2) &&
op->input_size() == 1) {
return {DeviceType::CPU, DeviceType::GPU};
}
......
......@@ -492,6 +492,7 @@ TEST_F(DepthwiseConv2dOpTest, Quant) {
TestQuant(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2});
}
#ifdef MACE_ENABLE_BFLOAT16
namespace {
void TestBFloat16(const index_t batch,
const index_t multiplier,
......@@ -557,6 +558,8 @@ TEST_F(DepthwiseConv2dOpTest, BFloat16) {
TestBFloat16(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2});
}
#endif // MACE_ENABLE_BFLOAT16
} // namespace test
} // namespace ops
} // namespace mace
......@@ -12,9 +12,12 @@ genrule(
cmd = "workdir=$$(mktemp -d -t opencl-clhpp-build.XXXXXXXXXX); cp -aL $$(dirname $(location CMakeLists.txt))/* $$workdir; pushd $$workdir; mkdir build; pushd build; cmake ../ -DBUILD_DOCS=OFF -DBUILD_EXAMPLES=OFF -DBUILD_TESTS=OFF; make generate_clhpp generate_cl2hpp; popd; popd; cp -a $$workdir/build/* $(@D); rm -rf $$workdir; echo installing to $(@D)",
)
# The `srcs` files are not used in C++ code, but we need them to trigger the
# `genrule`, so we add "include/CL/cl.hpp" and "include/CL/cl2.hpp" to `srcs`;
# these two files are imported via `includes` instead of `srcs`.
cc_library(
name = "opencl_clhpp",
hdrs = ["include/CL/cl.hpp", "include/CL/cl2.hpp"],
strip_include_prefix = "include",
includes = ["include"],
srcs = ["include/CL/cl.hpp", "include/CL/cl2.hpp"],
visibility = ["//visibility:public"],
)
......@@ -10,6 +10,7 @@ build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
build --copt=-DMACE_OBFUSCATE_LITERALS
build --copt=-DGEMMLOWP_USE_MACE_THREAD_POOL
build --copt=-DMACE_DEPTHWISE_U8_USE_MULTI_THREAD
build --copt=-O2
# Usage example: bazel build --config android
build:android --define linux_base=true
......@@ -20,6 +21,7 @@ build:android --linkopt=-lm
build:android --distinct_host_configuration=true
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:android --copt=-Oz
build:android --copt -Wall
build:android --copt -Wno-mismatched-tags
build:android --copt -Wno-missing-braces
......@@ -75,20 +77,27 @@ build:aarch64_linux_gnu --copt -Wno-implicit-fallthrough
# Usage example: bazel build --config optimization
build:optimization -c opt
build:optimization --copt=-O3
build:optimization --linkopt=-Wl,--strip-all
build:optimization --copt=-ffunction-sections
build:optimization --copt=-fdata-sections
build:optimization --linkopt=-Wl,--gc-sections
build:optimization --copt=-fno-rtti
build:optimization --copt=-fno-exceptions
build:optimization --copt=-DGOOGLE_PROTOBUF_NO_RTTI
build:optimization --copt=-DPROTOBUF_USE_EXCEPTIONS=0
# Usage example: bazel build --config optimization_darwin
build:optimization_darwin --copt=-O3
build:optimization_darwin --copt=-ffunction-sections
build:optimization_darwin --copt=-fdata-sections
build:optimization_darwin --linkopt=-Wl,-dead_strip
build:optimization_darwin --copt=-fno-rtti
build:optimization_darwin --copt=-fno-exceptions
build:optimization_darwin --copt=-DGOOGLE_PROTOBUF_NO_RTTI
build:optimization_darwin --copt=-DPROTOBUF_USE_EXCEPTIONS=0
# Usage example: bazel build --config symbol_hidden
build:symbol_hidden --copt=-fvisibility=hidden
build:symbol_hidden --copt=-fvisibility-inlines-hidden
# Usage example: bazel build --config debug
build:debug -c dbg
......
......@@ -40,8 +40,8 @@ enable_cpu=true
enable_gpu=false
enable_dsp=false
enable_apu=false
enable_quantize=true
enable_bfloat16=true
enable_quantize=false
enable_bfloat16=false
enable_rpcmem=true
static_lib=false
symbol_hidden=
......@@ -97,6 +97,12 @@ for opt in "${@}";do
static|-static|--static)
static_lib=true
;;
quantize|-quantize|--quantize)
enable_quantize=true
;;
bfloat16|-bfloat16|--bfloat16)
enable_bfloat16=true
;;
help|-help|--help)
helper
;;
......
......@@ -17,14 +17,24 @@ if [[ "$RUNMODE" == "code" ]]; then
MACE_ENABLE_CODE_MODE=ON
fi
MACE_ENABLE_QUANTIZE=OFF
if [[ "$QUANTIZE" == "ON" ]]; then
MACE_ENABLE_QUANTIZE=ON
fi
DMACE_ENABLE_BFLOAT16=OFF
if [[ "$BFLOAT16" == "ON" ]]; then
DMACE_ENABLE_BFLOAT16=ON
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \
-DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/aarch64-linux-gnu.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE} \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_BFLOAT16=${DMACE_ENABLE_BFLOAT16} \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -17,19 +17,29 @@ if [[ "$RUNTIME" == "GPU" ]]; then
MACE_ENABLE_OPENCL=ON
fi
MACE_ENABLE_QUANTIZE=OFF
if [[ "$QUANTIZE" == "ON" ]]; then
MACE_ENABLE_QUANTIZE=ON
fi
DMACE_ENABLE_BFLOAT16=OFF
if [[ "$BFLOAT16" == "ON" ]]; then
DMACE_ENABLE_BFLOAT16=ON
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \
-DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/arm-linux-gnueabihf.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE} \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_BFLOAT16=${DMACE_ENABLE_BFLOAT16} \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
-DMACE_ENABLE_BENCHMARKS=ON \
-DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
-DCMAKE_INSTALL_PREFIX=install \
../../..
make -j$(nproc) VERBOSE=1 && make install
......
......@@ -26,6 +26,16 @@ if [[ "$RUNMODE" == "code" ]]; then
MACE_ENABLE_CODE_MODE=ON
fi
MACE_ENABLE_QUANTIZE=OFF
if [[ "$QUANTIZE" == "ON" ]]; then
MACE_ENABLE_QUANTIZE=ON
fi
DMACE_ENABLE_BFLOAT16=OFF
if [[ "$BFLOAT16" == "ON" ]]; then
DMACE_ENABLE_BFLOAT16=ON
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DANDROID_ABI="arm64-v8a" \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
......@@ -33,12 +43,12 @@ cmake -DANDROID_ABI="arm64-v8a" \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_STL=c++_shared \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE} \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_BFLOAT16=${DMACE_ENABLE_BFLOAT16} \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -27,6 +27,16 @@ if [[ "$RUNMODE" == "code" ]]; then
MACE_ENABLE_CODE_MODE=ON
fi
MACE_ENABLE_QUANTIZE=OFF
if [[ "$QUANTIZE" == "ON" ]]; then
MACE_ENABLE_QUANTIZE=ON
fi
DMACE_ENABLE_BFLOAT16=OFF
if [[ "$BFLOAT16" == "ON" ]]; then
DMACE_ENABLE_BFLOAT16=ON
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DANDROID_ABI="armeabi-v7a" \
-DANDROID_ARM_NEON=ON \
......@@ -35,12 +45,12 @@ cmake -DANDROID_ABI="armeabi-v7a" \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_STL=c++_shared \
-DMACE_ENABLE_NEON=ON \
-DMACE_ENABLE_QUANTIZE=ON \
-DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE} \
-DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL} \
-DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP} \
-DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA} \
-DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU} \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_BFLOAT16=${DMACE_ENABLE_BFLOAT16} \
-DMACE_ENABLE_OPT_SIZE=ON \
-DMACE_ENABLE_OBFUSCATE=ON \
-DMACE_ENABLE_TESTS=ON \
......
......@@ -13,12 +13,16 @@ if [[ "$RUNMODE" == "code" ]]; then
MACE_ENABLE_CODE_MODE=ON
fi
DMACE_ENABLE_BFLOAT16=OFF
if [[ "$BFLOAT16" == "ON" ]]; then
DMACE_ENABLE_BFLOAT16=ON
fi
mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
cmake -DMACE_ENABLE_NEON=OFF \
-DMACE_ENABLE_QUANTIZE=OFF \
-DMACE_ENABLE_OPENCL=OFF \
-DMACE_ENABLE_BFLOAT16=ON \
-DMACE_ENABLE_BFLOAT16=${DMACE_ENABLE_BFLOAT16} \
-DMACE_ENABLE_TESTS=ON \
-DMACE_ENABLE_BENCHMARKS=ON \
-DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE} \
......
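The cmake build scripts above now read ``QUANTIZE`` and ``BFLOAT16`` from the environment, defaulting both to OFF. A sketch of how a caller would enable them (shown via Python for consistency with the other examples; plain shell works the same):

.. code-block:: python

    import os
    import subprocess

    # QUANTIZE and BFLOAT16 are read by the scripts patched above;
    # RUNTIME=GPU additionally switches on the OpenCL build.
    env = dict(os.environ, RUNTIME="GPU", QUANTIZE="ON", BFLOAT16="ON")
    subprocess.run(["bash", "tools/cmake/cmake-build-armeabi-v7a.sh"],
                   env=env, check=True)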
......@@ -61,7 +61,8 @@ PlatformTypeStrs = [
"caffe",
"onnx",
"megengine",
"keras"
"keras",
"pytorch",
]
PlatformType = Enum('PlatformType', [(ele, ele) for ele in PlatformTypeStrs],
type=str)
......@@ -521,6 +522,13 @@ def format_model_config(flags):
if not isinstance(value, list):
subgraph[key] = [value]
subgraph[key] = [str(v) for v in subgraph[key]]
# --input_shapes will be passed to the ELF binary `mace_run_static'. If
# input_shapes contains spaces, e.g. '1, 3, 224, 224', then because mace_run.cc
# uses gflags to parse command line arguments, `--input_shapes 1, 3, 224, 224`
# arrives as `--input_shapes 1,'. So we strip out the spaces here.
if key in [YAMLKeyword.input_shapes,
YAMLKeyword.output_shapes]:
subgraph[key] = [e.replace(' ', '') for e in subgraph[key]]
input_size = len(subgraph[YAMLKeyword.input_tensors])
output_size = len(subgraph[YAMLKeyword.output_tensors])
......
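The comment above can be reproduced with a small standalone sketch (illustrative only, not repo code): shell word-splitting breaks a shape string containing spaces into several argv entries, so a gflags-style parser only ever sees the first fragment.

.. code-block:: python

    import shlex

    # The shell splits on whitespace, so '1, 3, 224, 224' becomes four
    # argv entries and --input_shapes only receives '1,'.
    print(shlex.split("mace_run_static --input_shapes 1, 3, 224, 224"))
    # -> ['mace_run_static', '--input_shapes', '1,', '3,', '224,', '224']

    # With the spaces stripped, the whole shape survives as one token.
    print(shlex.split("mace_run_static --input_shapes 1,3,224,224"))
    # -> ['mace_run_static', '--input_shapes', '1,3,224,224']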
......@@ -632,6 +632,9 @@ class DeviceWrapper:
'Run model {} on {}'.format(model_name, self.device_name)))
model_config = configs[YAMLKeyword.models][model_name]
if model_config[YAMLKeyword.platform] == 'pytorch':
mace_check(flags.layers == "-1", "Device",
'extracting intermediate layer output is not supported in pytorch JIT yet') # noqa
model_runtime = model_config[YAMLKeyword.runtime]
subgraphs = model_config[YAMLKeyword.subgraphs]
......
......@@ -196,6 +196,10 @@ def convert_model(conf, quantize_stat):
from transform import keras_converter
converter = keras_converter.KerasConverter(
option, conf["model_file_path"])
elif platform == Platform.PYTORCH:
from transform import pytorch_converter
converter = pytorch_converter.PytorchConverter(
option, conf["model_file_path"])
else:
mace_check(False, "MACE does not support platform %s yet." % platform)
......
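The new PyTorch path consumes TorchScript files; the validators added further down load them with ``torch.jit.load``. A hedged sketch of producing such a file (the torchvision model and file name are hypothetical, not from this commit):

.. code-block:: python

    import torch
    import torchvision

    # Trace an eval-mode model into TorchScript; tracing needs a sample
    # input with the shape the converter will be configured with (NCHW).
    model = torchvision.models.mobilenet_v2(pretrained=True).eval()
    example = torch.randn(1, 3, 224, 224)
    traced = torch.jit.trace(model, example)
    traced.save("mobilenet_v2.pt")  # point model_file_path at this file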
......@@ -90,6 +90,7 @@ class FrameworkType(Enum):
ONNX = 2
MEGENGINE = 3
KERAS = 4
PYTORCH = 5
MaceSupportedOps = [
......
......@@ -604,8 +604,8 @@ class OnnxConverter(base_converter.ConverterInterface):
for output in node.outputs:
op.output.append(output)
if with_shape:
output_shape = op.output_shape.add()
if output in self._graph_shapes_dict:
output_shape = op.output_shape.add()
shape_info = self._graph_shapes_dict[output]
output_shape.dims.extend(shape_info)
......@@ -950,7 +950,8 @@ class OnnxConverter(base_converter.ConverterInterface):
node.inputs[0] not in self._consts:
const_name = node.inputs[1]
const_tensor = self._consts[const_name]
if len(const_tensor.dims) == 0:
dims = const_tensor.dims
if len(dims) == 0 or (len(dims) == 1 and dims[0] == 1):
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
if const_tensor.data_type == mace_pb2.DT_INT32:
......@@ -970,7 +971,8 @@ class OnnxConverter(base_converter.ConverterInterface):
node.inputs[1] not in self._consts:
const_name = node.inputs[0]
const_tensor = self._consts[const_name]
if len(const_tensor.dims) == 0:
dims = const_tensor.dims
if len(dims) == 0 or (len(dims) == 1 and dims[0] == 1):
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
if const_tensor.data_type == mace_pb2.DT_INT32:
......
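Both hunks above widen the scalar test from rank-0 tensors to one-element rank-1 tensors. In plain Python terms (a sketch, not repo code):

.. code-block:: python

    import numpy as np

    def is_scalar_like(dims):
        # dims == [] (rank 0) or dims == [1] both fold into the
        # elementwise op as a scalar argument rather than a tensor input
        return len(dims) == 0 or (len(dims) == 1 and dims[0] == 1)

    assert is_scalar_like(list(np.array(2.0).shape))    # rank 0: []
    assert is_scalar_like(list(np.array([2.0]).shape))  # one element: [1]
    assert not is_scalar_like([2])                      # a real 2-vector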
This diff is collapsed.
......@@ -587,33 +587,38 @@ class TensorflowConverter(base_converter.ConverterInterface):
EltwiseType.SUM, EltwiseType.PROD,
EltwiseType.MAX, EltwiseType.MIN]
if (len(tf_op.inputs) > 1 and
len(self.infer_tensor_shape(tf_op.inputs[1])) == 0 and
tf_op.inputs[1].op.type == TFOpType.Const.name):
scalar = tf_op.inputs[1].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
value_arg.f = scalar
self._skip_tensor.add(tf_op.inputs[1].name)
value_index_arg = op.arg.add()
value_index_arg.name = \
MaceKeyword.mace_scalar_input_index_str
value_index_arg.i = 1
self._skip_tensor.add(tf_op.inputs[1].name)
del op.input[1]
elif len(self.infer_tensor_shape(tf_op.inputs[0])) == 0 and \
tf_op.inputs[0].op.type == TFOpType.Const.name and \
is_commutative(type_arg.i):
scalar = tf_op.inputs[0].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
value_arg.f = scalar
value_index_arg = op.arg.add()
value_index_arg.name = \
MaceKeyword.mace_scalar_input_index_str
value_index_arg.i = 0
self._skip_tensor.add(tf_op.inputs[0].name)
del op.input[0]
if len(tf_op.inputs) > 1:
shape = self.infer_tensor_shape(tf_op.inputs[1])
if (len(shape) == 0 or
(len(shape) == 1 and shape[0] == 1)) and \
tf_op.inputs[1].op.type == TFOpType.Const.name:
scalar = tf_op.inputs[1].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
value_arg.f = scalar
self._skip_tensor.add(tf_op.inputs[1].name)
value_index_arg = op.arg.add()
value_index_arg.name = \
MaceKeyword.mace_scalar_input_index_str
value_index_arg.i = 1
self._skip_tensor.add(tf_op.inputs[1].name)
del op.input[1]
else:
shape = self.infer_tensor_shape(tf_op.inputs[0])
if (len(shape) == 0 or
(len(shape) == 1 and shape[0] == 1)) and \
is_commutative(type_arg.i) and \
tf_op.inputs[0].op.type == TFOpType.Const.name:
scalar = tf_op.inputs[0].eval().astype(np.float32)
value_arg = op.arg.add()
value_arg.name = MaceKeyword.mace_scalar_input_str
value_arg.f = scalar
value_index_arg = op.arg.add()
value_index_arg.name = \
MaceKeyword.mace_scalar_input_index_str
value_index_arg.i = 0
self._skip_tensor.add(tf_op.inputs[0].name)
del op.input[0]
except tf.errors.InvalidArgumentError:
pass
......
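Note the ``is_commutative`` guard on the input[0] branch above: per the list at the top of the hunk, only SUM/PROD/MAX/MIN qualify, since for those the scalar can sit on either side without changing the result. A one-line illustration:

.. code-block:: python

    x = 5.0
    assert 2.0 + x == x + 2.0  # SUM is commutative: either side may be folded
    assert 2.0 - x != x - 2.0  # SUB is not: a left-hand scalar can't be folded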
......@@ -350,6 +350,7 @@ class Transformer(base_converter.ConverterInterface):
input_info.dims.extend(input_node.shape)
input_info.data_type = input_node.data_type
# tools/python/convert.py sets option.check_nodes
output_nodes = self._option.check_nodes.values()
for output_node in output_nodes:
output_info = net.output_info.add()
......@@ -872,7 +873,9 @@ class Transformer(base_converter.ConverterInterface):
if (((op.type == MaceOp.Conv2D.name
or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name
or op.type == MaceOp.MatMul.name)
or (op.type == MaceOp.MatMul.name
and self._option.device == DeviceType.CPU.value
and not self._option.quantize))
and len(op.input) == 2)
or (op.type == MaceOp.Deconv2D.name
and ((ConverterUtil.get_arg(
......@@ -1321,12 +1324,18 @@ class Transformer(base_converter.ConverterInterface):
for op in net.op:
# transform `input(4D) -> reshape(2D) -> matmul` to `fc(2D)`
# fc output is 2D in transformer, using as 4D in op kernel
# work for TensorFlow
# work for TensorFlow/PyTorch/ONNX
framework = ConverterUtil.get_arg(
op, MaceKeyword.mace_framework_type_str).i
is_torch = framework == FrameworkType.PYTORCH.value
is_tf = framework == FrameworkType.TENSORFLOW.value
is_onnx = framework == FrameworkType.ONNX.value
if op.type == MaceOp.Reshape.name and \
len(op.input) == 2 and \
op.input[1] in self._consts and \
len(op.output_shape[0].dims) == 2 and \
filter_format == DataFormat.HWIO and \
(is_tf or is_torch or is_onnx) and \
op.input[0] in self._producer:
input_op = self._producer[op.input[0]]
input_shape = input_op.output_shape[0].dims
......@@ -1341,8 +1350,13 @@ class Transformer(base_converter.ConverterInterface):
is_fc = False
else:
weight = self._consts[matmul_op.input[1]]
if len(weight.dims) != 2 or \
weight.dims[0] != op.output_shape[0].dims[1]:
od = op.output_shape[0].dims
wd = weight.dims
if len(wd) != 2:
is_fc = False
# tf fc weight: IO; onnx/pytorch fc weight: OI
if (is_tf and wd[0] != od[1]) or \
((is_torch or is_onnx) and wd[1] != od[1]):
is_fc = False
if is_fc:
print('convert reshape and matmul to fc')
......@@ -1353,24 +1367,40 @@ class Transformer(base_converter.ConverterInterface):
matmul_op.type = MaceOp.FullyConnected.name
weight_data = np.array(weight.float_data).reshape(
weight.dims)
weight.dims[:] = input_shape[1:] + \
[weight_data.shape[1]]
if is_tf:
weight.dims[:] = input_shape[1:] + \
[weight_data.shape[1]]
if is_torch or is_onnx:
in_data_format = ConverterUtil.data_format(
input_op)
# OI+NCHW[2:]=OIHW
if in_data_format == DataFormat.NCHW:
weight.dims.extend(input_shape[2:])
# OI+NHWC[1:3]=OIHW
else:
weight.dims.extend(input_shape[1:3])
return True
# transform `fc1(2D) -> matmul` to `fc1(2D) -> fc1(2D)`
if op.type == MaceOp.MatMul.name and \
filter_format == DataFormat.HWIO and \
(is_tf or is_torch or is_onnx) and \
op.input[1] in self._consts:
producer = self._producer[op.input[0]]
weight = self._consts[op.input[1]]
if len(weight.dims) == 2 and self.is_after_fc(op) and \
len(producer.output_shape[0].dims) == 2 and \
weight.dims[0] == producer.output_shape[0].dims[1]:
((is_tf and weight.dims[0] == producer.output_shape[0].dims[1]) or # noqa
(is_torch and weight.dims[1] == producer.output_shape[0].dims[1]) or # noqa
(is_onnx and weight.dims[1] == producer.output_shape[0].dims[1])): # noqa
six.print_('convert matmul to fc')
op.type = MaceOp.FullyConnected.name
weight_data = np.array(weight.float_data).reshape(
weight.dims)
weight.dims[:] = [1, 1] + list(weight_data.shape)
# only 1 of the 2 branches can be executed
if is_tf:
weight.dims[:] = [1, 1] + list(weight_data.shape)
if is_torch or is_onnx:
weight.dims.extend([1, 1])
return True
if self._option.device == DeviceType.APU.value:
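The dimension checks above differ by framework because TensorFlow stores a fully-connected weight as (in, out) while PyTorch/ONNX store (out, in), as the ``IO``/``OI`` comment earlier notes. A numpy illustration (not repo code):

.. code-block:: python

    import numpy as np

    x = np.random.rand(1, 8).astype(np.float32)      # 2-D input: [batch, in]

    w_tf = np.random.rand(8, 16).astype(np.float32)  # TensorFlow MatMul: (in, out), "IO"
    w_pt = np.random.rand(16, 8).astype(np.float32)  # PyTorch/ONNX Gemm: (out, in), "OI"

    y_tf = x @ w_tf    # (1, 16)
    y_pt = x @ w_pt.T  # (1, 16): same math, transposed storage

    assert w_tf.shape[0] == x.shape[1]  # the wd[0] check for TF
    assert w_pt.shape[1] == x.shape[1]  # the wd[1] check for PyTorch/ONNX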
......@@ -2293,7 +2323,7 @@ class Transformer(base_converter.ConverterInterface):
dim_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_dim_str)
shape_tensor = None
if len(op.input) == 1:
print("Transform Caffe Reshape")
print("Transform Caffe or PyTorch Reshape")
dims = []
axis_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str)
# transform caffe reshape op
......
......@@ -152,6 +152,7 @@ class Platform(Enum):
ONNX = 2
MEGENGINE = 3
KERAS = 4
PYTORCH = 5
def parse_platform(str):
......
......@@ -51,8 +51,8 @@ def execute(cmd, verbose=True):
print(line)
buf.append(line)
for l in p.stdout:
line = l.strip()
for li in p.stdout:
line = li.strip()
if verbose:
print(line)
buf.append(line)
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import os
import sys
import os.path
import numpy as np
import six
......@@ -204,6 +205,48 @@ def validate_tf_model(model_file,
validation_threshold, log_file)
def validate_pytorch_model(model_file,
input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types, log_file):
import torch
loaded_model = torch.jit.load(model_file)
pytorch_inputs = []
for i in range(len(input_names)):
input_value = load_data(
util.formatted_file_name(input_file, input_names[i]),
input_data_types[i])
input_value = input_value.reshape(input_shapes[i])
if input_data_formats[i] == DataFormat.NHWC and \
len(input_shapes[i]) == 4:
input_value = input_value.transpose((0, 3, 1, 2))
input_value = torch.from_numpy(input_value)
pytorch_inputs.append(input_value)
with torch.no_grad():
pytorch_outputs = loaded_model(*pytorch_inputs)
if isinstance(pytorch_outputs, torch.Tensor):
pytorch_outputs = [pytorch_outputs]
else:
if not isinstance(pytorch_outputs, (list, tuple)):
print('return type {} unsupported'.format(type(pytorch_outputs)))
sys.exit(1)
for i in range(len(output_names)):
value = pytorch_outputs[i].numpy()
output_file_name = util.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
# MACE output files load as flat (1-D) buffers;
# PyTorch outputs are NCHW, so reshape + transpose is needed
if output_data_formats[i] == DataFormat.NHWC and \
len(output_shapes[i]) == 4:
mace_out_value = mace_out_value.reshape(output_shapes[i])\
.transpose((0, 3, 1, 2))
compare_output(output_names[i], mace_out_value,
value, validation_threshold, log_file)
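The reshape-then-transpose above converts MACE's flat output buffer into PyTorch's layout before comparison. Spelled out with toy shapes (illustrative only):

.. code-block:: python

    import numpy as np

    flat = np.arange(24, dtype=np.float32)  # load_data returns a 1-D buffer
    nhwc = flat.reshape((1, 2, 3, 4))       # the declared output shape: N, H, W, C
    nchw = nhwc.transpose((0, 3, 1, 2))     # PyTorch layout: N, C, H, W
    assert nchw.shape == (1, 4, 2, 3)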
def validate_caffe_model(model_file, input_file,
mace_out_file, weight_file,
input_names, input_shapes, input_data_formats,
......@@ -387,6 +430,12 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
output_node, output_shape, output_data_format,
validation_threshold, input_data_type,
log_file)
elif platform == Platform.PYTORCH:
validate_pytorch_model(model_file, input_file, mace_out_file,
input_node, input_shape, input_data_format,
output_node, output_shape, output_data_format,
validation_threshold, input_data_type,
log_file)
elif platform == Platform.CAFFE:
validate_caffe_model(model_file,
input_file, mace_out_file, weight_file,
......
......@@ -53,7 +53,8 @@ def strip_invalid_utf8(str):
def split_stdout(stdout_str):
stdout_str = strip_invalid_utf8(stdout_str)
# Filter out last empty line
return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0]
return [line.strip() for line in stdout_str.split('\n') if
len(line.strip()) > 0]
def make_output_processor(buff):
......@@ -659,7 +660,7 @@ def validate_model(abi,
sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name))
device.pull_from_data_dir(formatted_name, model_output_dir)
if platform == "tensorflow" or platform == "onnx":
if platform == "tensorflow" or platform == "onnx" or platform == "pytorch":
validate(platform, model_file_path, "",
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name), device_type,
......
......@@ -216,6 +216,48 @@ def validate_tf_model(platform, device_type, model_file,
validation_threshold, log_file)
def validate_pytorch_model(platform, device_type, model_file,
input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types, log_file):
import torch
loaded_model = torch.jit.load(model_file)
pytorch_inputs = []
for i in range(len(input_names)):
input_value = load_data(
common.formatted_file_name(input_file, input_names[i]),
input_data_types[i])
input_value = input_value.reshape(input_shapes[i])
if input_data_formats[i] == common.DataFormat.NHWC and \
len(input_shapes[i]) == 4:
input_value = input_value.transpose((0, 3, 1, 2))
input_value = torch.from_numpy(input_value)
pytorch_inputs.append(input_value)
with torch.no_grad():
pytorch_outputs = loaded_model(*pytorch_inputs)
if isinstance(pytorch_outputs, torch.Tensor):
pytorch_outputs = [pytorch_outputs]
else:
if not isinstance(pytorch_outputs, (list, tuple)):
print('return type {} unsupported yet'.format(
type(pytorch_outputs)))
sys.exit(1)
for i in range(len(output_names)):
value = pytorch_outputs[i].numpy()
output_file_name = common.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
# MACE: NHWC, pytorch: NCHW, conversion is needed
if output_data_formats[i] == common.DataFormat.NHWC and \
len(output_shapes[i]) == 4:
mace_out_value = mace_out_value.reshape(output_shapes[i])\
.transpose((0, 3, 1, 2))
compare_output(platform, device_type, output_names[i], mace_out_value,
value, validation_threshold, log_file)
def validate_caffe_model(platform, device_type, model_file, input_file,
mace_out_file, weight_file,
input_names, input_shapes, input_data_formats,
......@@ -418,6 +460,13 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types,
log_file)
elif platform == 'pytorch':
validate_pytorch_model(platform, device_type,
model_file, input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes,
output_data_formats, validation_threshold,
input_data_types, log_file)
elif platform == 'caffe':
validate_caffe_model(platform, device_type, model_file,
input_file, mace_out_file, weight_file,
......