Merge branch 'fix_cmake' into 'master'

opt: reduce the so size for bazel and fix error in cmake compile See merge request applied-machine-learning/sysml/mace!1307

Merge branch 'fix_cmake' into 'master'
opt: reduce the so size for bazel and fix error in cmake compile See merge request applied-machine-learning/sysml/mace!1307
37cbf203 · 李滨 · 701e86ca · a11195a7 · 37cbf203 · 37cbf203
16 changed files
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -42,16 +42,16 @@ build_docs:
 cmake_build_android-armeabi-v7a:
  stage: build
  script:
-    - RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
+    - RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh
    - LIBMACE32_FULL_SIZE=`stat -c%s build/cmake-build/armeabi-v7a/install/lib/libmace.so`
-    - if (( LIBMACE32_FULL_SIZE > 2200000 )) ; then echo "The libmace.so size too large"; exit 1; fi
+    - if (( LIBMACE32_FULL_SIZE > 1400000 )) ; then echo "The libmace.so size too large"; exit 1; fi

 cmake_build_android-arm64-v8:
  stage: build
  script:
-    - RUNTIME=GPU bash tools/cmake/cmake-build-arm64-v8a.sh
+    - RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-arm64-v8a.sh
    - LIBMACE64_FULL_SIZE=`stat -c%s build/cmake-build/arm64-v8a/install/lib/libmace.so`
-    - if (( LIBMACE64_FULL_SIZE > 3100000 )) ; then echo "The libmace.so size too large"; exit 1; fi
+    - if (( LIBMACE64_FULL_SIZE > 2300000 )) ; then echo "The libmace.so size too large"; exit 1; fi

 bazel_build:
  stage: build
@@ -59,10 +59,14 @@ bazel_build:
    - bash tools/bazel_build_standalone_lib.sh
    - bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu
    - bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu
+    - LIBMACE32_FULL_SIZE=`stat -c%s build/lib/armeabi-v7a/libmace.so`
+    - if (( LIBMACE32_FULL_SIZE > 1400000 )) ; then echo "The libmace.so size too large"; exit 1; fi
    - bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu,dsp
    - bash tools/bazel_build_standalone_lib.sh --abi=armeabi-v7a --runtimes=cpu,gpu,apu
    - bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu
    - bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu
+    - LIBMACE64_FULL_SIZE=`stat -c%s build/lib/arm64-v8a/libmace.so`
+    - if (( LIBMACE64_FULL_SIZE > 2300000 )) ; then echo "The libmace.so size too large"; exit 1; fi
    - bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu,dsp
    - bash tools/bazel_build_standalone_lib.sh --abi=arm64-v8a --runtimes=cpu,gpu,apu
    - bash tools/bazel_build_standalone_lib.sh --abi=arm_linux_gnueabihf --runtimes=cpu

--- a/docs/user_guide/advanced_usage_cmake.rst
+++ b/docs/user_guide/advanced_usage_cmake.rst
@@ -175,7 +175,7 @@ After that you can rebuild the engine.
    
    .. code-block:: bash

-        RUNTIME=GPU RUNMODE=code bash tools/cmake/cmake-build-armeabi-v7a.sh
+        RUNTIME=GPU RUNMODE=code QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh

 ``RUNMODE=code`` means you compile and link model library with MACE engine.


--- a/docs/user_guide/basic_usage.rst
+++ b/docs/user_guide/basic_usage.rst
@@ -45,7 +45,7 @@ Here we use the mobilenet-v2 model as an example.
        cd path/to/mace
        # Build library
        # output lib path: build/lib
-        bash tools/bazel_build_standalone_lib.sh [-abi=abi][-runtimes=rt1,rt2,...][-static]
+        bash tools/bazel_build_standalone_lib.sh [-abi=abi][-runtimes=rt1,rt2,...][-quantize][-bfloat16][-static]

    .. note::


--- a/docs/user_guide/basic_usage_cmake.rst
+++ b/docs/user_guide/basic_usage_cmake.rst
@@ -20,7 +20,7 @@ Please make sure you have CMake installed.

    .. code-block:: sh

-        RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
+        RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh

 which generate libraries in ``build/cmake-build/armeabi-v7a``, you can use either static libraries or the ``libmace.so`` shared library.


--- a/docs/zh/user_guide/basic_usage.rst
+++ b/docs/zh/user_guide/basic_usage.rst
@@ -20,7 +20,7 @@

    .. code-block:: sh

-        RUNTIME=GPU bash tools/cmake/cmake-build-armeabi-v7a.sh
+        RUNTIME=GPU QUANTIZE=OFF bash tools/cmake/cmake-build-armeabi-v7a.sh

 编译安装位置为 ``build/cmake-build/armeabi-v7a``, 可以使用 libmace 静态库或者动态库。


--- a/mace/core/runtime/opencl/cl2_header.h
+++ b/mace/core/runtime/opencl/cl2_header.h
@@ -37,7 +37,7 @@
 #pragma GCC diagnostic ignored "-Wignored-attributes"
 #endif  // MACE_OS_MAC

-#include "CL/cl2.hpp"
+#include <CL/cl2.hpp>

 #ifdef MACE_OS_MAC
 #pragma GCC diagnostic pop

--- a/mace/ops/arm/base/common_neon.h
+++ b/mace/ops/arm/base/common_neon.h
@@ -144,7 +144,7 @@ inline void vst3q(float *ptr, float32x4x3_t v) {
 }

 inline float32x8_t vld1o(float *ptr) {
-  return {vld1q_f32(ptr), vld1q_f32(ptr + 4)};
+  return {{vld1q_f32(ptr), vld1q_f32(ptr + 4)}};
 }

 inline void vst1o(float *ptr, float32x8_t v) {
@@ -209,8 +209,8 @@ inline float32x4_t vld1q(const BFloat16 *ptr) {
 // load of 2 4D vectors and perform de-interleaving
 inline float32x4x2_t vld2q_bf16(const uint16_t *ptr) {
  uint16x4x2_t u = vld2_u16(ptr);
-  return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
-          vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16))};
+  return {{vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
+           vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16))}};
 }

 inline float32x4x2_t vld2q_bf16(const BFloat16 *ptr) {
@@ -228,9 +228,9 @@ inline float32x4x2_t vld2q(const BFloat16 *ptr) {
 // load of 3 4D vectors and perform de-interleaving
 inline float32x4x3_t vld3q_bf16(const uint16_t *ptr) {
  uint16x4x3_t u = vld3_u16(ptr);
-  return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
-          vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16)),
-          vreinterpretq_f32_u32(vshll_n_u16(u.val[2], 16))};
+  return {{vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
+           vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16)),
+           vreinterpretq_f32_u32(vshll_n_u16(u.val[2], 16))}};
 }

 inline float32x4x3_t vld3q_bf16(const BFloat16 *ptr) {
@@ -264,8 +264,8 @@ inline void vst1q(BFloat16 *ptr, const float32x4_t v) {

 // store of 2 4D vectors and perform interleaving
 inline void vst2q_bf16(uint16_t *ptr, const float32x4x2_t v) {
-  uint16x4x2_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
-                    vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)};
+  uint16x4x2_t u = {{vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
+                     vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)}};
  vst2_u16(ptr, u);
 }

@@ -283,9 +283,9 @@ inline void vst2q(BFloat16 *ptr, const float32x4x2_t v) {

 // store of 3 4D vectors and perform interleaving
 inline void vst3q_bf16(uint16_t *ptr, const float32x4x3_t v) {
-  uint16x4x3_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
-                    vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
-                    vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16)};
+  uint16x4x3_t u = {{vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
+                     vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16),
+                     vshrn_n_u32(vreinterpretq_u32_f32(v.val[2]), 16)}};
  vst3_u16(ptr, u);
 }

@@ -304,8 +304,8 @@ inline void vst3q(BFloat16 *ptr, const float32x4x3_t v) {
 // load of 8D vector
 inline float32x8_t vld1o_bf16(const uint16_t *ptr) {
  uint16x8_t u = vld1q_u16(ptr);
-  return {vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(u), 16)),
-          vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(u), 16))};
+  return {{vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(u), 16)),
+           vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(u), 16))}};
 }

 inline float32x8_t vld1o_bf16(const BFloat16 *ptr) {

--- a/test/ccunit/mace/ops/depthwise_conv2d_test.cc
+++ b/test/ccunit/mace/ops/depthwise_conv2d_test.cc
@@ -492,6 +492,7 @@ TEST_F(DepthwiseConv2dOpTest, Quant) {
  TestQuant(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2});
 }

+#ifdef MACE_ENABLE_BFLOAT16
 namespace {
 void TestBFloat16(const index_t batch,
                  const index_t multiplier,
@@ -557,6 +558,8 @@ TEST_F(DepthwiseConv2dOpTest, BFloat16) {
  TestBFloat16(3, 1, 128, 56, 56, 3, 3, SAME, {2, 2});
 }

+#endif  // MACE_ENABLE_BFLOAT16
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
--- a/third_party/opencl-clhpp/opencl-clhpp.BUILD
+++ b/third_party/opencl-clhpp/opencl-clhpp.BUILD
@@ -12,9 +12,12 @@ genrule(
    cmd = "workdir=$$(mktemp -d -t opencl-clhpp-build.XXXXXXXXXX); cp -aL $$(dirname $(location CMakeLists.txt))/* $$workdir; pushd $$workdir; mkdir build; pushd build; cmake ../ -DBUILD_DOCS=OFF -DBUILD_EXAMPLES=OFF -DBUILD_TESTS=OFF; make generate_clhpp generate_cl2hpp; popd; popd; cp -a $$workdir/build/* $(@D); rm -rf $$workdir; echo installing to  $(@D)",
 )

+# `srcs` is not used by the C++ code, but it is needed to trigger the `genrule`,
+# so "include/CL/cl.hpp" and "include/CL/cl2.hpp" are listed in `srcs`; these
+# two files are made available through `includes` instead of `srcs`.
 cc_library(
    name = "opencl_clhpp",
-    hdrs = ["include/CL/cl.hpp", "include/CL/cl2.hpp"],
-    strip_include_prefix = "include",
+    includes = ["include"],
+    srcs = ["include/CL/cl.hpp", "include/CL/cl2.hpp"],
    visibility = ["//visibility:public"],
 )
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -10,6 +10,7 @@ build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
 build --copt=-DMACE_OBFUSCATE_LITERALS
 build --copt=-DGEMMLOWP_USE_MACE_THREAD_POOL
 build --copt=-DMACE_DEPTHWISE_U8_USE_MULTI_THREAD
+build --copt=-O2

 # Usage example: bazel build --config android
 build:android --define linux_base=true
@@ -20,6 +21,7 @@ build:android --linkopt=-lm
 build:android --distinct_host_configuration=true
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android --copt=-Oz
 build:android --copt -Wall
 build:android --copt -Wno-mismatched-tags
 build:android --copt -Wno-missing-braces
@@ -75,20 +77,27 @@ build:aarch64_linux_gnu --copt -Wno-implicit-fallthrough

 # Usage example: bazel build --config optimization
 build:optimization -c opt
-build:optimization --copt=-O3
 build:optimization --linkopt=-Wl,--strip-all
 build:optimization --copt=-ffunction-sections
 build:optimization --copt=-fdata-sections
 build:optimization --linkopt=-Wl,--gc-sections
+build:optimization --copt=-fno-rtti
+build:optimization --copt=-fno-exceptions
+build:optimization --copt=-DGOOGLE_PROTOBUF_NO_RTTI
+build:optimization --copt=-DPROTOBUF_USE_EXCEPTIONS=0

 # Usage example: bazel build --config optimization_darwin
-build:optimization_darwin --copt=-O3
 build:optimization_darwin --copt=-ffunction-sections
 build:optimization_darwin --copt=-fdata-sections
 build:optimization_darwin --linkopt=-Wl,-dead_strip
+build:optimization_darwin --copt=-fno-rtti
+build:optimization_darwin --copt=-fno-exceptions
+build:optimization_darwin --copt=-DGOOGLE_PROTOBUF_NO_RTTI
+build:optimization_darwin --copt=-DPROTOBUF_USE_EXCEPTIONS=0

 # Usage example: bazel build --config symbol_hidden
 build:symbol_hidden --copt=-fvisibility=hidden
+build:symbol_hidden --copt=-fvisibility-inlines-hidden

 # Usage example: bazel build --config debug
 build:debug -c dbg

--- a/tools/bazel_build_standalone_lib.sh
+++ b/tools/bazel_build_standalone_lib.sh
@@ -40,8 +40,8 @@ enable_cpu=true
 enable_gpu=false
 enable_dsp=false
 enable_apu=false
-enable_quantize=true
-enable_bfloat16=true
+enable_quantize=false
+enable_bfloat16=false
 enable_rpcmem=true
 static_lib=false
 symbol_hidden=
@@ -97,6 +97,12 @@ for opt in "${@}";do
    static|-static|--static)
      static_lib=true
      ;;
+    quantize|-quantize|--quantize)
+      enable_quantize=true
+      ;;
+    bfloat16|-bfloat16|--bfloat16)
+      enable_bfloat16=true
+      ;;
    help|-help|--help)
      helper
      ;;

--- a/tools/cmake/cmake-build-aarch64-linux-gnu.sh
+++ b/tools/cmake/cmake-build-aarch64-linux-gnu.sh
@@ -17,14 +17,24 @@ if [[ "$RUNMODE" == "code" ]]; then
    MACE_ENABLE_CODE_MODE=ON
 fi

+MACE_ENABLE_QUANTIZE=OFF
+if [[ "$QUANTIZE" == "ON" ]]; then
+    MACE_ENABLE_QUANTIZE=ON
+fi
+
+MACE_ENABLE_BFLOAT16=OFF
+if [[ "$BFLOAT16" == "ON" ]]; then
+    MACE_ENABLE_BFLOAT16=ON
+fi
+
 mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 cmake -DCROSSTOOL_ROOT=${LINARO_AARCH64_LINUX_GNU} \
      -DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/aarch64-linux-gnu.cmake \
      -DCMAKE_BUILD_TYPE=Release          \
      -DMACE_ENABLE_NEON=ON               \
-      -DMACE_ENABLE_QUANTIZE=ON           \
+      -DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE}         \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL}             \
-      -DMACE_ENABLE_BFLOAT16=ON           \
+      -DMACE_ENABLE_BFLOAT16=${MACE_ENABLE_BFLOAT16}         \
      -DMACE_ENABLE_OPT_SIZE=ON           \
      -DMACE_ENABLE_OBFUSCATE=ON          \
      -DMACE_ENABLE_TESTS=ON              \

--- a/tools/cmake/cmake-build-arm-linux-gnueabihf.sh
+++ b/tools/cmake/cmake-build-arm-linux-gnueabihf.sh
@@ -17,19 +17,29 @@ if [[ "$RUNTIME" == "GPU" ]]; then
    MACE_ENABLE_OPENCL=ON
 fi

+MACE_ENABLE_QUANTIZE=OFF
+if [[ "$QUANTIZE" == "ON" ]]; then
+    MACE_ENABLE_QUANTIZE=ON
+fi
+
+MACE_ENABLE_BFLOAT16=OFF
+if [[ "$BFLOAT16" == "ON" ]]; then
+    MACE_ENABLE_BFLOAT16=ON
+fi
+
 mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 cmake -DCROSSTOOL_ROOT=${LINARO_ARM_LINUX_GNUEABIHF} \
      -DCMAKE_TOOLCHAIN_FILE=./cmake/toolchains/arm-linux-gnueabihf.cmake \
      -DCMAKE_BUILD_TYPE=Release          \
      -DMACE_ENABLE_NEON=ON               \
-      -DMACE_ENABLE_QUANTIZE=ON           \
-      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL}              \
-      -DMACE_ENABLE_BFLOAT16=ON           \
+      -DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE}         \
+      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL}             \
+      -DMACE_ENABLE_BFLOAT16=${MACE_ENABLE_BFLOAT16}         \
      -DMACE_ENABLE_OPT_SIZE=ON           \
      -DMACE_ENABLE_OBFUSCATE=ON          \
      -DMACE_ENABLE_TESTS=ON              \
      -DMACE_ENABLE_BENCHMARKS=ON         \
-      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE}        \
+      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE}       \
      -DCMAKE_INSTALL_PREFIX=install      \
      ../../..
 make -j$(nproc) VERBOSE=1 && make install

--- a/tools/cmake/cmake-build-arm64-v8a.sh
+++ b/tools/cmake/cmake-build-arm64-v8a.sh
@@ -26,6 +26,16 @@ if [[ "$RUNMODE" == "code" ]]; then
    MACE_ENABLE_CODE_MODE=ON
 fi

+MACE_ENABLE_QUANTIZE=OFF
+if [[ "$QUANTIZE" == "ON" ]]; then
+    MACE_ENABLE_QUANTIZE=ON
+fi
+
+MACE_ENABLE_BFLOAT16=OFF
+if [[ "$BFLOAT16" == "ON" ]]; then
+    MACE_ENABLE_BFLOAT16=ON
+fi
+
 mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 cmake -DANDROID_ABI="arm64-v8a" \
      -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake \
@@ -33,12 +43,12 @@ cmake -DANDROID_ABI="arm64-v8a" \
      -DCMAKE_BUILD_TYPE=Release          \
      -DANDROID_STL=c++_shared            \
      -DMACE_ENABLE_NEON=ON               \
-      -DMACE_ENABLE_QUANTIZE=ON           \
+      -DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE}         \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL}             \
      -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP}   \
      -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA}   \
      -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU}           \
-      -DMACE_ENABLE_BFLOAT16=ON           \
+      -DMACE_ENABLE_BFLOAT16=${MACE_ENABLE_BFLOAT16}         \
      -DMACE_ENABLE_OPT_SIZE=ON           \
      -DMACE_ENABLE_OBFUSCATE=ON          \
      -DMACE_ENABLE_TESTS=ON              \

--- a/tools/cmake/cmake-build-armeabi-v7a.sh
+++ b/tools/cmake/cmake-build-armeabi-v7a.sh
@@ -27,6 +27,16 @@ if [[ "$RUNMODE" == "code" ]]; then
    MACE_ENABLE_CODE_MODE=ON
 fi

+MACE_ENABLE_QUANTIZE=OFF
+if [[ "$QUANTIZE" == "ON" ]]; then
+    MACE_ENABLE_QUANTIZE=ON
+fi
+
+MACE_ENABLE_BFLOAT16=OFF
+if [[ "$BFLOAT16" == "ON" ]]; then
+    MACE_ENABLE_BFLOAT16=ON
+fi
+
 mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 cmake -DANDROID_ABI="armeabi-v7a" \
      -DANDROID_ARM_NEON=ON \
@@ -35,12 +45,12 @@ cmake -DANDROID_ABI="armeabi-v7a" \
      -DCMAKE_BUILD_TYPE=Release                             \
      -DANDROID_STL=c++_shared                               \
      -DMACE_ENABLE_NEON=ON                                  \
-      -DMACE_ENABLE_QUANTIZE=ON                              \
+      -DMACE_ENABLE_QUANTIZE=${MACE_ENABLE_QUANTIZE}         \
      -DMACE_ENABLE_OPENCL=${MACE_ENABLE_OPENCL}             \
      -DMACE_ENABLE_HEXAGON_DSP=${MACE_ENABLE_HEXAGON_DSP}   \
      -DMACE_ENABLE_HEXAGON_HTA=${MACE_ENABLE_HEXAGON_HTA}   \
      -DMACE_ENABLE_MTK_APU=${MACE_ENABLE_MTK_APU}           \
-      -DMACE_ENABLE_BFLOAT16=ON                              \
+      -DMACE_ENABLE_BFLOAT16=${MACE_ENABLE_BFLOAT16}         \
      -DMACE_ENABLE_OPT_SIZE=ON                              \
      -DMACE_ENABLE_OBFUSCATE=ON                             \
      -DMACE_ENABLE_TESTS=ON                                 \

--- a/tools/cmake/cmake-build-host.sh
+++ b/tools/cmake/cmake-build-host.sh
@@ -13,12 +13,16 @@ if [[ "$RUNMODE" == "code" ]]; then
    MACE_ENABLE_CODE_MODE=ON
 fi

+MACE_ENABLE_BFLOAT16=OFF
+if [[ "$BFLOAT16" == "ON" ]]; then
+    MACE_ENABLE_BFLOAT16=ON
+fi

 mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
 cmake -DMACE_ENABLE_NEON=OFF         \
      -DMACE_ENABLE_QUANTIZE=OFF     \
      -DMACE_ENABLE_OPENCL=OFF       \
-      -DMACE_ENABLE_BFLOAT16=ON      \
+      -DMACE_ENABLE_BFLOAT16=${MACE_ENABLE_BFLOAT16}      \
      -DMACE_ENABLE_TESTS=ON         \
      -DMACE_ENABLE_BENCHMARKS=ON    \
      -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE}    \