[arm] fix clang v7 bug (#3118)

* set arm_lang default is off. test=develop * fix resize error, test-develop

[arm] fix clang v7 bug (#3118)
* set arm_lang default is off. test=develop * fix resize error, test-develop
1bd34c3f · HappyAngel · GitHub · f427eb18 · 1bd34c3f · 1bd34c3f
8 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,6 +76,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
 lite_option(LITE_WITH_CV  "Enable build cv image in lite" OFF)
 lite_option(LITE_WITH_STATIC_CUDA  "Statically link cuda libraries." ON)
+lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)

 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
 if(ANDROID OR IOS OR ARMLINUX)

--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -12,6 +12,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
+message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}")

 set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
 set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})

--- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
@@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0,
                       "x5",
                       "x6",
                       "x7");
+#else
+#ifdef LITE_WITH_ARM_CLANG
 #else
        asm volatile(COMPUTE RELU STORE
                     : [r0] "+r"(inr0),
@@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0,
                       "r3",
                       "r4",
                       "r5");
+#endif
 #endif
        break;
      case lite_api::ActivationType::kRelu6:
@@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0,
                       "x5",
                       "x6",
                       "x7");
+#else
+#ifdef LITE_WITH_ARM_CLANG
 #else
        asm volatile(COMPUTE RELU RELU6 STORE
                     : [r0] "+r"(inr0),
@@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0,
                       "r3",
                       "r4",
                       "r5");
+#endif
 #endif
        break;
      case lite_api::ActivationType::kLeakyRelu:
@@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0,
                       "x5",
                       "x6",
                       "x7");
+#else
+#ifdef LITE_WITH_ARM_CLANG
 #else
        asm volatile(COMPUTE LEAKY_RELU STORE
                     : [r0] "+r"(inr0),
@@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0,
                       "r3",
                       "r4",
                       "r5");
+#endif
 #endif
        break;
      default:
@@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0,
                   "x5",
                   "x6",
                   "x7");
+#else
+#ifdef LITE_WITH_ARM_CLANG
 #else
    asm volatile(COMPUTE STORE
                 : [r0] "+r"(inr0),
@@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0,
                   "r3",
                   "r4",
                   "r5");
+#endif
 #endif
  }
 }
@@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
                           w8,
                           vbias,
                           act_param);
+#else
+#ifdef LITE_WITH_ARM_CLANG
 #else
          act_switch_3x3s1(inr0,
                           inr1,
@@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
                           vbias,
                           vbias,
                           act_param);
+#endif
 #endif
          outl[0] += 4;
          outl[1] += 4;

--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
@@ -629,6 +629,7 @@ void conv_depthwise_3x3_fp32(const void* din,
                                act_param,
                                ctx);
    } else {
+#ifdef __aarch64__
      conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
                                reinterpret_cast<float*>(dout),
                                num,
@@ -643,6 +644,27 @@ void conv_depthwise_3x3_fp32(const void* din,
                                param,
                                act_param,
                                ctx);
+#else
+#ifdef LITE_WITH_ARM_CLANG
+      LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, "
+                    "this can run in basic";
+#else
+      conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
+                                reinterpret_cast<float*>(dout),
+                                num,
+                                ch_out,
+                                h_out,
+                                w_out,
+                                ch_in,
+                                h_in,
+                                w_in,
+                                reinterpret_cast<const float*>(weights),
+                                bias,
+                                param,
+                                act_param,
+                                ctx);
+#endif
+#endif
    }
  } else if (stride == 2) {
    if (pads_less && pad_h == pad_w && (pad < 2)) {  // support pad = [0, 1]

--- a/lite/kernels/arm/conv_compute.cc
+++ b/lite/kernels/arm/conv_compute.cc
@@ -60,6 +60,10 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
  bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2);
  bool flag_dw = flag_dw_3x3 || flag_dw_5x5;

+#ifdef LITE_WITH_ARM_CLANG  // clang
+  flag_dw_3x3 =
+      (stride == 1 && (paddings[0] > 1 || paddings[2] > 1)) ? false : true;
+#endif
  /// select conv impl
  if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
    impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;

--- a/lite/tests/cv/image_convert_test.cc
+++ b/lite/tests/cv/image_convert_test.cc
@@ -559,7 +559,7 @@ void test_img(const std::vector<int>& cluster_id,
  }
 }

-#if 0
+#if 1
 TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
  if (FLAGS_basic_test) {
    for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
@@ -573,12 +573,12 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
                    for (auto layout : {1}) {
                      if ((srcFormat == ImageFormat::NV12 ||
                           srcFormat == ImageFormat::NV21) &&
-                              (dstFormat == ImageFormat::GRAY)) {
+                          (dstFormat == ImageFormat::GRAY)) {
                        continue;
                      }
                      if ((dstFormat == ImageFormat::NV12 ||
                           dstFormat == ImageFormat::NV21) &&
-                              (srcFormat == ImageFormat::GRAY)) {
+                          (srcFormat == ImageFormat::GRAY)) {
                        continue;
                      }
                      if (srcFormat == ImageFormat::NV12 ||
@@ -611,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
  }
 }
 #endif
-#if 0
+#if 1
 TEST(TestImageConvertRand, test_func_image_resize_preprocess) {
  if (FLAGS_basic_test) {
    for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
@@ -624,7 +624,7 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) {
                  for (auto dstFormat : {0, 1, 2, 3, 4, 11}) {
                    for (auto layout : {1}) {
                      if (dstFormat == ImageFormat::NV12 ||
-                           dstFormat == ImageFormat::NV21 ||
+                          dstFormat == ImageFormat::NV21 ||
                          (srcFormat == ImageFormat::NV12 ||
                           srcFormat == ImageFormat::NV21) &&
                              dstFormat == ImageFormat::GRAY) {

--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -21,6 +21,7 @@ OPTMODEL_DIR=""
 BUILD_TAILOR=OFF
 BUILD_CV=OFF
 SHUTDOWN_LOG=ON
+LITE_WITH_ARM_LANG=OFF

 readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz

@@ -37,6 +38,14 @@ fi
 function prepare_workspace {
    local root_dir=$1
    local build_dir=$2
+    # ARM LANG
+    if [ ${ARM_LANG} == "clang" ]; then
+        LITE_WITH_ARM_LANG=ON
+    else
+        LITE_WITH_ARM_LANG=OFF
+    fi
+    echo "ARM_LANG is  ${ARM_LANG}"
+    echo "LITE_WITH_ARM_LANG is ${LITE_WITH_ARM_LANG}"
    # in build directory
    # 1. Prepare gen_code file
    GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code
@@ -106,7 +115,7 @@ function make_tiny_publish_so {
  if [ ${os} == "armlinux" ]; then
    BUILD_JAVA=OFF
  fi
-
+  
  cmake .. \
      ${PYTHON_FLAGS} \
      ${CMAKE_COMMON_OPTIONS} \
@@ -118,6 +127,7 @@ function make_tiny_publish_so {
      -DANDROID_STL_TYPE=$android_stl \
      -DLITE_BUILD_EXTRA=$BUILD_EXTRA \
      -DLITE_WITH_CV=$BUILD_CV \
+      -DLITE_WITH_ARM_LANG=$LITE_WITH_ARM_LANG \
      -DLITE_BUILD_TAILOR=$BUILD_TAILOR \
      -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
      -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -200,6 +210,7 @@ function make_full_publish_so {
      -DANDROID_STL_TYPE=$android_stl \
      -DLITE_BUILD_EXTRA=$BUILD_EXTRA \
      -DLITE_WITH_CV=$BUILD_CV \
+      -DLITE_WITH_ARM_LANG=$LITE_WITH_ARM_LANG \
      -DLITE_BUILD_TAILOR=$BUILD_TAILOR \
      -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
      -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -223,13 +234,14 @@ function make_all_tests {
  fi
  mkdir -p $build_directory
  cd $build_directory
-
+ 
  prepare_workspace $root_dir $build_directory
  cmake $root_dir \
      ${CMAKE_COMMON_OPTIONS} \
      -DWITH_TESTING=ON \
      -DLITE_BUILD_EXTRA=$BUILD_EXTRA \
      -DLITE_WITH_CV=$BUILD_CV \
+      -DLITE_WITH_ARM_LANG=$LITE_WITH_ARM_LANG \
      -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}

  make lite_compile_deps -j$NUM_PROC

--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
@@ -147,52 +147,34 @@ void resize(const uint8_t* src,
      yofs = yofs1;
      ialpha = ialpha1;
    }
-    if (sy == prev_sy1) {
-      memset(rowsbuf0, 0, sizeof(uint16_t) * w_out);
-      const uint8_t* S1 = src + srcw * (sy + 1);
-      const int16_t* ialphap = ialpha;
-      int16_t* rows1p = rowsbuf1;
-      for (int dx = 0; dx < dstw; dx++) {
-        int sx = xofs[dx];
-        int16_t a0 = ialphap[0];
-        int16_t a1 = ialphap[1];

-        const uint8_t* S1pl = S1 + sx;
-        const uint8_t* S1pr = S1 + sx + num;
-        for (int i = 0; i < num; i++) {
-          *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
-        }
-        ialphap += 2;
-      }
-    } else {
-      // hresize two rows
-      const uint8_t* S0 = src + w_in * (sy);
-      const uint8_t* S1 = src + w_in * (sy + 1);
-      const int16_t* ialphap = ialpha;
-      int16_t* rows0p = rowsbuf0;
-      int16_t* rows1p = rowsbuf1;
-      for (int dx = 0; dx < dstw; dx++) {
-        int sx = xofs[dx];
-        int16_t a0 = ialphap[0];
-        int16_t a1 = ialphap[1];
+    // hresize two rows
+    const uint8_t* S0 = src + w_in * (sy);
+    const uint8_t* S1 = src + w_in * (sy + 1);
+    const int16_t* ialphap = ialpha;
+    int16_t* rows0p = rowsbuf0;
+    int16_t* rows1p = rowsbuf1;
+    for (int dx = 0; dx < dstw; dx++) {
+      int sx = xofs[dx];
+      int16_t a0 = ialphap[0];
+      int16_t a1 = ialphap[1];

-        const uint8_t* S0pl = S0 + sx;
-        const uint8_t* S0pr = S0 + sx + num;
-        const uint8_t* S1pl = S1 + sx;
-        const uint8_t* S1pr = S1 + sx + num;
-        for (int i = 0; i < num; i++) {
-          *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
-          *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
-        }
-        ialphap += 2;
+      const uint8_t* S0pl = S0 + sx;
+      const uint8_t* S0pr = S0 + sx + num;
+      const uint8_t* S1pl = S1 + sx;
+      const uint8_t* S1pr = S1 + sx + num;
+      for (int i = 0; i < num; i++) {
+        *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
+        *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
      }
+      ialphap += 2;
    }
-    prev_sy1 = sy + 1;
+
    int16_t b0 = ibeta[0];
    int16_t b1 = ibeta[1];
    uint8_t* dp_ptr = dst + dy * w_out;
-    int16_t* rows0p = rowsbuf0;
-    int16_t* rows1p = rowsbuf1;
+    rows0p = rowsbuf0;
+    rows1p = rowsbuf1;
    int16x8_t _b0 = vdupq_n_s16(b0);
    int16x8_t _b1 = vdupq_n_s16(b1);
    int re_cnt = cnt;
@@ -281,6 +263,13 @@ void resize(const uint8_t* src,
                    2);
    }
    ibeta += 2;
+    delete[] rowsbuf0;
+    delete[] rowsbuf1;
+  }
+  if (orih < dsth) {  // uv
+    delete[] xofs1;
+    delete[] yofs1;
+    delete[] ialpha1;
  }
  delete[] buf;
 }