diff --git a/CMakeLists.txt b/CMakeLists.txt index e3f7a211d70920aa74765b976af6939d55a328ab..b4bfe5981e5c3f524c6d781665c100e33b0713ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,6 +76,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. if(ANDROID OR IOS OR ARMLINUX) diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 657b1ac82adeeceedc96dce259a523e7025ad527..fefa11d2e2089cf4838385dc6ed3dfe0aad994e6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -12,6 +12,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") +message(STATUS "LITE_WITH_ARM_CLANG:\t${LITE_WITH_ARM_CLANG}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc index 55ea94949ba93396c97be5e3ea66d6e29ce95429..cf7acfcb959760aa9d33c556ff0579a4dac940ed 100644 --- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#ifdef LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU STORE : [r0] "+r"(inr0), @@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kRelu6: @@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#ifdef LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU 
RELU6 STORE : [r0] "+r"(inr0), @@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#ifdef LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE LEAKY_RELU STORE : [r0] "+r"(inr0), @@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; default: @@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#ifdef LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE STORE : [r0] "+r"(inr0), @@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif } } @@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, w8, vbias, act_param); +#else +#ifdef LITE_WITH_ARM_CLANG #else act_switch_3x3s1(inr0, inr1, @@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, vbias, vbias, act_param); +#endif #endif outl[0] += 4; outl[1] += 4; diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 9412fc43f1eba87f5685256eeb6435bfad4438eb..4fcef3813b792808414415fa874e14f5ef253fcd 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -629,6 +629,7 @@ void conv_depthwise_3x3_fp32(const void* din, act_param, ctx); } else { +#ifdef __aarch64__ conv_3x3s1_depthwise_fp32(reinterpret_cast(din), reinterpret_cast(dout), num, @@ -643,6 +644,27 @@ void conv_depthwise_3x3_fp32(const void* din, param, act_param, ctx); +#else +#ifdef LITE_WITH_ARM_CLANG + LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, " + "this can run in basic"; +#else + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); +#endif +#endif } } else if (stride == 2) { if (pads_less && 
pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 74083b3617f428e4f94f12498e337328d0f1a2a8..34dd6ac97c6556b2fa4623fcb15f2180cf4a5656 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -60,6 +60,10 @@ void ConvCompute::PrepareForRun() { bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; +#ifdef LITE_WITH_ARM_CLANG // clang + flag_dw_3x3 &= !(stride == 1 && (paddings[0] > 1 || paddings[2] > 1)); + flag_dw = flag_dw_3x3 || flag_dw_5x5; +#endif /// select conv impl if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) { impl_ = new DepthwiseConv; diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index e22e327e8b10d1237f5e07b5b0a8d95d3b19e70b..a6835fc8f883e84c8235fd141dfbdab537812d16 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -559,7 +559,7 @@ void test_img(const std::vector& cluster_id, } } -#if 0 +#if 1 TEST(TestImageConvertRand, test_func_image_convert_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -573,12 +573,12 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { for (auto layout : {1}) { if ((srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY)) { + (dstFormat == ImageFormat::GRAY)) { continue; } if ((dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) && - (srcFormat == ImageFormat::GRAY)) { + (srcFormat == ImageFormat::GRAY)) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -611,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { } } #endif -#if 0 +#if 1 TEST(TestImageConvertRand, test_func_image_resize_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -624,7 +624,7 @@ TEST(TestImageConvertRand, 
test_func_image_resize_preprocess) { for (auto dstFormat : {0, 1, 2, 3, 4, 11}) { for (auto layout : {1}) { if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || + dstFormat == ImageFormat::NV21 || (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && dstFormat == ImageFormat::GRAY) { diff --git a/lite/tools/build.sh b/lite/tools/build.sh index c21f52ae40617329fb9e2e5209361344ee5110c8..089f5905f96f20f4f4287ab86e036c83c5c93035 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -21,6 +21,7 @@ OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF SHUTDOWN_LOG=ON +LITE_WITH_ARM_LANG=OFF readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -37,6 +38,14 @@ fi function prepare_workspace { local root_dir=$1 local build_dir=$2 + # ARM LANG + if [ "${ARM_LANG}" == "clang" ]; then + LITE_WITH_ARM_LANG=ON + else + LITE_WITH_ARM_LANG=OFF + fi + echo "ARM_LANG is ${ARM_LANG}" + echo "LITE_WITH_ARM_LANG is ${LITE_WITH_ARM_LANG}" # in build directory # 1. Prepare gen_code file GEN_CODE_PATH_PREFIX=$build_dir/lite/gen_code @@ -106,7 +115,7 @@ function make_tiny_publish_so { if [ ${os} == "armlinux" ]; then BUILD_JAVA=OFF fi - + cmake .. 
\ ${PYTHON_FLAGS} \ ${CMAKE_COMMON_OPTIONS} \ @@ -118,6 +127,7 @@ function make_tiny_publish_so { -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ + -DLITE_WITH_ARM_CLANG=$LITE_WITH_ARM_LANG \ -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -200,6 +210,7 @@ function make_full_publish_so { -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ + -DLITE_WITH_ARM_CLANG=$LITE_WITH_ARM_LANG \ -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -223,13 +234,14 @@ function make_all_tests { fi mkdir -p $build_directory cd $build_directory - + prepare_workspace $root_dir $build_directory cmake $root_dir \ ${CMAKE_COMMON_OPTIONS} \ -DWITH_TESTING=ON \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ + -DLITE_WITH_ARM_CLANG=$LITE_WITH_ARM_LANG \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} make lite_compile_deps -j$NUM_PROC diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 1baef9de2e636ade8630d76dce14e7cfc1ee25f5..1ad7e7aa9edd8191b8a99f1b71e883ad4467d8ff 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -147,52 +147,34 @@ void resize(const uint8_t* src, yofs = yofs1; ialpha = ialpha1; } - if (sy == prev_sy1) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx]; - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - for (int i = 0; i < num; i++) { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - ialphap += 2; - } - } else { - // hresize two rows 
- const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx]; - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; + // hresize two rows + const uint8_t* S0 = src + w_in * (sy); + const uint8_t* S1 = src + w_in * (sy + 1); + const int16_t* ialphap = ialpha; + int16_t* rows0p = rowsbuf0; + int16_t* rows1p = rowsbuf1; + for (int dx = 0; dx < dstw; dx++) { + int sx = xofs[dx]; + int16_t a0 = ialphap[0]; + int16_t a1 = ialphap[1]; - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - for (int i = 0; i < num; i++) { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - ialphap += 2; + const uint8_t* S0pl = S0 + sx; + const uint8_t* S0pr = S0 + sx + num; + const uint8_t* S1pl = S1 + sx; + const uint8_t* S1pr = S1 + sx + num; + for (int i = 0; i < num; i++) { + *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; + *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; } + ialphap += 2; } - prev_sy1 = sy + 1; + int16_t b0 = ibeta[0]; int16_t b1 = ibeta[1]; uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; + rows0p = rowsbuf0; + rows1p = rowsbuf1; int16x8_t _b0 = vdupq_n_s16(b0); int16x8_t _b1 = vdupq_n_s16(b1); int re_cnt = cnt; @@ -281,6 +263,13 @@ void resize(const uint8_t* src, 2); } ibeta += 2; + delete[] rowsbuf0; + delete[] rowsbuf1; + } + if (orih < dsth) { // uv + delete[] xofs1; + delete[] yofs1; + delete[] ialpha1; } delete[] buf; }