Commit c28971a9 authored by baolei.an

merge to newest version

Parent 662e4d7c
......@@ -174,15 +174,26 @@ if(NOT WITH_DSO)
endif(WIN32)
endif(NOT WITH_DSO)
get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY)
function(import_static_library alias path)
function(add_cuda_static_lib alias cuda_lib_paths file_name)
unset(ABS_PATH CACHE)
find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH)
add_library(${alias} STATIC IMPORTED GLOBAL)
set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path})
set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE)
if (NOT ABS_PATH)
message(FATAL_ERROR "Can not find CUDA static library: ${file_name}")
endif()
endfunction()
import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a)
import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a)
import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a)
import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a)
add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a)
add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a)
add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a)
endif()
set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES})
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
......
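For context, the new add_cuda_static_lib helper resolves each static CUDA archive through find_library against the CUDNN_CHECK_LIBRARY_DIRS search paths, registers it as an IMPORTED STATIC target, and records the alias in the global CUDA_STATIC_MODULES property (together with cudnn_static). A minimal CMake sketch of how a downstream target can consume the registered modules; my_cuda_tool is a hypothetical target name, not part of this commit:

    # Gather every static CUDA module registered above via the global property.
    get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
    # Link them into a hypothetical CUDA executable.
    cuda_add_executable(my_cuda_tool main.cu)
    target_link_libraries(my_cuda_tool ${cuda_static_deps})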
......@@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
/usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
/usr/lib/${TARGET_ARCH}-linux-gnu/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
${CUDA_TOOLKIT_ROOT_DIR}/lib64
)
if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
......
INCLUDE(ExternalProject)
SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3)
SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
......@@ -16,9 +17,12 @@ if(WITH_AMD_GPU)
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
GIT_TAG
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
......@@ -29,12 +33,14 @@ else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
# eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
GIT_TAG
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
DOWNLOAD_NAME "eigen"
DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
......
......@@ -20,6 +20,7 @@ endif()
include(ExternalProject)
SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak)
set(XBYAK_PROJECT extern_xbyak)
set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
......@@ -38,8 +39,11 @@ ExternalProject_Add(
${XBYAK_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ""
GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
GIT_TAG "v5.661" # Jul 26th
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip
DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR}
DOWNLOAD_NAME "xbyak-5.66.zip"
DOWNLOAD_NO_PROGRESS 1
PREFIX ${XBYAK_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
......
INCLUDE(ExternalProject)
SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash)
set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
......@@ -18,10 +19,12 @@ if(WIN32)
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
DOWNLOAD_NAME "xxHash-0.6.5.zip"
DOWNLOAD_NO_PROGRESS 1
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
......@@ -41,10 +44,12 @@ else()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
DOWNLOAD_NAME "xxHash-0.6.5.zip"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
......
......@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
if(nv_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
......@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......
......@@ -164,7 +164,9 @@ function(lite_cc_library TARGET)
endfunction()
function(lite_cc_binary TARGET)
set(options "")
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
......@@ -255,6 +257,7 @@ endfunction()
set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
......
......@@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
......@@ -121,6 +122,9 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
endif()
if(LITE_WITH_CUDA)
add_dependencies(publish_inference paddle_full_api_shared)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
......@@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
)
add_dependencies(tiny_publish_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_lib)
......@@ -177,6 +181,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
)
add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference tiny_publish_cxx_lib)
add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
endif()
endif()
endif()
......@@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
endif()
endif()
if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
if ((ARM_TARGET_OS STREQUAL "android") AND
((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
if (NOT LITE_ON_TINY_PUBLISH)
# copy
......@@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
......@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
......@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
......@@ -65,6 +67,7 @@ endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
......@@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels}
BM_DEPS ${bm_kenrels})
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
BM_DEPS ${bm_kernels})
endif()
# for light api
set(light_api_deps
scope target_wrapper_host model_parser program)
if(LITE_WITH_CUDA)
get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
set(cuda_static_deps cudart_static cublas_static curand_static
cudnn_static culibos_static)
endif()
lite_cc_library(light_api SRCS light_api.cc
DEPS scope target_wrapper_host model_parser
......@@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels}
BM_DEPS ${bm_kenrels})
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
BM_DEPS ${bm_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -305,9 +307,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
X86_DEPS ${x86_kernels})
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
......@@ -316,7 +319,9 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels})
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)},
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kOpenCL), PRECISION(kFloat)}};
};
if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
......
......@@ -24,13 +24,6 @@
namespace paddle {
namespace lite {
static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
".tailored_ops_source_list";
static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type,
bool record_info) {
......@@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
// The shape of input tensors must be determined before generating NPU and XPU
// program.
auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc *> feeds;
std::vector<cpp::OpDesc *> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
#else
if (!program_) {
GenRuntimeProgram();
}
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
#endif
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
}
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (size_t i = 0; i < feeds.size(); i++) {
......@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
void Predictor::Build(const lite_api::CxxConfig &config,
......@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
Program program(desc, scope_, inner_places);
/// The first place in valid_places is
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
factor.ConsiderDataLayout();
optimizer_.Run(std::move(program), inner_places, factor, passes);
exec_scope_ = optimizer_.exec_scope();
PrepareFeedFetch();
......@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
return &var->Get<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
......@@ -29,6 +29,13 @@
namespace paddle {
namespace lite {
static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
".tailored_ops_source_list";
static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
/*
* Predictor for inference, input a model, it will optimize and execute it.
*/
......
......@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
TestModel(valid_places);
......
......@@ -20,6 +20,7 @@
// model_optimize_tool's compiling period
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
......@@ -31,6 +32,18 @@ DEFINE_string(model_dir,
"",
"path of the model. This option will be ignored if model_file "
"and param_file are exist");
DEFINE_string(model_filename,
"",
"model topo filename of the model in models set. This option"
" will be used to specific tailoring");
DEFINE_string(param_filename,
"",
"model param filename of the model in models set. This option"
" will be used to specific tailoring");
DEFINE_string(model_set_dir,
"",
"path of the models set. This option will be used to specific"
" tailoring");
DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string(
......@@ -58,29 +71,23 @@ void DisplayKernels() {
LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void Main() {
if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) {
LOG(WARNING)
<< "Load combined-param model. Option model_dir will be ignored";
}
if (FLAGS_display_kernels) {
DisplayKernels();
exit(0);
}
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_model_file(FLAGS_model_file);
config.set_param_file(FLAGS_param_file);
std::vector<Place> ParserValidPlaces() {
std::vector<Place> valid_places;
auto target_reprs = lite::Split(FLAGS_valid_targets, " ");
auto target_reprs = lite::Split(FLAGS_valid_targets, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(TARGET(kOpenCL));
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86));
} else {
......@@ -100,26 +107,130 @@ void Main() {
valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
return valid_places;
}
void RunOptimize(const std::string& model_dir,
const std::string& model_file,
const std::string& param_file,
const std::string& optimize_out,
const std::string& optimize_out_type,
const std::vector<Place>& valid_places,
bool record_tailoring_info) {
if (!model_file.empty() && !param_file.empty()) {
LOG(WARNING)
<< "Load combined-param model. Option model_dir will be ignored";
}
lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_model_file(model_file);
config.set_param_file(param_file);
config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(config);
LiteModelType model_type;
if (FLAGS_optimize_out_type == "protobuf") {
if (optimize_out_type == "protobuf") {
model_type = LiteModelType::kProtobuf;
} else if (FLAGS_optimize_out_type == "naive_buffer") {
} else if (optimize_out_type == "naive_buffer") {
model_type = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type;
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
predictor->SaveOptimizedModel(
FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info);
if (FLAGS_record_tailoring_info) {
optimize_out, model_type, record_tailoring_info);
if (record_tailoring_info) {
LOG(INFO) << "Record the information of tailored model into :"
<< FLAGS_optimize_out;
<< optimize_out;
}
}
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void Main() {
if (FLAGS_display_kernels) {
DisplayKernels();
exit(0);
}
auto valid_places = ParserValidPlaces();
if (FLAGS_model_set_dir == "") {
RunOptimize(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_param_file,
FLAGS_optimize_out,
FLAGS_optimize_out_type,
valid_places,
FLAGS_record_tailoring_info);
return;
}
if (!FLAGS_record_tailoring_info) {
LOG(WARNING) << "--model_set_dir option only be used with "
"--record_tailoring_info=true together";
return;
}
auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model";
}
// Optimize models in FLAGS_model_set_dir
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({FLAGS_model_set_dir, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({FLAGS_optimize_out, name}, "/");
std::string model_file = "";
std::string param_file = "";
if (FLAGS_model_filename != "" && FLAGS_param_filename != "") {
model_file =
lite::Join<std::string>({input_model_dir, FLAGS_model_filename}, "/");
param_file =
lite::Join<std::string>({input_model_dir, FLAGS_param_filename}, "/");
}
LOG(INFO) << "Start optimize model: " << input_model_dir;
RunOptimize(input_model_dir,
model_file,
param_file,
output_model_dir,
FLAGS_optimize_out_type,
valid_places,
FLAGS_record_tailoring_info);
LOG(INFO) << "Optimize done. ";
}
// Collect all models information
CollectModelMetaInfo(
FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(FLAGS_optimize_out,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
}
} // namespace lite_api
......
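With these changes model_optimize_tool parses --valid_targets as a comma-separated list and, when --model_set_dir and --record_tailoring_info=true are given, optimizes every model under the set directory and merges the per-model tailoring lists into the .tailored_* files under --optimize_out. An illustrative invocation (the file names __model__ and __params__ are assumptions, not mandated by this commit): ./model_optimize_tool --model_set_dir=./models --model_filename=__model__ --param_filename=__params__ --valid_targets=arm,opencl --optimize_out_type=naive_buffer --optimize_out=./opt_models --record_tailoring_info=true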
......@@ -21,14 +21,14 @@
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/tests/utils/timer.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
using paddle::lite::Timer;
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
......@@ -102,20 +102,20 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.start();
ti.Start();
predictor->Run();
ti.end();
LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms";
float t = ti.Stop();
LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup_times
<< ", repeats: " << repeat << ", avg time: " << ti.get_average_ms()
<< ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
<< " ms"
<< ", min time: " << ti.get_min_time() << " ms"
<< ", max time: " << ti.get_max_time() << " ms.";
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
......
......@@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) {
}
}
template <typename T>
void Tensor::CopyToCpu(T *data) {
void Tensor::CopyToCpu(T *data) const {
const T *src_data = tensor(raw_tensor_)->data<T>();
int64_t num = tensor(raw_tensor_)->numel();
CHECK(num > 0) << "You should call Resize interface first";
......@@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyToCpu(int8_t *);
template void Tensor::CopyToCpu(float *);
template void Tensor::CopyToCpu(int *);
template void Tensor::CopyToCpu(int8_t *) const;
template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const;
shape_t Tensor::shape() const {
return ctensor(raw_tensor_)->dims().Vectorize();
......
......@@ -49,7 +49,7 @@ struct LITE_API Tensor {
void CopyFromCpu(const T* data);
template <typename T>
void CopyToCpu(T* data);
void CopyToCpu(T* data) const;
/// Shape of the tensor.
shape_t shape() const;
TargetType target() const;
......
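Making Tensor::CopyToCpu a const member allows results to be copied out through read-only output handles, e.g. predictor->GetOutput(0)->CopyToCpu(host_buffer) (assuming outputs are exposed as const Tensor handles, as in the full API); on a const Tensor the previous non-const overload would not have compiled.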
......@@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) {
"any",
"fpga",
"npu",
"xpu",
"bm"};
"xpu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) {
"kAny",
"kFPGA",
"kNPU",
"kXPU",
"kBM"};
"kXPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -131,8 +129,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kOpenCL),
TARGET(kNPU),
TARGET(kXPU),
TARGET(kFPGA),
TARGET(kBM)});
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -52,9 +52,8 @@ enum class TargetType : int {
kFPGA = 7,
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
NUM = 11, // number of fields.
NUM = 10, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass);
USE_MIR_PASS(variable_place_inference_pass);
USE_MIR_PASS(type_target_cast_pass);
USE_MIR_PASS(generate_program_pass);
USE_MIR_PASS(subgraph_program_pass);
#ifdef LITE_WITH_NPU
USE_MIR_PASS(generate_npu_program_pass);
#endif
#ifdef LITE_WITH_XPU
USE_MIR_PASS(generate_xpu_program_pass);
#endif
USE_MIR_PASS(io_copy_kernel_pick_pass);
USE_MIR_PASS(argument_type_display_pass);
......
......@@ -12,20 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
......
......@@ -60,6 +60,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
cc_library(math_arm SRCS
funcs.cc
packed_sgemm.cc
packed_sgemm_c4.cc
sgemm.cc
gemm_prepacked_int8.cc
gemm_s8.cc
......@@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
gemv_arm_int8.cc
conv3x3s1_direct_fp32.cc
conv3x3s2_direct_fp32.cc
conv3x3s1_depthwise_fp32.cc
conv3x3s2_depthwise_fp32.cc
conv3x3s1p01_depthwise_fp32.cc
conv3x3s2p01_depthwise_fp32.cc
conv3x3s1px_depthwise_fp32.cc
conv3x3s2px_depthwise_fp32.cc
conv3x3s1_direct_int8.cc
conv3x3s2_direct_int8.cc
conv3x3s1_depthwise_int8.cc
......@@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv5x5s1_depthwise_int8.cc
conv5x5s1_depthwise_fp32.cc
conv5x5s2_depthwise_fp32.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_depthwise_3x3s1.cc
conv_depthwise_3x3s2.cc
conv3x3_winograd_fp32_c4.cc
conv_winograd_3x3.cc
conv_impl.cc
softmax.cc
scale.cc
pooling.cc
elementwise.cc
layout.cc
lrn.cc
decode_bboxes.cc
concat.cc
......@@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
anchor_generator.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
......@@ -32,8 +32,10 @@ void col2im<float>(const float* data_col,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int pad_h0,
const int pad_h1,
const int pad_w0,
const int pad_w1,
const int stride_h,
const int stride_w,
const int dilation_h,
......@@ -41,19 +43,22 @@ void col2im<float>(const float* data_col,
float* data_im) {
memset(data_im, 0, height * width * channels * sizeof(float));
const int output_h =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
(height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) /
stride_h +
1;
const int output_w =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
(width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1;
const int channel_size = height * width;
for (int channel = channels; channel--; data_im += channel_size) {
for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
int input_row = -pad_h + kernel_row * dilation_h;
int input_row = -pad_h0 + kernel_row * dilation_h;
for (int output_rows = output_h; output_rows; output_rows--) {
if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
data_col += output_w;
} else {
int input_col = -pad_w + kernel_col * dilation_w;
int input_col = -pad_w0 + kernel_col * dilation_w;
for (int output_col = output_w; output_col; output_col--) {
if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
data_im[input_row * width + input_col] += *data_col;
......
......@@ -26,8 +26,10 @@ void col2im(const Dtype* data_col,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int pad_h0,
const int pad_h1,
const int pad_w0,
const int pad_w1,
const int stride_h,
const int stride_w,
const int dilation_h,
......
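With the padding split into leading and trailing amounts, the output size follows output_h = (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1, and analogously for the width. For example, height = 5, kernel_h = 3, dilation_h = 1, stride_h = 1, pad_h0 = 1, pad_h1 = 0 gives output_h = (5 + 1 + 0 - 3) / 1 + 1 = 4, whereas the old symmetric pad_h = 1 would have produced 5.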
This diff is collapsed.
......@@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param,
auto dim_in = param.x->dims();
auto dim_out = param.output->dims();
const int threads = ctx->threads();
auto paddings = *param.paddings;
int llc_size = ctx->llc_size() / sizeof(float);
const int pad_w = param.paddings[1];
const int pad_h = param.paddings[0];
const int pad_w = paddings[2];
const int pad_h = paddings[0];
int ow = dim_out[3];
int oh = dim_out[2];
int ic = dim_in[1];
......@@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data,
ARMContext* ctx) {
const int threads = ctx->threads();
int l2_size = ctx->llc_size() / sizeof(float);
auto paddings = *param.paddings;
const int pad_h = param.paddings[0];
const int pad_w = param.paddings[1];
const int pad_h = paddings[0];
const int pad_w = paddings[2];
const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
const int win_round = wout_round + 2;
bool flag_relu = param.fuse_relu;
......
......@@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din,
const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx,
const float* scale) {
auto paddings = *param.paddings;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias;
int pad_h = param.paddings[0];
int pad_w = param.paddings[1];
int pad_h = paddings[0];
int pad_w = paddings[2];
const int threads = ctx->threads();
int llc_size = ctx->llc_size() / 4;
......
This diff is collapsed.
This diff is collapsed.
......@@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param,
ARMContext* ctx) {
auto dim_in = param.x->dims();
auto dim_out = param.output->dims();
auto paddings = *param.paddings;
const int threads = ctx->threads();
int llc_size = ctx->llc_size() / sizeof(float);
const int pad_w = param.paddings[1];
const int pad_h = param.paddings[0];
const int pad_w = paddings[2];
const int pad_h = paddings[0];
int ow = dim_out[3];
int oh = dim_out[2];
int ic = dim_in[1];
......@@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data,
//! 3x3s2 convolution, implemented by direct algorithm
//! prepack input to tmp buffer
//! write output to tmp buffer
auto paddings = *param.paddings;
const int threads = ctx->threads();
int l2_size = ctx->llc_size() / sizeof(float);
const int pad_w = param.paddings[1];
const int pad_h = param.paddings[0];
const int pad_w = paddings[2];
const int pad_h = paddings[0];
const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
const int win_round = wout_round * 2 /*stride_w*/ + 1;
bool flag_relu = param.fuse_relu;
......
......@@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
//! 3x3s2 int8 convolution, implemented by direct algorithm
//! prepack input to tmp buffer
//! write output to tmp buffer
auto paddings = *param.paddings;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias;
int pad_h = param.paddings[0];
int pad_w = param.paddings[1];
int pad_h = paddings[0];
int pad_w = paddings[1];
const int threads = ctx->threads();
int llc_size = ctx->llc_size() / 4;
......@@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
//! 3x3s2 int8 convolution, implemented by direct algorithm
//! prepack input to tmp buffer
//! write output to tmp buffer
auto paddings = *param.paddings;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias;
int pad_h = param.paddings[0];
int pad_w = param.paddings[1];
int pad_h = paddings[0];
int pad_w = paddings[1];
const int threads = ctx->threads();
//! set 1/4 l2 cache
int llc_size = ctx->llc_size() / 4;
......
This diff is collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto paddings = *param.paddings;
int threads = ctx->threads();
const int pad_h = paddings[0];
const int pad_w = paddings[2];
const int out_c_block = 4;
const int out_h_kernel = 1;
const int out_w_kernel = 4;
const int win_ext = ow * 2 + 1;
const int ow_round = ROUNDUP(ow, 4);
const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size =
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
/// get workspace
auto ptr_zero = ctx->workspace_data<float>();
memset(ptr_zero, 0, sizeof(float) * win_round);
float* ptr_write = ptr_zero + win_round;
int size_in_channel = win * ih;
int size_out_channel = ow * oh;
int ws = -pad_w;
int we = ws + win_round;
int hs = -pad_h;
int he = hs + hin_round;
int w_loop = ow_round / 4;
auto remain = w_loop * 4 - ow;
bool flag_remain = remain > 0;
remain = 4 - remain;
remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block;
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < oc; c += out_c_block) {
#ifdef ARM_WITH_OMP
float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
float* pre_din = ptr_write + ow_round;
#endif
/// const array size
prepack_input_nxwc4_dw(
din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
const float* weight_c = weights + c * 9; // kernel_w * kernel_h
float* dout_c00 = dout_batch + c * size_out_channel;
float bias_local[4] = {0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
}
#ifdef __aarch64__
float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
#endif
for (int h = 0; h < oh; h += out_h_kernel) {
float* outc0 = dout_c00 + h * ow;
float* outc1 = outc0 + size_out_channel;
float* outc2 = outc1 + size_out_channel;
float* outc3 = outc2 + size_out_channel;
const float* inr0 = pre_din + h * 2 * row_len;
const float* inr1 = inr0 + row_len;
const float* inr2 = inr1 + row_len;
if (c + out_c_block > oc) {
switch (c + out_c_block - oc) {
case 3:
outc1 = ptr_write;
case 2:
outc2 = ptr_write;
case 1:
outc3 = ptr_write;
default:
break;
}
}
auto c0 = outc0;
auto c1 = outc1;
auto c2 = outc2;
auto c3 = outc3;
float pre_out[16];
for (int w = 0; w < w_loop; ++w) {
bool flag_mask = (w == w_loop - 1) && flag_remain;
if (flag_mask) {
c0 = outc0;
c1 = outc1;
c2 = outc2;
c3 = outc3;
outc0 = pre_out;
outc1 = pre_out + 4;
outc2 = pre_out + 8;
outc3 = pre_out + 12;
}
// clang-format off
#ifdef __aarch64__
asm volatile(
"ldr q8, [%[bias]]\n" /* load bias */
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
"and v19.16b, v8.16b, v8.16b\n"
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
"and v20.16b, v8.16b, v8.16b\n"
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/
"and v21.16b, v8.16b, v8.16b\n"
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/
"and v22.16b, v8.16b, v8.16b\n"
"ldr q8, [%[inr0]]\n" /* load input r0*/
/* r0 mul w0-w2, get out */
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else
asm volatile(
/* fill with bias */
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */
/* load weights */
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/
"vand.i32 q12, q8, q8\n"
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/
"vand.i32 q13, q8, q8\n"
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/
"vand.i32 q14, q8, q8\n"
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/
"vand.i32 q15, q8, q8\n"
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/
/* mul r0 with w0, w1, w2 */
"vmla.f32 q12, q9, q0 @ w0 * inr0\n"
"vmla.f32 q13, q9, q2 @ w0 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */
"vmla.f32 q14, q9, q4 @ w0 * inr4\n"
"vmla.f32 q15, q9, q6 @ w0 * inr6\n"
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif // __aarch64__
// clang-format on
if (flag_mask) {
for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i];
c1[i] = pre_out[i + 4];
c2[i] = pre_out[i + 8];
c3[i] = pre_out[i + 12];
}
}
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
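In this new 3x3 stride-2 depthwise kernel, each inner iteration produces a tile of 4 output channels by 4 output columns. For example, with ow = 10 the width is rounded up to ow_round = 12 and w_loop = 3; the last tile overhangs the row by 2 columns, so it is written to the pre_out scratch buffer and only the remain = 2 valid columns are copied back into the output tensor.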
This diff is collapsed.
......@@ -314,7 +314,23 @@ void fill_bias_int8(int* tensor,
const int* bias,
int channel,
int channel_size);
// new winograd
void weight_trans_c4(
float* dest, const float* src, int ic, int oc, void* workspace);
void conv_compute_6x6_3x3(const float* input,
float* output,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weight,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
} // namespace math
} // namespace arm
} // namespace lite
......
Diffs for eight more changed files are collapsed.
if(NOT LITE_WITH_CUDA)
return()
endif()
set(cuda_static_deps cudnn_static cublas_static curand_static
culibos_static cudart_static)
get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps})
......
Diffs for the remaining changed files are collapsed.