From 09fd2b2aab0d1dfd90c0dbe1d6489958994d6f34 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 3 Nov 2020 00:23:49 -0600 Subject: [PATCH] Paddle support compile on sw (#27858) --- CMakeLists.txt | 11 ++++++++++- cmake/cblas.cmake | 4 ++++ cmake/external/eigen.cmake | 5 +++++ cmake/flags.cmake | 2 +- paddle/fluid/operators/math/blas.h | 2 +- paddle/fluid/operators/search_compute.h | 8 ++++---- paddle/fluid/platform/cpu_helper.cc | 3 +++ paddle/fluid/platform/cpu_info.cc | 3 ++- paddle/fluid/platform/cpu_info.h | 3 ++- paddle/fluid/platform/device_tracer.cc | 24 ++++++++++++++++++++++++ python/CMakeLists.txt | 2 +- python/setup.py.in | 3 ++- 12 files changed, 59 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80820c6487c..91820123da4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) # PY_VERSION @@ -257,10 +258,18 @@ if(WITH_ARM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE) add_definitions(-DPADDLE_WITH_ARM) endif() +if (WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." 
FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 32042864be4..75bb8bdda21 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -101,6 +101,8 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS) ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS + ${REFERENCE_BLAS_LIB_SEARCH_PATHS}) if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_PROVIDER REFERENCE_CBLAS) @@ -127,5 +129,7 @@ endif() include_directories(${CBLAS_INC_DIR}) if(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) +elseif(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) + target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY}) endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index b1e38978910..f27dcd06ef8 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -97,3 +97,8 @@ endif() add_library(eigen3 INTERFACE) add_dependencies(eigen3 extern_eigen3) + +# sw does not support the thread_local semantic +if(WITH_SW) + add_definitions(-DEIGEN_AVOID_THREAD_LOCAL) +endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ed0bf8396b3..ef7d3f2f5ba 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -183,7 +183,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. 
-Wno-error=array-bounds # Warnings in Eigen::array ) -if (NOT WITH_NV_JETSON AND NOT WITH_ARM) +if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() endif(NOT WIN32) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 562e2de3bd3..6e61031ec1c 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -32,7 +32,7 @@ class Tensor; #include #endif -#ifdef PADDLE_USE_OPENBLAS +#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS) #include #endif diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index d166b350af3..df302310517 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #include #endif #include @@ -74,7 +74,7 @@ void call_gemm_batched(const framework::ExecutionContext& ctx, } } -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #define __m256x __m256 @@ -114,7 +114,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) { _mm256_add_px(_mm256_load_px(y + jjj), _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; @@ -143,7 +143,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) { for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; 
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index a402f397348..46fdc2b4570 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -42,6 +42,9 @@ void SetNumThreads(int num_threads) { int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); +#elif defined(PADDLE_USE_REFERENCE_CBLAS) + // cblas does not support multi-threading + return; #else PADDLE_THROW(platform::errors::Unimplemented( "This library (except OPENBLAS, MKLML) is not supported yet, so the" diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2df1f291f9f..6f25df107f6 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -140,7 +140,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) { if (cpu_isa == isa_any) { return true; } else { -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) int reg[4]; cpuid(reg, 0); int nIds = reg[0]; diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index c071246c512..10870b2b728 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -40,7 +40,8 @@ limitations under the License. 
*/ #ifdef _WIN32 #define cpuid(reg, x) __cpuidex(reg, x, 0) #else -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) #include inline void cpuid(int reg[4], int x) { __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ec934c3b980..bbf8e4d5ca7 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -37,9 +37,16 @@ namespace paddle { namespace platform { namespace { // Tracking the nested block stacks of each thread. +#ifdef PADDLE_WITH_SW +// sw does not support thread_local +std::deque block_id_stack; +std::deque annotation_stack; +#else +// Tracking the nested event stacks. thread_local std::deque block_id_stack; // Tracking the nested event stacks. thread_local std::deque annotation_stack; +#endif // stack to strore event sunch as pe and so on static std::deque main_thread_annotation_stack{}; static std::deque main_thread_annotation_stack_name{}; @@ -288,8 +295,13 @@ class DeviceTracerImpl : public DeviceTracer { } void AddAnnotation(uint32_t id, Event *event) { +#ifdef PADDLE_WITH_SW + std::forward_list> *local_correlations_pairs = + nullptr; +#else thread_local std::forward_list> *local_correlations_pairs = nullptr; +#endif if (local_correlations_pairs == nullptr) { std::lock_guard l(trace_mu_); correlations_pairs.emplace_front(); @@ -304,7 +316,11 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_cpu_records_ = nullptr; +#else thread_local std::forward_list *local_cpu_records_ = nullptr; +#endif if (local_cpu_records_ == nullptr) { std::lock_guard l(trace_mu_); cpu_records_.emplace_front(); @@ -335,8 +351,12 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; 
return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_mem_info_record = nullptr; +#else thread_local std::forward_list *local_mem_info_record = nullptr; +#endif if (local_mem_info_record == nullptr) { std::lock_guard l(trace_mu_); mem_info_record_.emplace_front(); @@ -353,8 +373,12 @@ VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_active_kind_records = nullptr; +#else thread_local std::forward_list *local_active_kind_records = nullptr; +#endif if (local_active_kind_records == nullptr) { std::lock_guard l(trace_mu_); active_kind_records_.emplace_front(); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8244b91d32d..c7ee43a3bc0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -106,7 +106,7 @@ if(APPLE) message(FATAL_ERROR "install_name_tool not found, please check.\n") endif() endif() -if(LINUX) +if(LINUX AND NOT WITH_SW) find_program(PATCHELF_EXECUTABLE patchelf) if(NOT PATCHELF_EXECUTABLE) message(FATAL_ERROR "patchelf not found, please install it.\n" diff --git a/python/setup.py.in b/python/setup.py.in index f9395f8dd31..b7a6289d38f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -349,7 +349,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' # The dynamic library compiled under aarch64 is greater than 64M, # and an oversize error will be reported when using patchelf. - if platform.machine() != 'aarch64': + # The sw_64 does not support patchelf, so we just disable it. + if platform.machine() != 'aarch64' and platform.machine() != 'sw_64': if os.system(command) != 0: raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) -- GitLab