Paddle support compile on sw (#27858)

09fd2b2a · Wilber · GitHub · 953302d9 · 09fd2b2a · 09fd2b2a
12 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,6 +138,7 @@ option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
 option(WITH_CRYPTO   "Compile PaddlePaddle with crypto support"         ON)
 option(WITH_ARM   "Compile PaddlePaddle with arm support"         OFF)
+option(WITH_SW   "Compile PaddlePaddle with sw support"         OFF)
 option(WITH_MUSL        "Compile with musl libc instead of gblic"  OFF)

 # PY_VERSION
@@ -257,10 +258,18 @@ if(WITH_ARM)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE)
    set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
-    set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE)
    add_definitions(-DPADDLE_WITH_ARM)
 endif()

+if (WITH_SW)
+    # The -mieee flag avoids floating-point exceptions on sw and Alpha architectures
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee")
+    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE)
+    set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE)
+    add_definitions(-DPADDLE_WITH_SW)
+endif()
+
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")

 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -101,6 +101,8 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS)
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
+  find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS
+        ${REFERENCE_BLAS_LIB_SEARCH_PATHS})

  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
    set(CBLAS_PROVIDER REFERENCE_CBLAS)
@@ -127,5 +129,9 @@ endif()
 include_directories(${CBLAS_INC_DIR})
-if(NOT ${CBLAS_PROVIDER} STREQUAL MKLML)
+# REFERENCE_CBLAS also needs the Fortran runtime and the reference BLAS
+# library, so test for it first; otherwise the NOT-MKLML branch shadows it.
+if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS)
+  target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY})
+elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML)
   target_link_libraries(cblas ${CBLAS_LIBRARIES})
 endif()

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -97,3 +97,8 @@ endif()
 add_library(eigen3 INTERFACE)

 add_dependencies(eigen3 extern_eigen3)
+
+# sw does not support thread_local semantics
+if(WITH_SW)
+  add_definitions(-DEIGEN_AVOID_THREAD_LOCAL)
+endif()
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -183,7 +183,7 @@ set(GPU_COMMON_FLAGS
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
-if (NOT WITH_NV_JETSON AND NOT WITH_ARM)
+if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif()
 endif(NOT WIN32)

--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -32,7 +32,7 @@ class Tensor;
 #include <libxsmm.h>
 #endif

-#ifdef PADDLE_USE_OPENBLAS
+#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS)
 #include <cblas.h>
 #endif


--- a/paddle/fluid/operators/search_compute.h
+++ b/paddle/fluid/operators/search_compute.h
@@ -14,7 +14,7 @@ limitations under the License. */

 #pragma once

-#if !defined(PADDLE_WITH_ARM)
+#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW)
 #include <immintrin.h>
 #endif
 #include <cfloat>
@@ -74,7 +74,7 @@ void call_gemm_batched(const framework::ExecutionContext& ctx,
  }
 }

-#if !defined(PADDLE_WITH_ARM)
+#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW)

 #define __m256x __m256

@@ -114,7 +114,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) {
        _mm256_add_px(_mm256_load_px(y + jjj),
                      _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
  }
-#elif defined(PADDLE_WITH_ARM)
+#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW)
  PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported"));
 #else
  lll = len & ~SSE_CUT_LEN_MASK;
@@ -143,7 +143,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) {
  for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
    _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)));
  }
-#elif defined(PADDLE_WITH_ARM)
+#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW)
  PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported"));
 #else
  lll = len & ~SSE_CUT_LEN_MASK;

--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -42,6 +42,9 @@ void SetNumThreads(int num_threads) {
  int real_num_threads = num_threads > 1 ? num_threads : 1;
  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
  omp_set_num_threads(real_num_threads);
+#elif defined(PADDLE_USE_REFERENCE_CBLAS)
+  // Reference CBLAS does not support multi-threading, so there is nothing to set.
+  return;
 #else
  PADDLE_THROW(platform::errors::Unimplemented(
      "This library (except OPENBLAS, MKLML) is not supported yet, so the"

--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -140,7 +140,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
  if (cpu_isa == isa_any) {
    return true;
  } else {
-#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
+    !defined(PADDLE_WITH_SW)
    int reg[4];
    cpuid(reg, 0);
    int nIds = reg[0];

--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -40,7 +40,8 @@ limitations under the License. */
 #ifdef _WIN32
 #define cpuid(reg, x) __cpuidex(reg, x, 0)
 #else
-#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
+    !defined(PADDLE_WITH_SW)
 #include <cpuid.h>
 inline void cpuid(int reg[4], int x) {
  __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -37,9 +37,16 @@ namespace paddle {
 namespace platform {
 namespace {
 // Tracking the nested block stacks of each thread.
+#ifdef PADDLE_WITH_SW
+// sw does not support thread_local, so these stacks are plain globals.
+std::deque<int> block_id_stack;
+std::deque<Event *> annotation_stack;
+#else
+// Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
 thread_local std::deque<Event *> annotation_stack;
+#endif
 // stack to strore event sunch as pe and so on
 static std::deque<Event *> main_thread_annotation_stack{};
 static std::deque<std::string> main_thread_annotation_stack_name{};
@@ -288,8 +295,13 @@ class DeviceTracerImpl : public DeviceTracer {
  }

  void AddAnnotation(uint32_t id, Event *event) {
+#ifdef PADDLE_WITH_SW
+    std::forward_list<std::pair<uint32_t, Event *>> *local_correlations_pairs =
+        nullptr;
+#else
    thread_local std::forward_list<std::pair<uint32_t, Event *>>
        *local_correlations_pairs = nullptr;
+#endif
    if (local_correlations_pairs == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      correlations_pairs.emplace_front();
@@ -304,7 +316,11 @@ class DeviceTracerImpl : public DeviceTracer {
      VLOG(1) << "Empty timeline annotation.";
      return;
    }
+#ifdef PADDLE_WITH_SW
+    std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
+#else
    thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
+#endif
    if (local_cpu_records_ == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      cpu_records_.emplace_front();
@@ -335,8 +351,12 @@ class DeviceTracerImpl : public DeviceTracer {
      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
      return;
    }
+#ifdef PADDLE_WITH_SW
+    std::forward_list<MemInfoRecord> *local_mem_info_record = nullptr;
+#else
    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
        nullptr;
+#endif
    if (local_mem_info_record == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      mem_info_record_.emplace_front();
@@ -353,8 +373,12 @@ class DeviceTracerImpl : public DeviceTracer {
      VLOG(1) << "Empty timeline annotation.";
      return;
    }
+#ifdef PADDLE_WITH_SW
+    std::forward_list<ActiveKindRecord> *local_active_kind_records = nullptr;
+#else
    thread_local std::forward_list<ActiveKindRecord>
        *local_active_kind_records = nullptr;
+#endif
    if (local_active_kind_records == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      active_kind_records_.emplace_front();

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -106,7 +106,7 @@ if(APPLE)
    message(FATAL_ERROR "install_name_tool not found, please check.\n")
  endif()
 endif()
-if(LINUX)
+if(LINUX AND NOT WITH_SW)
  find_program(PATCHELF_EXECUTABLE patchelf)
  if(NOT PATCHELF_EXECUTABLE)
    message(FATAL_ERROR "patchelf not found, please install it.\n"

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -349,7 +349,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
        # The dynamic library compiled under aarch64 is greater than 64M,
        # and an oversize error will be reported when using patchelf.
-        if platform.machine() != 'aarch64':
+        # sw_64 does not support patchelf, so we skip the rpath patching there as well.
+        if platform.machine() != 'aarch64' and platform.machine() != 'sw_64':
          if os.system(command) != 0:
              raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))