From 09fd2b2aab0d1dfd90c0dbe1d6489958994d6f34 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 3 Nov 2020 00:23:49 -0600 Subject: [PATCH] Paddle support compile on sw (#27858) --- CMakeLists.txt | 11 ++++++++++- cmake/cblas.cmake | 4 ++++ cmake/external/eigen.cmake | 5 +++++ cmake/flags.cmake | 2 +- paddle/fluid/operators/math/blas.h | 2 +- paddle/fluid/operators/search_compute.h | 8 ++++---- paddle/fluid/platform/cpu_helper.cc | 3 +++ paddle/fluid/platform/cpu_info.cc | 3 ++- paddle/fluid/platform/cpu_info.h | 3 ++- paddle/fluid/platform/device_tracer.cc | 24 ++++++++++++++++++++++++ python/CMakeLists.txt | 2 +- python/setup.py.in | 3 ++- 12 files changed, 59 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80820c6487c..91820123da4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) # PY_VERSION @@ -257,10 +258,18 @@ if(WITH_ARM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE) add_definitions(-DPADDLE_WITH_ARM) endif() +if (WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." 
FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 32042864be4..75bb8bdda21 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -101,6 +101,8 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS) ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS + ${REFERENCE_BLAS_LIB_SEARCH_PATHS}) if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_PROVIDER REFERENCE_CBLAS) @@ -127,5 +129,7 @@ endif() include_directories(${CBLAS_INC_DIR}) if(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) +elseif(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) + target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY}) endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index b1e38978910..f27dcd06ef8 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -97,3 +97,8 @@ endif() add_library(eigen3 INTERFACE) add_dependencies(eigen3 extern_eigen3) + +# sw does not support the thread_local semantic +if(WITH_SW) + add_definitions(-DEIGEN_AVOID_THREAD_LOCAL) +endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ed0bf8396b3..ef7d3f2f5ba 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -183,7 +183,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. 
-Wno-error=array-bounds # Warnings in Eigen::array ) -if (NOT WITH_NV_JETSON AND NOT WITH_ARM) +if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() endif(NOT WIN32) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 562e2de3bd3..6e61031ec1c 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -32,7 +32,7 @@ class Tensor; #include #endif -#ifdef PADDLE_USE_OPENBLAS +#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS) #include #endif diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index d166b350af3..df302310517 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #include #endif #include @@ -74,7 +74,7 @@ void call_gemm_batched(const framework::ExecutionContext& ctx, } } -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #define __m256x __m256 @@ -114,7 +114,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) { _mm256_add_px(_mm256_load_px(y + jjj), _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; @@ -143,7 +143,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) { for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; 
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index a402f397348..46fdc2b4570 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -42,6 +42,9 @@ void SetNumThreads(int num_threads) { int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); +#elif defined(PADDLE_USE_REFERENCE_CBLAS) + // cblas does not support multi-threading + return; #else PADDLE_THROW(platform::errors::Unimplemented( "This library (except OPENBLAS, MKLML) is not supported yet, so the" diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2df1f291f9f..6f25df107f6 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -140,7 +140,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) { if (cpu_isa == isa_any) { return true; } else { -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) int reg[4]; cpuid(reg, 0); int nIds = reg[0]; diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index c071246c512..10870b2b728 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -40,7 +40,8 @@ limitations under the License. 
*/ #ifdef _WIN32 #define cpuid(reg, x) __cpuidex(reg, x, 0) #else -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) #include inline void cpuid(int reg[4], int x) { __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ec934c3b980..bbf8e4d5ca7 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -37,9 +37,16 @@ namespace paddle { namespace platform { namespace { // Tracking the nested block stacks of each thread. +#ifdef PADDLE_WITH_SW +// sw does not support thread_local +std::deque block_id_stack; +std::deque annotation_stack; +#else +// Tracking the nested event stacks. thread_local std::deque block_id_stack; // Tracking the nested event stacks. thread_local std::deque annotation_stack; +#endif // stack to strore event sunch as pe and so on static std::deque main_thread_annotation_stack{}; static std::deque main_thread_annotation_stack_name{}; @@ -288,8 +295,13 @@ class DeviceTracerImpl : public DeviceTracer { } void AddAnnotation(uint32_t id, Event *event) { +#ifdef PADDLE_WITH_SW + std::forward_list> *local_correlations_pairs = + nullptr; +#else thread_local std::forward_list> *local_correlations_pairs = nullptr; +#endif if (local_correlations_pairs == nullptr) { std::lock_guard l(trace_mu_); correlations_pairs.emplace_front(); @@ -304,7 +316,11 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_cpu_records_ = nullptr; +#else thread_local std::forward_list *local_cpu_records_ = nullptr; +#endif if (local_cpu_records_ == nullptr) { std::lock_guard l(trace_mu_); cpu_records_.emplace_front(); @@ -335,8 +351,12 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; 
return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_mem_info_record = nullptr; +#else thread_local std::forward_list *local_mem_info_record = nullptr; +#endif if (local_mem_info_record == nullptr) { std::lock_guard l(trace_mu_); mem_info_record_.emplace_front(); @@ -353,8 +373,12 @@ VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_active_kind_records = nullptr; +#else thread_local std::forward_list *local_active_kind_records = nullptr; +#endif if (local_active_kind_records == nullptr) { std::lock_guard l(trace_mu_); active_kind_records_.emplace_front(); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8244b91d32d..c7ee43a3bc0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -106,7 +106,7 @@ if(APPLE) message(FATAL_ERROR "install_name_tool not found, please check.\n") endif() endif() -if(LINUX) +if(LINUX AND NOT WITH_SW) find_program(PATCHELF_EXECUTABLE patchelf) if(NOT PATCHELF_EXECUTABLE) message(FATAL_ERROR "patchelf not found, please install it.\n" diff --git a/python/setup.py.in b/python/setup.py.in index f9395f8dd31..b7a6289d38f 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -349,7 +349,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' # The dynamic library compiled under aarch64 is greater than 64M, # and an oversize error will be reported when using patchelf. - if platform.machine() != 'aarch64': + # The sw_64 does not support patchelf, so we just disable it. + if platform.machine() != 'aarch64' and platform.machine() != 'sw_64': if os.system(command) != 0: raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) -- GitLab