未验证 提交 09fd2b2a 编写于 作者: W Wilber 提交者: GitHub

Paddle support compile on sw (#27858)

上级 953302d9
......@@ -138,6 +138,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
option(WITH_MUSL "Compile with musl libc instead of glibc" OFF)
# PY_VERSION
......@@ -257,10 +258,18 @@ if(WITH_ARM)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE)
set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE)
add_definitions(-DPADDLE_WITH_ARM)
endif()
# Build adjustments for the sw (sw_64) architecture.
if (WITH_SW)
# mieee flag solves floating-point exceptions under sw and ALPHA architectures
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee")
# XBYAK and MKL are force-disabled for sw builds (mirrors the WITH_ARM
# handling above); their option descriptions record why.
set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE)
set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE)
# Expose the target to C++ sources as PADDLE_WITH_SW.
add_definitions(-DPADDLE_WITH_SW)
endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
......@@ -101,6 +101,8 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS)
${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS
${REFERENCE_BLAS_LIB_SEARCH_PATHS})
if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
set(CBLAS_PROVIDER REFERENCE_CBLAS)
......@@ -127,5 +129,7 @@ endif()
include_directories(${CBLAS_INC_DIR})
# Link the cblas interface library against the selected BLAS provider.
# NOTE: the REFERENCE_CBLAS branch must be tested FIRST — as originally
# written (generic "NOT MKLML" check first), a REFERENCE_CBLAS provider
# also matched the first condition, making the elseif unreachable and
# silently dropping gfortran/${REFERENCE_BLAS_LIBRARY}, which the
# reference CBLAS needs to resolve its Fortran BLAS symbols.
# Comparisons are quoted so an empty/undefined provider cannot break if().
if("${CBLAS_PROVIDER}" STREQUAL "REFERENCE_CBLAS")
target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY})
elseif(NOT "${CBLAS_PROVIDER}" STREQUAL "MKLML")
target_link_libraries(cblas ${CBLAS_LIBRARIES})
endif()
......@@ -97,3 +97,8 @@ endif()
add_library(eigen3 INTERFACE)
add_dependencies(eigen3 extern_eigen3)
# The sw toolchain does not support the thread_local storage semantics,
# so instruct Eigen to avoid thread_local via EIGEN_AVOID_THREAD_LOCAL.
if(WITH_SW)
add_definitions(-DEIGEN_AVOID_THREAD_LOCAL)
endif()
......@@ -183,7 +183,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
if (NOT WITH_NV_JETSON AND NOT WITH_ARM)
if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
endif()
endif(NOT WIN32)
......
......@@ -32,7 +32,7 @@ class Tensor;
#include <libxsmm.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS)
#include <cblas.h>
#endif
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#if !defined(PADDLE_WITH_ARM)
#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW)
#include <immintrin.h>
#endif
#include <cfloat>
......@@ -74,7 +74,7 @@ void call_gemm_batched(const framework::ExecutionContext& ctx,
}
}
#if !defined(PADDLE_WITH_ARM)
#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW)
#define __m256x __m256
......@@ -114,7 +114,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) {
_mm256_add_px(_mm256_load_px(y + jjj),
_mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
}
#elif defined(PADDLE_WITH_ARM)
#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW)
PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported"));
#else
lll = len & ~SSE_CUT_LEN_MASK;
......@@ -143,7 +143,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) {
for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
_mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)));
}
#elif defined(PADDLE_WITH_ARM)
#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW)
PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported"));
#else
lll = len & ~SSE_CUT_LEN_MASK;
......
......@@ -42,6 +42,9 @@ void SetNumThreads(int num_threads) {
int real_num_threads = num_threads > 1 ? num_threads : 1;
platform::dynload::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
#elif defined(PADDLE_USE_REFERENCE_CBLAS)
// reference cblas does not support multi-threading
return;
#else
PADDLE_THROW(platform::errors::Unimplemented(
"This library (except OPENBLAS, MKLML) is not supported yet, so the"
......
......@@ -140,7 +140,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
if (cpu_isa == isa_any) {
return true;
} else {
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW)
int reg[4];
cpuid(reg, 0);
int nIds = reg[0];
......
......@@ -40,7 +40,8 @@ limitations under the License. */
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
......
......@@ -37,9 +37,16 @@ namespace paddle {
namespace platform {
namespace {
// Tracking the nested block stacks of each thread.
#ifdef PADDLE_WITH_SW
// sw does not support thread_local
std::deque<int> block_id_stack;
std::deque<Event *> annotation_stack;
#else
// Tracking the nested event stacks.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
thread_local std::deque<Event *> annotation_stack;
#endif
// stack to store events such as pe and so on
static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{};
......@@ -288,8 +295,13 @@ class DeviceTracerImpl : public DeviceTracer {
}
void AddAnnotation(uint32_t id, Event *event) {
#ifdef PADDLE_WITH_SW
std::forward_list<std::pair<uint32_t, Event *>> *local_correlations_pairs =
nullptr;
#else
thread_local std::forward_list<std::pair<uint32_t, Event *>>
*local_correlations_pairs = nullptr;
#endif
if (local_correlations_pairs == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_pairs.emplace_front();
......@@ -304,7 +316,11 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation.";
return;
}
#ifdef PADDLE_WITH_SW
std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
#else
thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
#endif
if (local_cpu_records_ == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.emplace_front();
......@@ -335,8 +351,12 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
return;
}
#ifdef PADDLE_WITH_SW
std::forward_list<MemInfoRecord> *local_mem_info_record = nullptr;
#else
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
nullptr;
#endif
if (local_mem_info_record == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
mem_info_record_.emplace_front();
......@@ -353,8 +373,12 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation.";
return;
}
#ifdef PADDLE_WITH_SW
std::forward_list<ActiveKindRecord> *local_active_kind_records = nullptr;
#else
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
#endif
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
......
......@@ -106,7 +106,7 @@ if(APPLE)
message(FATAL_ERROR "install_name_tool not found, please check.\n")
endif()
endif()
if(LINUX)
if(LINUX AND NOT WITH_SW)
find_program(PATCHELF_EXECUTABLE patchelf)
if(NOT PATCHELF_EXECUTABLE)
message(FATAL_ERROR "patchelf not found, please install it.\n"
......
......@@ -349,7 +349,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
# The dynamic library compiled under aarch64 is greater than 64M,
# and an oversize error will be reported when using patchelf.
if platform.machine() != 'aarch64':
# sw_64 does not support patchelf, so we just disable it there.
if platform.machine() != 'aarch64' and platform.machine() != 'sw_64':
if os.system(command) != 0:
raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册