diff --git a/CMakeLists.txt b/CMakeLists.txt index 861bb50a2de0249e4e5ac2e2fa1d7a8a7c61bca0..1a59db8c71bf3b1ea472c1ee56a1cd97de42dad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,19 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.0) - -project(paddle CXX C) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +include(system) + +if(ANDROID) + cmake_minimum_required(VERSION 3.7) +else() + cmake_minimum_required(VERSION 3.0) +endif() + +project(paddle CXX C) + find_package(Sphinx) -find_package(CUDA QUIET) +if(NOT CMAKE_CROSSCOMPILING) + find_package(CUDA QUIET) +endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -include(system) include(simd) ################################ Configurations ####################################### @@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") + endif() + + set(WITH_GPU OFF CACHE STRING + "Disable GPU when cross-compiling for Android" FORCE) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when cross-compiling for Android" FORCE) + set(WITH_PYTHON OFF CACHE STRING + "Disable PYTHON when cross-compiling for Android" FORCE) + set(WITH_RDMA OFF CACHE STRING + "Disable RDMA when cross-compiling for Android" FORCE) +endif(ANDROID) + set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") ######################################################################################## @@ -75,7 +97,6 @@ include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage - include(configure) # add paddle env configuration 
include_directories("${PROJ_ROOT}") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 235c95f017f2b6ef24195a0210ccafff36b6ed61..b8bf1bb07a1f779354b2c10071264bf41d279f6c 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF) set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS +find_path(MKL_INC_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS +find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS ${MKL_ROOT}/include) find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib @@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib/intel64) -if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) +if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR}) + set(CBLAS_INC_DIR ${MKL_INC_DIR}) set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) add_definitions(-DPADDLE_USE_MKL) message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(MKL_LAPACK_INC_DIR) #[[ bare name so if() tests the find_path result itself; an expanded path would be looked up as a variable name and always be false ]] + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") + endif() return() # return file. 
endif() @@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) +if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) add_definitions(-DPADDLE_USE_ATLAS) - message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(ATLAS_CLAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") + endif() return() endif() @@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(OPENBLAS_LAPACKE_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() return() endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0bb016201dd8ae912ac8ec9f925bc5277fad7aed..5e507e78f74eee885922f502f35e3c15fafb622d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -32,6 +32,14 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) +if(NOT CMAKE_CROSSCOMPILING) + if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) + elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) + endif() +endif() + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) 
add_definitions(-DHPPL_STUB_FUNC) @@ -48,21 +56,12 @@ else() message(FATAL_ERROR "Paddle need cudnn to compile") endif() - if(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") - else(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") - endif(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") -else(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}") -endif(WITH_AVX) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 5754407d66c18872bf0cf314ee6e0a32e0d4329d..af9be86961833dcd62371227165d411a3b61d79e 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,3 +1,7 @@ +if(NOT WITH_GPU) + return() +endif() + set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 8acb4065e35ab4981c389b2ccb22e9e2cddabf88..0afb3ab9af48046af01f03838eefa0bd2fcb2821 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -33,6 +33,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_TESTING=OFF diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 
cdef63188db001ffe9f06691cb3abffad9cdde5f..4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -35,6 +35,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DWITH_GFLAGS=ON diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 555469e5c11dd3ee7458888e72fd3a5fec08ec0e..49c7d71443cda700a14af6be65ff6658eec7229f 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,6 +43,8 @@ IF(WITH_TESTING) UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_GMOCK=ON diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index a11d18617bf3a81065dce9a99d948561c181b9d4..92ea23c7633e974fd09251f967965364b1928307 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -54,6 +54,8 @@ IF(NOT ${CBLAS_FOUND}) "you need to set gfortran compiler: cmake .. 
-DCMAKE_Fortran_COMPILER=...") ENDIF(NOT CMAKE_Fortran_COMPILER) + ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) + ExternalProject_Add( openblas ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 93d7275df05d723d7dd66ef0c5ac15672c051c34..9fd3afd0998b38c18b4490e6fb1c6fe0222ed142 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) -INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) -INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -IF(NOT WITH_PYTHON) +IF(WITH_PYTHON) + INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) + INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) +ELSE() SET(PYTHON_LIBRARIES "") ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index c1c7bd4a432a0fb36580276b7997b91e370a997e..293070c3cfcc1196001f64469f3254289b0de792 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -50,6 +50,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} CMAKE_ARGS -DWITH_GPU=${WITH_GPU} CMAKE_ARGS -DWITH_OMP=${USE_OMP} diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 519d9e0f21692fc2b69f8c57b87ffb1ef56b1239..45ca5542b7dc30216b45487782f849b93c5f8fca 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire IF(WIN32) SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) ELSE(WIN32) - set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) @@ -36,6 +36,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b76852fc6c50e80633c8294fb2724b83f15293a7..7eb92efcb00fa18461e61e0508b485c13ef23a1f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -2,6 +2,7 @@ include(CheckCXXCompilerFlag) include(CheckCCompilerFlag) include(CheckCXXSymbolExists) +include(CheckTypeSize) function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -LIST(APPEND CMAKE_CXX_FLAGS -std=c++11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # safe_set_flag # @@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS) endif() endif() +SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) +CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +if(SPINLOCK_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) +endif(SPINLOCK_FOUND) +if(BARRIER_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) +endif(BARRIER_FOUND) +SET(CMAKE_EXTRA_INCLUDE_FILES "") + # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. set(COMMON_FLAGS diff --git a/cmake/simd.cmake b/cmake/simd.cmake index d380c996dfa95f0caa2b9cd9daa0ac9141e51fe0..46035a908b588861607a25d3a21cf34b7b6fd4b8 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -2,6 +2,7 @@ # so that PaddlePaddle can unleash the vectorization power of muticore. 
INCLUDE(CheckCXXSourceRuns) +INCLUDE(CheckCXXSourceCompiles) IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") @@ -17,6 +18,8 @@ ELSEIF(MSVC) SET(AVX2_FLAG "/arch:AVX2") ENDIF() +set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) CHECK_CXX_SOURCE_RUNS(" @@ -73,4 +76,5 @@ int main() return 0; }" AVX2_FOUND) +set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake index 3e472da7e0bd9c433f92f3e8b52970cd2cc6dcba..3ca06665ab2385e34302a6bcce7ada549ea1e247 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +IF(DEFINED CMAKE_SYSTEM_NAME) + IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + SET(ANDROID TRUE) + ENDIF() +ENDIF() + # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index bacb64eb9ee65fffc824e4587a22fc432c092b19..099a85809d93e01772bf8c5c329fd9055ee4f054 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME) ${RDMA_LD_FLAGS} ${RDMA_LIBS}) + if(ANDROID) + target_link_libraries(${TARGET_NAME} log) + endif(ANDROID) + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh index f35bfbc5c8253d632f8089f5037421f527633aad..9c49a4bd2083794e98b099b25944bedec3d5a2ff 100644 --- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh @@ -17,7 +17,11 @@ limitations under the License. 
*/ #include #include "hl_base.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include "hl_neon_matrix_kernel.cuh" +#else #include "hl_sse_matrix_kernel.cuh" +#endif /** * @brief cpu element wise unary operator. diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh index db35ee2037433163ebb3673edb350e3fab71fba9..8b755c1095c2c4fdb7e74d8cddc948e6a6af380b 100644 --- a/paddle/cuda/include/hl_matrix_base.cuh +++ b/paddle/cuda/include/hl_matrix_base.cuh @@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff; typedef BaseOp SSEFirst; typedef BaseOp SSESecond; typedef BaseOp SSEClassificationError; +#elif defined(__ARM_NEON__) || defined(__ARM_NEON) /* same macro pair as the NEON check in hl_cpu_matrix_kernel.cuh, so kernel and base ops are always selected together */ +#include "hl_matrix_base_neon.cuh" #else #include "hl_matrix_base_sse.cuh" #endif diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e13019f5ee24ad600005c99678426ee3808b0e54 --- /dev/null +++ b/paddle/cuda/include/hl_matrix_base_neon.cuh @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_MATRIX_BASE_NEON_CUH_ +#define HL_MATRIX_BASE_NEON_CUH_ + +namespace aggregate { +class SSESum { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEMax { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmaxq_f32(a, b); + } +}; + +class SSEMin { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vminq_f32(a, b); + } +}; +} // namespace aggregate + +namespace base { +namespace unary { +class SSEIdentity { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a) const { + return a; + } +}; +} // namespace unary + +namespace binary { +class SSEAdd { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEAdd2 { +public: + static const bool sse = true; + const real p1; + const real p2; + float32x4_t mp1; + float32x4_t mp2; + +public: + SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) { + mp1 = vdupq_n_f32(p1); + mp2 = vdupq_n_f32(p2); + } + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp1, tmp2; + tmp1 = vmulq_f32(mp1, a); + tmp2 = vmulq_f32(mp2, b); + return vaddq_f32(tmp1, tmp2); + } +}; + +class SSESub { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vsubq_f32(a, b); + } +}; + +class SSEMul { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmulq_f32(a, b); + } +}; + +class SSEDiv { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vrecpeq_f32(b); + 
return vmulq_f32(a, tmp); + } +}; + +class SSESquaredDiff { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vsubq_f32(a, b); + return vmulq_f32(tmp, tmp); + } +}; + +class SSEFirst { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return a; + } +}; + +class SSESecond { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return b; + } +}; + +class SSEClassificationError { +public: + static const bool sse = true; + const real p; + float32x4_t mp; + uint32x4_t result; + +public: + explicit SSEClassificationError(const real s) : p(s) { + mp = vdupq_n_f32(p); + result = vdupq_n_u32(1); + } + // TODO: to be check + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + uint32x4_t tmp1 = vcgtq_f32(a, mp); + uint32x4_t tmp2 = vcgtq_f32(b, mp); + uint32x4_t tmp3 = veorq_u32(tmp1, tmp2); + return vcvtq_f32_u32(vandq_u32(tmp3, result)); + } +}; +} // namespace binary +} // namespace base + +#endif /* HL_MATRIX_BASE_NEON_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 59213eee75f50d3c054ed8684a9a0e1053342a0a..f965ba966793f6f6eea0ad3606f60553fe904dda 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -17,13 +17,20 @@ limitations under the License. 
*/ #include "hl_base.h" -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) #include #ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; #else typedef double2 vecType; #endif +#elif (defined __ARM_NEON) || (defined __ARM_NEON__) +#include +#ifndef PADDLE_TYPE_DOUBLE +typedef float32x4_t vecType; +#else +#error NEON instructions does not support double precision +#endif #else #include #include diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7b4e5b00079b66d0a46a1344a43f41962cf50f10 --- /dev/null +++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh @@ -0,0 +1,299 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_NEON_MATRIX_KERNEL_CUH_ +#define HL_NEON_MATRIX_KERNEL_CUH_ + +#include "hl_matrix_type.cuh" + +#define VECTOR_SIZE 16 + +/* number of float in vector */ +#define VECTOR_LEN 4 +#define VECTOR_SET vdupq_n_f32 + +inline bool hl_check_align(size_t size) { + return !(size & (VECTOR_SIZE - 1)); +} + +inline bool hl_check_align(void *ptr) { + return hl_check_align(reinterpret_cast(ptr)); +} + +template +inline real hl_agg_op(Agg agg, vecType mm) { /* reduce the four lanes of mm to one scalar with agg.vecOp */ + float32x4_t rev = vrev64q_f32(mm); /* swap the two lanes inside each 64-bit half */ + float32x4_t tmp1 = agg.vecOp(mm, rev); /* each lane now holds the aggregate of its own pair */ + float32x2_t hi = vget_high_f32(tmp1); + float32x2_t lo = vget_low_f32(tmp1); + float32x4_t tmp2 = vcombine_f32(hi, lo); /* tmp1 with its two halves exchanged */ + float32x4_t ret = agg.vecOp(tmp1, tmp2); /* each lane now holds the aggregate of all four inputs */ + + return vgetq_lane_f32(ret, 0); +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda) { + for (int i = 0; i < dimM; i++, A += lda) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) { + mm = agg.vecOp(mm, op.vecOp(*a)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) { + tmp = agg(tmp, op(a[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda, + real *B, int ldb) { + for (int i = 0; i < dimM; i++, A += lda, B += ldb) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + vecType *b = (vecType*)(B); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) { + mm = agg.vecOp(mm, op.vecOp(*a, *b)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) 
{ + tmp = agg(tmp, op(a[j], b[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +/* + * MaxRow greater than or equal dimN + * dimN is multiples of VECTOR_LEN + * so rem <= MaxRow / VECTOR_LEN + */ +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda); + } +} + +/* + * dimN is multiples of VECTOR_LEN + * dimN greater than Step + */ +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = 
VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem(agg, op, sv, dimM, remRow, dst, A, lda); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda); + } +} + +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + B += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for 
(int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem( + agg, op, sv, dimM, remRow, dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } +} + +#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */ diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 7617af10ba719490d1b33dd297b070cd8c7c292c..a0b1cd471dd02fd20bb2247395bdb74651610bbf 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf, config.biasSize = biasSize == 0 ? 
config.layerConfig.size() : biasSize; config.layerConfig.set_bias_size(config.biasSize); config.layerConfig.set_shared_biases(sharedBias); - config.inputDefs.push_back( - {inputType, "layer_0", conf.input_size(), parameterSize}); + config.inputDefs.push_back({inputType, + "layer_0", + static_cast(conf.input_size()), + parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; config.testState = testState; testLayerGrad(config, "mixed", batchSize, false, useGpu); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index d7aa1184872d5a6129becca1f6e282776c9dbe15..6203cd3b9ab9f95853cd3c46750fd55d6dfbba4a 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -85,11 +85,16 @@ int getrf(const CBLAS_ORDER order, float* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -99,11 +104,16 @@ int getrf(const CBLAS_ORDER order, double* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -112,11 +122,16 @@ int getri(const CBLAS_ORDER order, float* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetri(order, N, A, lda, ipiv); #else return LAPACKE_sgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -125,11 +140,16 @@ int getri(const CBLAS_ORDER order, double* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetri(order, N, A, lda, ipiv); #else return LAPACKE_dgetri(order, N, A, lda, ipiv); #endif +#else + 
LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index c8559eefd8378450fc18c2ba821c65b39c8cc046..9f8f84a87c5e60b2a6573844f251c42152d8156b 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,11 +17,14 @@ limitations under the License. */ #ifdef PADDLE_USE_MKL #include +#ifdef PADDLE_USE_LAPACK #include +#endif #else extern "C" { #include } +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS extern "C" { #include @@ -30,6 +33,7 @@ extern "C" { #include #endif #endif +#endif #include diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 5f30a15f2eb913d57d01479cf132e188b9e7c813..55a7344495f8e57dc95095ab1b81b45008fa9acc 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2426,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); -#ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -#else - cblas_dgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -// TODO(yuyang18): Is gemm defined other place? -#endif - - VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0] - << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; + gemm( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } void CpuMatrix::mul( diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 95219debf50e57407b668d315b91141d259fc779..d66d543a61450b47b7758b50eaecc107c6fe3576 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -13,119 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "SIMDFunctions.h" +#ifdef __SSE3__ #include +#endif #include -#ifndef __AVX__ -static void addto_sse(float* a, const float* b, size_t len) { - int offset = len % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - mb0 = _mm_load_ps(b); - mb1 = _mm_load_ps(b + 4); - mb2 = _mm_load_ps(b + 8); - mb3 = _mm_load_ps(b + 12); - - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) a[i] += b[i]; -} - -static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { - int offset = len % 16; - - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - for (int i = 0; i < batch; i++) { - mb0 = _mm_load_ps(b[i]); - mb1 = _mm_load_ps(b[i] + 4); - mb2 = _mm_load_ps(b[i] + 8); - mb3 = _mm_load_ps(b[i] + 12); - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - b[i] += 16; - } - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) { - for (int k = 0; k < batch; k++) a[i] += b[k][i]; - } - return; -} - -static void col_max_sse(float* result, - const float* data, - int dim, - int numSamples) { - // first sample, direct copy - for (int d = 0; d < dim; ++d) { - result[d] = data[d]; - } - int offset = dim % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - // first 16n dims - for (int k = 0; k < dim / 16; k++, 
result += 16, data += 16) { - ma0 = _mm_load_ps(result); - ma1 = _mm_load_ps(result + 4); - ma2 = _mm_load_ps(result + 8); - ma3 = _mm_load_ps(result + 12); - for (int i = 1; i < numSamples; i++) { - mb0 = _mm_load_ps(data + i * dim); - mb1 = _mm_load_ps(data + i * dim + 4); - mb2 = _mm_load_ps(data + i * dim + 8); - mb3 = _mm_load_ps(data + i * dim + 12); - ma0 = _mm_max_ps(ma0, mb0); - ma1 = _mm_max_ps(ma1, mb1); - ma2 = _mm_max_ps(ma2, mb2); - ma3 = _mm_max_ps(ma3, mb3); - } - _mm_store_ps(result, ma0); - _mm_store_ps(result + 4, ma1); - _mm_store_ps(result + 8, ma2); - _mm_store_ps(result + 12, ma3); - } - // last dims - for (int d = 0; d < offset; ++d) { - float sm = data[d]; - for (int i = 1; i < numSamples; ++i) { - sm = std::max(sm, data[i * dim + d]); - } - result[d] = sm; - } -} - -#else +#ifdef __AVX__ static void addto_avx(float* a, const float* b, size_t len) { int offset = len % 32; @@ -355,17 +248,128 @@ static void decayL1_avx( } } +#elif defined(__SSE3__) + +static void addto_sse(float* a, const float* b, size_t len) { + int offset = len % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + mb0 = _mm_load_ps(b); + mb1 = _mm_load_ps(b + 4); + mb2 = _mm_load_ps(b + 8); + mb3 = _mm_load_ps(b + 12); + + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) a[i] += b[i]; +} + +static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { + int offset = len % 16; + + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 
= _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + for (int i = 0; i < batch; i++) { + mb0 = _mm_load_ps(b[i]); + mb1 = _mm_load_ps(b[i] + 4); + mb2 = _mm_load_ps(b[i] + 8); + mb3 = _mm_load_ps(b[i] + 12); + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + b[i] += 16; + } + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + for (int k = 0; k < batch; k++) a[i] += b[k][i]; + } + return; +} + +static void col_max_sse(float* result, + const float* data, + int dim, + int numSamples) { + // first sample, direct copy + for (int d = 0; d < dim; ++d) { + result[d] = data[d]; + } + int offset = dim % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + // first 16n dims + for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { + ma0 = _mm_load_ps(result); + ma1 = _mm_load_ps(result + 4); + ma2 = _mm_load_ps(result + 8); + ma3 = _mm_load_ps(result + 12); + for (int i = 1; i < numSamples; i++) { + mb0 = _mm_load_ps(data + i * dim); + mb1 = _mm_load_ps(data + i * dim + 4); + mb2 = _mm_load_ps(data + i * dim + 8); + mb3 = _mm_load_ps(data + i * dim + 12); + ma0 = _mm_max_ps(ma0, mb0); + ma1 = _mm_max_ps(ma1, mb1); + ma2 = _mm_max_ps(ma2, mb2); + ma3 = _mm_max_ps(ma3, mb3); + } + _mm_store_ps(result, ma0); + _mm_store_ps(result + 4, ma1); + _mm_store_ps(result + 8, ma2); + _mm_store_ps(result + 12, ma3); + } + // last dims + for (int d = 0; d < offset; ++d) { + float sm = data[d]; + for (int i = 1; i < numSamples; ++i) { + sm = std::max(sm, data[i * dim + d]); + } + result[d] = sm; + } +} + #endif -#ifndef __AVX__ -#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) -#else +#if defined(__AVX__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) +#elif defined(__SSE3__) +#define SIMD_INVOKE(func, ...) 
func##_sse(__VA_ARGS__) #endif namespace paddle { namespace simd { namespace internal { +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len) { SIMD_INVOKE(addto, a, b, len); } @@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) { void colMaxImpl(float* result, const float* data, int dim, int numSamples) { SIMD_INVOKE(col_max, result, data, dim, numSamples); } +#endif #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { @@ -385,8 +390,8 @@ void decayL1AvxImpl( float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } - #endif + } // namespace internal } // namespace simd } // namespace paddle diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 9b0a8719b287a2b88e966484090974586d64521f..439f11b79d134d7054f45f2d0a70fc5a6fde6c13 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -128,17 +128,29 @@ void decayL1AvxImpl( template <> inline void addTo(float* a, const float* b, size_t len) { +#ifdef __SSE3__ internal::addToImpl(a, b, len); +#else + naive::addTo(a, b, len); +#endif } template <> inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { +#ifdef __SSE3__ internal::batchAddToImpl(a, b, batch, len); +#else + naive::batchAddTo(a, b, batch, len); +#endif } template <> inline void colMax(float* result, const float* data, int dim, int numSamples) { +#ifdef __SSE3__ internal::colMaxImpl(result, data, dim, numSamples); +#else + naive::colMax(result, data, dim, numSamples); +#endif } template <> diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 56e5442394b04230c22d668aa734dc0fa44004c2..7ce17a3207becb176a852a16fca52376009db9ee 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "Storage.h" #include "Allocator.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(pool_limit_size, @@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); + "gpu" + str::to_string(deviceId) + std::string("_pool"); gpuAllocator_[deviceId] = new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 877cbb86ec112739a5c7eeee969ca48ef491ee87..19ff40ba7e9584f772043f939bcb31caf666163d 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" +#include "paddle/utils/StringUtil.h" DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); DEFINE_double(async_lagged_ratio_min, @@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, callback(response); /// always defined, barrier slowest node function need it. - statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_))); + statSet_.reset(new StatSet("ParameterServer" + + str::to_string(static_cast(serverId_)))); } real bufferSum(const std::vector& buffers) { diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index 8eefdd2980e7f56a836df6fd2ff8c31b81a55555..edd33c454122d95078e0fde2a2e9d68903951ee8 100644 --- a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,7 +19,7 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#else +#elif !defined(__ANDROID__) #include @@ -31,6 +31,7 @@ limitations under the License. 
*/ namespace paddle { SIMDFlags::SIMDFlags() { +#if !defined(__ANDROID__) unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; // clang-fotmat on +#else + simd_flags_ = SIMD_NEON; +#endif } SIMDFlags const* SIMDFlags::instance() { diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 5fc610964d4f5b8064f16ebf1b26bbb002264ce1..869be5be541dafd699a87a8e8893aadadf59b711 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -30,6 +30,7 @@ enum simd_t { SIMD_AVX = 1 << 8, ///< AVX SIMD_AVX2 = 1 << 9, ///< AVX 2 SIMD_AVX512 = 1 << 10, ///< AVX 512 + SIMD_NEON = 1 << 11, /// NEON }; // clang-format on @@ -96,6 +97,7 @@ private: #define HAS_AVX HAS_SIMD(SIMD_AVX) #define HAS_AVX2 HAS_SIMD(SIMD_AVX2) #define HAS_AVX512 HAS_SIMD(SIMD_AVX512) +#define HAS_NEON HAS_SIMD(SIMD_NEON) // clang-format on /** diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2..ea96bad240ad81c4c29b7dab35b015549052e2bb 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -18,6 +18,7 @@ limitations under the License. */ */ #include "Logging.h" +#include namespace paddle { diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 0b4f4c9113ae9d714b634b67931e51b408bbe777..95f071cb7de87d87f6988c136d7993c66fa9dde1 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) { return v; } +/** + * Cast type T to string with status. + * + * @param [in] v input value of type T. + * @param [out] ok status, return true if there is no error in casting. Set + * nullptr if user don't care error at all. + * @return result of casting. If error occurred, a empty string will be + * returned. 
+ */ +template +inline std::string toWithStatus(const T v, bool* ok = nullptr) { + std::ostringstream sout; + sout << v; + if (ok) { + *ok = !sout.fail(); + } + return sout.str(); +} + /// Convert string to type T. It makes sure all the characters in s are used. /// Otherwise it will abort. /// @@ -67,6 +86,18 @@ inline T to(const std::string& s) { return v; } +/// Convert type T to string. +/// +/// @tparam T type of input value +/// @param v input value of type T +template +std::string to_string(T v) { + bool ok; + std::string s = toWithStatus(v, &ok); + CHECK(ok) << "Cannot convert v(" << v << ") to type std::string"; + return s; +} + } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1f56b6b8a96602d298507452fc7182d46179de41..b18b73e06a6c39c3bf9717280bc6323917c80efb 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -15,11 +15,16 @@ limitations under the License. */ #include "Util.h" #include -#include #include #include #include + +#ifdef __SSE__ #include +#endif +#ifdef __SSE3__ +#include +#endif #include #include @@ -163,8 +168,12 @@ void initMain(int argc, char** argv) { installProfilerSwitch(); +#ifdef __SSE__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +#endif +#ifdef __SSE3__ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif if (FLAGS_seed == 0) { unsigned int t = time(NULL); diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 2a6f96e04d024ac3977bc154dbeeb69ce9ab3a5d..310c9a6542563891d4ba5888e58406ea28d6a2ce 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/utils/Locks.h" #include #include +#include "paddle/utils/Logging.h" namespace paddle { class SemaphorePrivate { @@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { sem_init(&m->sem, 0, initValue); } -Semaphore::~Semaphore() { sem_destroy(&m->sem); } +Semaphore::~Semaphore() { + sem_destroy(&m->sem); + delete m; +} bool Semaphore::timeWait(struct timespec* ts) { return (0 == sem_timedwait(&m->sem, ts)); @@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); } void Semaphore::post() { sem_post(&m->sem); } +#ifdef PADDLE_USE_PTHREAD_SPINLOCK + class SpinLockPrivate { public: inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + + inline void lock() { pthread_spin_lock(&lock_); } + inline void unlock() { pthread_spin_unlock(&lock_); } + pthread_spinlock_t lock_; char padding_[64 - sizeof(pthread_spinlock_t)]; }; -SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +#else -SpinLock::~SpinLock() { delete m; } +#include +class SpinLockPrivate { +public: + inline void lock() { + while (lock_.test_and_set(std::memory_order_acquire)) { + } + } + inline void unlock() { lock_.clear(std::memory_order_release); } + + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; + char padding_[64 - sizeof(lock_)]; // Padding to cache line size +}; -void SpinLock::lock() { pthread_spin_lock(&m->lock_); } +#endif -void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } +void SpinLock::lock() { m->lock(); } +void SpinLock::unlock() { m->unlock(); } + +#ifdef PADDLE_USE_PTHREAD_BARRIER class ThreadBarrierPrivate { public: pthread_barrier_t barrier_; + + inline explicit ThreadBarrierPrivate(int count) { + pthread_barrier_init(&barrier_, nullptr, count); + } + + inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); } + + inline void wait() { 
pthread_barrier_wait(&barrier_); } }; -ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { - pthread_barrier_init(&m->barrier_, nullptr, count); -} +#else -ThreadBarrier::~ThreadBarrier() { - pthread_barrier_destroy(&m->barrier_); - delete m; -} +class ThreadBarrierPrivate { +public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +#endif -void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } } // namespace paddle diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 378788bcecd579fff1c762702a8c27f54cee94bf..b5d9f93f1376048eabd726331006b0bb848bce11 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/Locks.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(test_thread_num, 10, "testing thread number"); @@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { - tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); + tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } finish.wait(); } @@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. finish.wait(); diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index 611b16aa7116d03ee51ba0095d043b78df1742ba..360c61c88a757da708b01d2bb54068b948b235cc 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/utils/CustomStackTrace.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" int main(int argc, char** argv) { paddle::initMain(argc, argv); for (size_t i = 0; i < 1000; ++i) { - paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); + paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); if (i == 998) { throw "Unhandle exception"; } diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp index 8200a24ce7b7df75b48a89fbb7af15f304c5957f..185789c927be19385d6ddc7a1889b6cc56109d38 100644 --- a/paddle/utils/tests/test_SIMDFlags.cpp +++ b/paddle/utils/tests/test_SIMDFlags.cpp @@ -18,7 +18,8 @@ limitations under the License. */ using namespace paddle; // NOLINT TEST(SIMDFlags, gccTest) { -#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) +#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ + !defined(__arm__) // clang-format off CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); @@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) { LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; + LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; }