Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
ba68ce1a
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ba68ce1a
编写于
4月 11, 2017
作者:
L
Luo Tao
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into stride
上级
c1738e29
3133c09f
变更
41
隐藏空白更改
内联
并排
Showing
41 changed file
with
920 addition
and
260 deletion
+920
-260
CMakeLists.txt
CMakeLists.txt
+28
-7
cmake/cblas.cmake
cmake/cblas.cmake
+20
-8
cmake/configure.cmake
cmake/configure.cmake
+11
-12
cmake/cudnn.cmake
cmake/cudnn.cmake
+4
-0
cmake/external/gflags.cmake
cmake/external/gflags.cmake
+2
-0
cmake/external/glog.cmake
cmake/external/glog.cmake
+2
-0
cmake/external/gtest.cmake
cmake/external/gtest.cmake
+2
-0
cmake/external/openblas.cmake
cmake/external/openblas.cmake
+2
-0
cmake/external/python.cmake
cmake/external/python.cmake
+4
-4
cmake/external/warpctc.cmake
cmake/external/warpctc.cmake
+2
-0
cmake/external/zlib.cmake
cmake/external/zlib.cmake
+3
-1
cmake/flags.cmake
cmake/flags.cmake
+13
-1
cmake/simd.cmake
cmake/simd.cmake
+4
-0
cmake/system.cmake
cmake/system.cmake
+6
-0
cmake/util.cmake
cmake/util.cmake
+4
-0
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+4
-0
paddle/cuda/include/hl_matrix_base.cuh
paddle/cuda/include/hl_matrix_base.cuh
+2
-0
paddle/cuda/include/hl_matrix_base_neon.cuh
paddle/cuda/include/hl_matrix_base_neon.cuh
+161
-0
paddle/cuda/include/hl_matrix_type.cuh
paddle/cuda/include/hl_matrix_type.cuh
+8
-1
paddle/cuda/include/hl_neon_matrix_kernel.cuh
paddle/cuda/include/hl_neon_matrix_kernel.cuh
+299
-0
paddle/function/Function.h
paddle/function/Function.h
+1
-1
paddle/function/PadOpGpu.cu
paddle/function/PadOpGpu.cu
+6
-6
paddle/function/PadOpTest.cpp
paddle/function/PadOpTest.cpp
+16
-42
paddle/gserver/tests/LayerGradUtil.cpp
paddle/gserver/tests/LayerGradUtil.cpp
+4
-2
paddle/math/MathFunctions.cpp
paddle/math/MathFunctions.cpp
+20
-0
paddle/math/MathFunctions.h
paddle/math/MathFunctions.h
+4
-0
paddle/math/Matrix.cpp
paddle/math/Matrix.cpp
+2
-35
paddle/math/SIMDFunctions.cpp
paddle/math/SIMDFunctions.cpp
+119
-114
paddle/math/SIMDFunctions.h
paddle/math/SIMDFunctions.h
+12
-0
paddle/math/Storage.cpp
paddle/math/Storage.cpp
+2
-1
paddle/pserver/ParameterServer2.cpp
paddle/pserver/ParameterServer2.cpp
+3
-1
paddle/utils/CpuId.cpp
paddle/utils/CpuId.cpp
+5
-1
paddle/utils/CpuId.h
paddle/utils/CpuId.h
+2
-0
paddle/utils/Logging.cpp
paddle/utils/Logging.cpp
+1
-0
paddle/utils/StringUtil.h
paddle/utils/StringUtil.h
+31
-0
paddle/utils/Util.cpp
paddle/utils/Util.cpp
+10
-1
paddle/utils/arch/linux/Locks.cpp
paddle/utils/arch/linux/Locks.cpp
+82
-13
paddle/utils/tests/test_CustomStackTrace.cpp
paddle/utils/tests/test_CustomStackTrace.cpp
+4
-3
paddle/utils/tests/test_CustomStackTracePrint.cpp
paddle/utils/tests/test_CustomStackTracePrint.cpp
+2
-1
paddle/utils/tests/test_SIMDFlags.cpp
paddle/utils/tests/test_SIMDFlags.cpp
+3
-1
python/paddle/trainer_config_helpers/layers.py
python/paddle/trainer_config_helpers/layers.py
+10
-4
未找到文件。
CMakeLists.txt
浏览文件 @
ba68ce1a
...
...
@@ -12,19 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License
cmake_minimum_required
(
VERSION 3.0
)
project
(
paddle CXX C
)
set
(
CMAKE_MODULE_PATH
${
CMAKE_MODULE_PATH
}
"
${
CMAKE_SOURCE_DIR
}
/cmake"
)
set
(
PROJ_ROOT
${
CMAKE_SOURCE_DIR
}
)
include
(
system
)
if
(
ANDROID
)
cmake_minimum_required
(
VERSION 3.7
)
else
()
cmake_minimum_required
(
VERSION 3.0
)
endif
()
project
(
paddle CXX C
)
find_package
(
Sphinx
)
find_package
(
CUDA QUIET
)
if
(
NOT CMAKE_CROSSCOMPILING
)
find_package
(
CUDA QUIET
)
endif
(
NOT CMAKE_CROSSCOMPILING
)
find_package
(
Git REQUIRED
)
find_package
(
Threads REQUIRED
)
include
(
system
)
include
(
simd
)
################################ Configurations #######################################
...
...
@@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE
)
endif
()
if
(
ANDROID
)
if
(
${
CMAKE_SYSTEM_VERSION
}
VERSION_LESS
"21"
)
message
(
FATAL_ERROR
"Unsupport standalone toolchains with Android API level lower than 21"
)
endif
()
set
(
WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android"
FORCE
)
set
(
WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android"
FORCE
)
set
(
WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android"
FORCE
)
set
(
WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android"
FORCE
)
endif
(
ANDROID
)
set
(
THIRD_PARTY_PATH
"
${
PROJ_ROOT
}
/third_party"
CACHE STRING
"A path setting third party libraries download & build directories."
)
########################################################################################
...
...
@@ -75,7 +97,6 @@ include(flags) # set paddle compile flags
include
(
cudnn
)
# set cudnn libraries
include
(
version
)
# set PADDLE_VERSION
include
(
coveralls
)
# set code coverage
include
(
configure
)
# add paddle env configuration
include_directories
(
"
${
PROJ_ROOT
}
"
)
...
...
cmake/cblas.cmake
浏览文件 @
ba68ce1a
...
...
@@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF)
set
(
INTEL_ROOT
"/opt/intel"
CACHE PATH
"Folder contains intel libs"
)
set
(
MKL_ROOT
${
INTEL_ROOT
}
/mkl CACHE PATH
"Folder contains MKL"
)
find_path
(
MKL_INC
LUDE
_DIR mkl.h PATHS
find_path
(
MKL_INC_DIR mkl.h PATHS
${
MKL_ROOT
}
/include
)
find_path
(
MKL_
INCLUDE
_DIR mkl_lapacke.h PATHS
find_path
(
MKL_
LAPACK_INC
_DIR mkl_lapacke.h PATHS
${
MKL_ROOT
}
/include
)
find_library
(
MKL_CORE_LIB NAMES mkl_core PATHS
${
MKL_ROOT
}
/lib
...
...
@@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${
MKL_ROOT
}
/lib/intel64
)
if
(
MKL_INC
LUDE
_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64
)
if
(
MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64
)
set
(
CBLAS_PROVIDER MKL
)
set
(
CBLAS_INC_DIR
${
MKL_INC
LUDE
_DIR
}
)
set
(
CBLAS_INC_DIR
${
MKL_INC_DIR
}
)
set
(
CBLAS_LIBRARIES
${
MKL_INTEL_LP64
}
${
MKL_SEQUENTIAL_LIB
}
${
MKL_CORE_LIB
}
)
add_definitions
(
-DPADDLE_USE_MKL
)
message
(
STATUS
"Found MKL (include:
${
CBLAS_INC_DIR
}
, library:
${
CBLAS_LIBRARIES
}
)"
)
set
(
CBLAS_FOUND ON
)
if
(
${
MKL_LAPACK_INC_DIR
}
)
add_definitions
(
-DPADDLE_USE_LAPACK
)
message
(
STATUS
"Found lapack in MKL (include:
${
MKL_LAPACK_INC_DIR
}
)"
)
endif
()
return
()
# return file.
endif
()
...
...
@@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
find_library
(
ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS
${
ATLAS_LIB_SEARCH_PATHS
}
)
if
(
ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB
)
if
(
ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB
AND NOT CBLAS_FOUND
)
set
(
CBLAS_PROVIDER ATLAS
)
set
(
CBLAS_INC_DIR
${
ATLAS_INC_DIR
}
${
ATLAS_CLAPACK_INC_DIR
}
)
set
(
CBLAS_INC_DIR
${
ATLAS_INC_DIR
}
)
set
(
CBLAS_LIBRARIES
${
ATLAS_LIB
}
${
ATLAS_CBLAS_LIB
}
)
add_definitions
(
-DPADDLE_USE_ATLAS
)
message
(
STATUS
"Found A
tlas
(include:
${
CBLAS_INC_DIR
}
, library:
${
CBLAS_LIBRARIES
}
)"
)
message
(
STATUS
"Found A
TLAS
(include:
${
CBLAS_INC_DIR
}
, library:
${
CBLAS_LIBRARIES
}
)"
)
set
(
CBLAS_FOUND ON
)
if
(
ATLAS_CLAPACK_INC_DIR
)
add_definitions
(
-DPADDLE_USE_LAPACK
)
message
(
STATUS
"Found lapack in ATLAS (include:
${
ATLAS_CLAPACK_INC_DIR
}
)"
)
endif
()
return
()
endif
()
...
...
@@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set
(
CBLAS_PROVIDER OPENBLAS
)
set
(
CBLAS_INC_DIR
${
OPENBLAS_INC_DIR
}
)
set
(
CBLAS_LIBRARIES
${
OPENBLAS_LIB
}
)
message
(
STATUS
"Found OpenB
las
(include:
${
CBLAS_INC_DIR
}
, library:
${
CBLAS_LIBRARIES
}
)"
)
message
(
STATUS
"Found OpenB
LAS
(include:
${
CBLAS_INC_DIR
}
, library:
${
CBLAS_LIBRARIES
}
)"
)
set
(
CBLAS_FOUND ON
)
if
(
OPENBLAS_LAPACKE_INC_DIR
)
add_definitions
(
-DPADDLE_USE_LAPACK
)
message
(
STATUS
"Found lapack in OpenBLAS (include:
${
OPENBLAS_LAPACKE_INC_DIR
}
)"
)
endif
()
return
()
endif
()
...
...
cmake/configure.cmake
浏览文件 @
ba68ce1a
...
...
@@ -32,6 +32,14 @@ if(NOT WITH_PROFILER)
add_definitions
(
-DPADDLE_DISABLE_PROFILER
)
endif
(
NOT WITH_PROFILER
)
if
(
NOT CMAKE_CROSSCOMPILING
)
if
(
WITH_AVX AND AVX_FOUND
)
set
(
SIMD_FLAG
${
AVX_FLAG
}
)
elseif
(
SSE3_FOUND
)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
()
endif
()
if
(
NOT WITH_GPU
)
add_definitions
(
-DPADDLE_ONLY_CPU
)
add_definitions
(
-DHPPL_STUB_FUNC
)
...
...
@@ -48,21 +56,12 @@ else()
message
(
FATAL_ERROR
"Paddle need cudnn to compile"
)
endif
()
if
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
AVX_FLAG
}
"
)
else
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SSE3_FLAG
}
"
)
endif
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SIMD_FLAG
}
"
)
# Include cuda and cudnn
include_directories
(
${
CUDNN_INCLUDE_DIR
}
)
include_directories
(
${
CUDA_TOOLKIT_INCLUDE
}
)
endif
(
NOT WITH_GPU
)
if
(
WITH_AVX
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
${
AVX_FLAG
}
"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
${
AVX_FLAG
}
"
)
else
(
WITH_AVX
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
${
SSE3_FLAG
}
"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
${
SSE3_FLAG
}
"
)
endif
(
WITH_AVX
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
${
SIMD_FLAG
}
"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
${
SIMD_FLAG
}
"
)
cmake/cudnn.cmake
浏览文件 @
ba68ce1a
if
(
NOT WITH_GPU
)
return
()
endif
()
set
(
CUDNN_ROOT
""
CACHE PATH
"CUDNN ROOT"
)
find_path
(
CUDNN_INCLUDE_DIR cudnn.h
PATHS
${
CUDNN_ROOT
}
${
CUDNN_ROOT
}
/include
...
...
cmake/external/gflags.cmake
浏览文件 @
ba68ce1a
...
...
@@ -33,6 +33,8 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
CMAKE_ARGS -DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
GFLAGS_INSTALL_DIR
}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_TESTING=OFF
...
...
cmake/external/glog.cmake
浏览文件 @
ba68ce1a
...
...
@@ -35,6 +35,8 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
CMAKE_ARGS -DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
GLOG_INSTALL_DIR
}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
...
...
cmake/external/gtest.cmake
浏览文件 @
ba68ce1a
...
...
@@ -43,6 +43,8 @@ IF(WITH_TESTING)
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
CMAKE_ARGS -DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
GTEST_INSTALL_DIR
}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_GMOCK=ON
...
...
cmake/external/openblas.cmake
浏览文件 @
ba68ce1a
...
...
@@ -54,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
"you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=..."
)
ENDIF
(
NOT CMAKE_Fortran_COMPILER
)
ADD_DEFINITIONS
(
-DPADDLE_USE_LAPACK
)
ExternalProject_Add
(
openblas
${
EXTERNAL_PROJECT_LOG_ARGS
}
...
...
cmake/external/python.cmake
浏览文件 @
ba68ce1a
...
...
@@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
ENDIF
(
PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND
)
I
NCLUDE_DIRECTORIES
(
${
PYTHON_INCLUDE_DIR
}
)
INCLUDE_DIRECTORIES
(
${
PYTHON_NUMPY
_INCLUDE_DIR
}
)
IF
(
NOT WITH_PYTHON
)
I
F
(
WITH_PYTHON
)
INCLUDE_DIRECTORIES
(
${
PYTHON
_INCLUDE_DIR
}
)
INCLUDE_DIRECTORIES
(
${
PYTHON_NUMPY_INCLUDE_DIR
}
)
ELSE
(
)
SET
(
PYTHON_LIBRARIES
""
)
ENDIF
()
cmake/external/warpctc.cmake
浏览文件 @
ba68ce1a
...
...
@@ -50,6 +50,8 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
CMAKE_ARGS -DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
WARPCTC_INSTALL_DIR
}
CMAKE_ARGS -DWITH_GPU=
${
WITH_GPU
}
CMAKE_ARGS -DWITH_OMP=
${
USE_OMP
}
...
...
cmake/external/zlib.cmake
浏览文件 @
ba68ce1a
...
...
@@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire
IF
(
WIN32
)
SET
(
ZLIB_LIBRARIES
"
${
ZLIB_INSTALL_DIR
}
/lib/zlibstatic.lib"
CACHE FILEPATH
"zlib library."
FORCE
)
ELSE
(
WIN32
)
set
(
ZLIB_LIBRARIES
"
${
ZLIB_INSTALL_DIR
}
/lib/libz.a"
CACHE FILEPATH
"zlib library."
FORCE
)
SET
(
ZLIB_LIBRARIES
"
${
ZLIB_INSTALL_DIR
}
/lib/libz.a"
CACHE FILEPATH
"zlib library."
FORCE
)
ENDIF
(
WIN32
)
INCLUDE_DIRECTORIES
(
${
ZLIB_INCLUDE_DIR
}
)
...
...
@@ -36,6 +36,8 @@ ExternalProject_Add(
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
CMAKE_ARGS -DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
CMAKE_ARGS -DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=
${
ZLIB_INSTALL_DIR
}
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
...
...
cmake/flags.cmake
浏览文件 @
ba68ce1a
...
...
@@ -2,6 +2,7 @@
include
(
CheckCXXCompilerFlag
)
include
(
CheckCCompilerFlag
)
include
(
CheckCXXSymbolExists
)
include
(
CheckTypeSize
)
function
(
CheckCompilerCXX11Flag
)
if
(
CMAKE_CXX_COMPILER_ID STREQUAL
"GNU"
)
...
...
@@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag)
endfunction
()
CheckCompilerCXX11Flag
()
LIST
(
APPEND CMAKE_CXX_FLAGS -std=c++11
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-std=c++11"
)
# safe_set_flag
#
...
...
@@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
endif
()
endif
()
SET
(
CMAKE_EXTRA_INCLUDE_FILES
"pthread.h"
)
CHECK_TYPE_SIZE
(
pthread_spinlock_t SPINLOCK_FOUND
)
CHECK_TYPE_SIZE
(
pthread_barrier_t BARRIER_FOUND
)
if
(
SPINLOCK_FOUND
)
add_definitions
(
-DPADDLE_USE_PTHREAD_SPINLOCK
)
endif
(
SPINLOCK_FOUND
)
if
(
BARRIER_FOUND
)
add_definitions
(
-DPADDLE_USE_PTHREAD_BARRIER
)
endif
(
BARRIER_FOUND
)
SET
(
CMAKE_EXTRA_INCLUDE_FILES
""
)
# Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc.
set
(
COMMON_FLAGS
...
...
cmake/simd.cmake
浏览文件 @
ba68ce1a
...
...
@@ -2,6 +2,7 @@
# so that PaddlePaddle can unleash the vectorization power of muticore.
INCLUDE
(
CheckCXXSourceRuns
)
INCLUDE
(
CheckCXXSourceCompiles
)
IF
(
CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES
"Clang"
)
set
(
MMX_FLAG
"-mmmx"
)
...
...
@@ -17,6 +18,8 @@ ELSEIF(MSVC)
SET
(
AVX2_FLAG
"/arch:AVX2"
)
ENDIF
()
set
(
CMAKE_REQUIRED_FLAGS_RETAINED
${
CMAKE_REQUIRED_FLAGS
}
)
# Check MMX
set
(
CMAKE_REQUIRED_FLAGS
${
MMX_FLAG
}
)
CHECK_CXX_SOURCE_RUNS
(
"
...
...
@@ -73,4 +76,5 @@ int main()
return 0;
}"
AVX2_FOUND
)
set
(
CMAKE_REQUIRED_FLAGS
${
CMAKE_REQUIRED_FLAGS_RETAINED
}
)
mark_as_advanced
(
MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND
)
cmake/system.cmake
浏览文件 @
ba68ce1a
...
...
@@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE
(
STATUS
"Found Paddle host system:
${
HOST_SYSTEM
}
"
)
MESSAGE
(
STATUS
"Found Paddle host system's CPU:
${
CPU_CORES
}
cores"
)
IF
(
DEFINED CMAKE_SYSTEM_NAME
)
IF
(
${
CMAKE_SYSTEM_NAME
}
STREQUAL
"Android"
)
SET
(
ANDROID TRUE
)
ENDIF
()
ENDIF
()
# external dependencies log output
SET
(
EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0
# Wrap download in script to log output
...
...
cmake/util.cmake
浏览文件 @
ba68ce1a
...
...
@@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME)
${
RDMA_LD_FLAGS
}
${
RDMA_LIBS
}
)
if
(
ANDROID
)
target_link_libraries
(
${
TARGET_NAME
}
log
)
endif
(
ANDROID
)
add_dependencies
(
${
TARGET_NAME
}
${
external_project_dependencies
}
)
endfunction
()
...
...
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
浏览文件 @
ba68ce1a
...
...
@@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#endif
/**
* @brief cpu element wise unary operator.
...
...
paddle/cuda/include/hl_matrix_base.cuh
浏览文件 @
ba68ce1a
...
...
@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef
BaseOp
SSEFirst
;
typedef
BaseOp
SSESecond
;
typedef
BaseOp
SSEClassificationError
;
#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_sse.cuh"
#endif
...
...
paddle/cuda/include/hl_matrix_base_neon.cuh
0 → 100644
浏览文件 @
ba68ce1a
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace
aggregate
{
class
SSESum
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
}
};
class
SSEMax
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmaxq_f32
(
a
,
b
);
}
};
class
SSEMin
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vminq_f32
(
a
,
b
);
}
};
}
// namespace aggregate
namespace
base
{
namespace
unary
{
class
SSEIdentity
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
)
const
{
return
a
;
}
};
}
// namespace unary
namespace
binary
{
class
SSEAdd
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
}
};
class
SSEAdd2
{
public:
static
const
bool
sse
=
true
;
const
real
p1
;
const
real
p2
;
float32x4_t
mp1
;
float32x4_t
mp2
;
public:
SSEAdd2
(
const
real
s1
,
const
real
s2
)
:
p1
(
s1
),
p2
(
s2
)
{
mp1
=
vdupq_n_f32
(
p1
);
mp2
=
vdupq_n_f32
(
p2
);
}
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp1
,
tmp2
;
tmp1
=
vmulq_f32
(
mp1
,
a
);
tmp2
=
vmulq_f32
(
mp2
,
b
);
return
vaddq_f32
(
tmp1
,
tmp2
);
}
};
class
SSESub
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vsubq_f32
(
a
,
b
);
}
};
class
SSEMul
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmulq_f32
(
a
,
b
);
}
};
class
SSEDiv
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vrecpeq_f32
(
b
);
return
vmulq_f32
(
a
,
tmp
);
}
};
class
SSESquaredDiff
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vsubq_f32
(
a
,
b
);
return
vmulq_f32
(
tmp
,
tmp
);
}
};
class
SSEFirst
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
a
;
}
};
class
SSESecond
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
b
;
}
};
class
SSEClassificationError
{
public:
static
const
bool
sse
=
true
;
const
real
p
;
float32x4_t
mp
;
uint32x4_t
result
;
public:
explicit
SSEClassificationError
(
const
real
s
)
:
p
(
s
)
{
mp
=
vdupq_n_f32
(
p
);
result
=
vdupq_n_u32
(
1
);
}
// TODO: to be check
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
uint32x4_t
tmp1
=
vcgtq_f32
(
a
,
mp
);
uint32x4_t
tmp2
=
vcgtq_f32
(
b
,
mp
);
uint32x4_t
tmp3
=
veorq_u32
(
tmp1
,
tmp2
);
return
vcvtq_f32_u32
(
vandq_u32
(
tmp3
,
result
));
}
};
}
// namespace binary
}
// namespace base
#endif
/* HL_MATRIX_BASE_NEON_CUH_ */
paddle/cuda/include/hl_matrix_type.cuh
浏览文件 @
ba68ce1a
...
...
@@ -17,13 +17,20 @@ limitations under the License. */
#include "hl_base.h"
#if
def __CUDA_ARCH__
#if
defined(__CUDA_ARCH__)
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float4
vecType
;
#else
typedef
double2
vecType
;
#endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float32x4_t
vecType
;
#else
#error NEON instructions does not support double precision
#endif
#else
#include <mmintrin.h>
#include <xmmintrin.h>
...
...
paddle/cuda/include/hl_neon_matrix_kernel.cuh
0 → 100644
浏览文件 @
ba68ce1a
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
inline
bool
hl_check_align
(
size_t
size
)
{
return
!
(
size
&
(
VECTOR_SIZE
-
1
));
}
inline
bool
hl_check_align
(
void
*
ptr
)
{
return
hl_check_align
(
reinterpret_cast
<
size_t
>
(
ptr
));
}
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
float32x4_t
rev
=
vrev64q_f32
(
mm
);
float32x4_t
tmp1
=
agg
.
vecOp
(
rev
,
rev
);
float32x2_t
lo
=
vget_high_f32
(
rev
);
float32x2_t
hi
=
vget_low_f32
(
rev
);
float32x4_t
tmp2
=
vcombine_f32
(
hi
,
lo
);
float32x4_t
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
return
vgetq_lane_f32
(
ret
,
0
);
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
,
B
+=
ldb
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
vecType
*
b
=
(
vecType
*
)(
B
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
,
b
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
,
*
b
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
real
*
b
=
B
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
],
b
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
);
}
}
/*
* dimN is multiples of VECTOR_LEN
* dimN greater than Step
*/
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
}
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
B
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
,
B
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
#endif
/* HL_NEON_MATRIX_KERNEL_CUH_ */
paddle/function/Function.h
浏览文件 @
ba68ce1a
...
...
@@ -38,7 +38,7 @@ public:
if
(
err
)
{
*
err
=
Error
(
e
.
what
());
}
else
{
LOG
(
FATAL
)
<<
"Cannot get key "
<<
key
<<
"with error "
<<
e
.
what
();
LOG
(
FATAL
)
<<
"Cannot get key "
<<
key
<<
"
with error "
<<
e
.
what
();
}
return
T
();
}
...
...
paddle/function/PadOpGpu.cu
浏览文件 @
ba68ce1a
...
...
@@ -44,9 +44,9 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
size_t
nth
=
num
*
inC
*
inH
*
inW
;
int
blockSize
=
1024
;
int
gridSize
=
(
nth
+
1024
-
1
)
/
1024
;
int
cstart
=
pad
.
channel
Start
,
cend
=
pad
.
channelEnd
;
int
hstart
=
pad
.
height
Start
,
hend
=
pad
.
heightEnd
;
int
wstart
=
pad
.
width
Start
,
wend
=
pad
.
widthEnd
;
int
cstart
=
pad
.
channel
[
0
],
cend
=
pad
.
channel
[
1
]
;
int
hstart
=
pad
.
height
[
0
],
hend
=
pad
.
height
[
1
]
;
int
wstart
=
pad
.
width
[
0
],
wend
=
pad
.
width
[
1
]
;
int
outC
=
inC
+
cstart
+
cend
;
int
outH
=
inH
+
hstart
+
hend
;
int
outW
=
inW
+
wstart
+
wend
;
...
...
@@ -83,9 +83,9 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
int
nth
=
num
*
inC
*
inH
*
inW
;
int
blockSize
=
1024
;
int
gridSize
=
(
nth
+
1024
-
1
)
/
1024
;
int
cstart
=
pad
.
channel
Start
,
cend
=
pad
.
channelEnd
;
int
hstart
=
pad
.
height
Start
,
hend
=
pad
.
heightEnd
;
int
wstart
=
pad
.
width
Start
,
wend
=
pad
.
widthEnd
;
int
cstart
=
pad
.
channel
[
0
],
cend
=
pad
.
channel
[
1
]
;
int
hstart
=
pad
.
height
[
0
],
hend
=
pad
.
height
[
1
]
;
int
wstart
=
pad
.
width
[
0
],
wend
=
pad
.
width
[
1
]
;
int
outC
=
inC
+
cstart
+
cend
;
int
outH
=
inH
+
hstart
+
hend
;
int
outW
=
inW
+
wstart
+
wend
;
...
...
paddle/function/PadOpTest.cpp
浏览文件 @
ba68ce1a
...
...
@@ -24,48 +24,22 @@ TEST(Pad, real) {
for
(
size_t
imgSizeW
:
{
5
,
32
,
96
})
{
VLOG
(
3
)
<<
" numSamples="
<<
numSamples
<<
" channels="
<<
channels
<<
" imgSizeH="
<<
imgSizeH
<<
" imgSizeW="
<<
imgSizeW
;
FunctionCompare
compare
(
"Pad"
,
FuncConfig
()
.
set
(
"cstart"
,
2
)
.
set
(
"cend"
,
3
)
.
set
(
"hstart"
,
1
)
.
set
(
"hend"
,
2
)
.
set
(
"wstart"
,
3
)
.
set
(
"wend"
,
2
));
TensorShape
inDims
{
numSamples
,
channels
,
imgSizeH
,
imgSizeW
};
TensorShape
outDims
{
numSamples
,
channels
+
5
,
imgSizeH
+
3
,
imgSizeW
+
5
};
compare
.
addInputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
inDims
));
compare
.
addOutputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
outDims
,
ASSIGN_TO
));
compare
.
run
();
}
}
}
}
}
TEST
(
PadGrad
,
real
)
{
for
(
size_t
numSamples
:
{
5
,
32
})
{
for
(
size_t
channels
:
{
1
,
5
,
32
})
{
for
(
size_t
imgSizeH
:
{
5
,
33
,
100
})
{
for
(
size_t
imgSizeW
:
{
5
,
32
,
96
})
{
VLOG
(
3
)
<<
" numSamples="
<<
numSamples
<<
" channels="
<<
channels
<<
" imgSizeH="
<<
imgSizeH
<<
" imgSizeW="
<<
imgSizeW
;
FunctionCompare
compare
(
"PadGrad"
,
FuncConfig
()
.
set
(
"cstart"
,
2
)
.
set
(
"cend"
,
3
)
.
set
(
"hstart"
,
1
)
.
set
(
"hend"
,
2
)
.
set
(
"wstart"
,
3
)
.
set
(
"wend"
,
2
));
TensorShape
inDims
{
numSamples
,
channels
,
imgSizeH
,
imgSizeW
};
TensorShape
outDims
{
numSamples
,
channels
+
5
,
imgSizeH
+
3
,
imgSizeW
+
5
};
compare
.
addInputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
outDims
));
compare
.
addOutputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
inDims
,
ASSIGN_TO
));
compare
.
run
();
for
(
bool
test_grad
:
{
false
,
true
})
{
FunctionCompare
compare
(
test_grad
?
"PadGrad"
:
"Pad"
,
FuncConfig
()
.
set
<
std
::
vector
<
uint32_t
>>
(
"channel"
,
{
2
,
3
})
.
set
<
std
::
vector
<
uint32_t
>>
(
"height"
,
{
1
,
2
})
.
set
<
std
::
vector
<
uint32_t
>>
(
"width"
,
{
3
,
2
}));
TensorShape
inDims
{
numSamples
,
channels
,
imgSizeH
,
imgSizeW
};
TensorShape
outDims
{
numSamples
,
channels
+
5
,
imgSizeH
+
3
,
imgSizeW
+
5
};
compare
.
addInputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
test_grad
?
outDims
:
inDims
));
compare
.
addOutputs
(
BufferArg
(
VALUE_TYPE_FLOAT
,
test_grad
?
inDims
:
outDims
,
ASSIGN_TO
));
compare
.
run
();
}
}
}
}
...
...
paddle/gserver/tests/LayerGradUtil.cpp
浏览文件 @
ba68ce1a
...
...
@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf,
config
.
biasSize
=
biasSize
==
0
?
config
.
layerConfig
.
size
()
:
biasSize
;
config
.
layerConfig
.
set_bias_size
(
config
.
biasSize
);
config
.
layerConfig
.
set_shared_biases
(
sharedBias
);
config
.
inputDefs
.
push_back
(
{
inputType
,
"layer_0"
,
conf
.
input_size
(),
parameterSize
});
config
.
inputDefs
.
push_back
({
inputType
,
"layer_0"
,
static_cast
<
size_t
>
(
conf
.
input_size
()),
parameterSize
});
*
config
.
layerConfig
.
add_inputs
()
->
mutable_proj_conf
()
=
conf
;
config
.
testState
=
testState
;
testLayerGrad
(
config
,
"mixed"
,
batchSize
,
false
,
useGpu
);
...
...
paddle/math/MathFunctions.cpp
浏览文件 @
ba68ce1a
...
...
@@ -85,11 +85,16 @@ int getrf<float>(const CBLAS_ORDER order,
float
*
A
,
const
int
lda
,
int
*
ipiv
)
{
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return
clapack_sgetrf
(
order
,
M
,
N
,
A
,
lda
,
ipiv
);
#else
return
LAPACKE_sgetrf
(
order
,
M
,
N
,
A
,
lda
,
ipiv
);
#endif
#else
LOG
(
FATAL
)
<<
"Not implemented"
;
#endif
return
0
;
}
template
<
>
...
...
@@ -99,11 +104,16 @@ int getrf<double>(const CBLAS_ORDER order,
double
*
A
,
const
int
lda
,
int
*
ipiv
)
{
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return
clapack_dgetrf
(
order
,
M
,
N
,
A
,
lda
,
ipiv
);
#else
return
LAPACKE_dgetrf
(
order
,
M
,
N
,
A
,
lda
,
ipiv
);
#endif
#else
LOG
(
FATAL
)
<<
"Not implemented"
;
#endif
return
0
;
}
template
<
>
...
...
@@ -112,11 +122,16 @@ int getri<float>(const CBLAS_ORDER order,
float
*
A
,
const
int
lda
,
const
int
*
ipiv
)
{
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return
clapack_sgetri
(
order
,
N
,
A
,
lda
,
ipiv
);
#else
return
LAPACKE_sgetri
(
order
,
N
,
A
,
lda
,
ipiv
);
#endif
#else
LOG
(
FATAL
)
<<
"Not implemented"
;
#endif
return
0
;
}
template
<
>
...
...
@@ -125,11 +140,16 @@ int getri<double>(const CBLAS_ORDER order,
double
*
A
,
const
int
lda
,
const
int
*
ipiv
)
{
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
return
clapack_dgetri
(
order
,
N
,
A
,
lda
,
ipiv
);
#else
return
LAPACKE_dgetri
(
order
,
N
,
A
,
lda
,
ipiv
);
#endif
#else
LOG
(
FATAL
)
<<
"Not implemented"
;
#endif
return
0
;
}
template
<
>
...
...
paddle/math/MathFunctions.h
浏览文件 @
ba68ce1a
...
...
@@ -17,11 +17,14 @@ limitations under the License. */
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#ifdef PADDLE_USE_LAPACK
#include <mkl_lapacke.h>
#endif
#else
extern
"C"
{
#include <cblas.h>
}
#ifdef PADDLE_USE_LAPACK
#ifdef PADDLE_USE_ATLAS
extern
"C"
{
#include <clapack.h>
...
...
@@ -30,6 +33,7 @@ extern "C" {
#include <lapacke.h>
#endif
#endif
#endif
#include <cmath>
...
...
paddle/math/Matrix.cpp
浏览文件 @
ba68ce1a
...
...
@@ -2426,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int
lda
=
a
->
getStride
();
int
ldb
=
b
->
getStride
();
int
ldc
=
getStride
();
#ifndef PADDLE_TYPE_DOUBLE
cblas_sgemm
(
CblasRowMajor
,
a_trans
,
b_trans
,
M
,
N
,
K
,
scaleAB
,
A
,
lda
,
B
,
ldb
,
scaleT
,
C
,
ldc
);
#else
cblas_dgemm
(
CblasRowMajor
,
a_trans
,
b_trans
,
M
,
N
,
K
,
scaleAB
,
A
,
lda
,
B
,
ldb
,
scaleT
,
C
,
ldc
);
// TODO(yuyang18): Is gemm defined other place?
#endif
VLOG
(
2
)
<<
" A[0]="
<<
A
[
0
]
<<
" A[1]="
<<
A
[
1
]
<<
" B[0]="
<<
B
[
0
]
<<
" B[1]="
<<
B
[
1
]
<<
" C[0]="
<<
C
[
0
]
<<
" C[1]="
<<
C
[
1
];
gemm
<
real
>
(
a_trans
,
b_trans
,
M
,
N
,
K
,
scaleAB
,
A
,
lda
,
B
,
ldb
,
scaleT
,
C
,
ldc
);
}
void
CpuMatrix
::
mul
(
...
...
paddle/math/SIMDFunctions.cpp
浏览文件 @
ba68ce1a
...
...
@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE3__
#include <immintrin.h>
#endif
#include <algorithm>
#ifndef __AVX__
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
,
b
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
mb0
=
_mm_load_ps
(
b
);
mb1
=
_mm_load_ps
(
b
+
4
);
mb2
=
_mm_load_ps
(
b
+
8
);
mb3
=
_mm_load_ps
(
b
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
a
[
i
]
+=
b
[
i
];
}
static
void
batch_addto_sse
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
mb0
=
_mm_load_ps
(
b
[
i
]);
mb1
=
_mm_load_ps
(
b
[
i
]
+
4
);
mb2
=
_mm_load_ps
(
b
[
i
]
+
8
);
mb3
=
_mm_load_ps
(
b
[
i
]
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
b
[
i
]
+=
16
;
}
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
{
for
(
int
k
=
0
;
k
<
batch
;
k
++
)
a
[
i
]
+=
b
[
k
][
i
];
}
return
;
}
static
void
col_max_sse
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
// first sample, direct copy
for
(
int
d
=
0
;
d
<
dim
;
++
d
)
{
result
[
d
]
=
data
[
d
];
}
int
offset
=
dim
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
// first 16n dims
for
(
int
k
=
0
;
k
<
dim
/
16
;
k
++
,
result
+=
16
,
data
+=
16
)
{
ma0
=
_mm_load_ps
(
result
);
ma1
=
_mm_load_ps
(
result
+
4
);
ma2
=
_mm_load_ps
(
result
+
8
);
ma3
=
_mm_load_ps
(
result
+
12
);
for
(
int
i
=
1
;
i
<
numSamples
;
i
++
)
{
mb0
=
_mm_load_ps
(
data
+
i
*
dim
);
mb1
=
_mm_load_ps
(
data
+
i
*
dim
+
4
);
mb2
=
_mm_load_ps
(
data
+
i
*
dim
+
8
);
mb3
=
_mm_load_ps
(
data
+
i
*
dim
+
12
);
ma0
=
_mm_max_ps
(
ma0
,
mb0
);
ma1
=
_mm_max_ps
(
ma1
,
mb1
);
ma2
=
_mm_max_ps
(
ma2
,
mb2
);
ma3
=
_mm_max_ps
(
ma3
,
mb3
);
}
_mm_store_ps
(
result
,
ma0
);
_mm_store_ps
(
result
+
4
,
ma1
);
_mm_store_ps
(
result
+
8
,
ma2
);
_mm_store_ps
(
result
+
12
,
ma3
);
}
// last dims
for
(
int
d
=
0
;
d
<
offset
;
++
d
)
{
float
sm
=
data
[
d
];
for
(
int
i
=
1
;
i
<
numSamples
;
++
i
)
{
sm
=
std
::
max
(
sm
,
data
[
i
*
dim
+
d
]);
}
result
[
d
]
=
sm
;
}
}
#else
#ifdef __AVX__
static
void
addto_avx
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
32
;
...
...
@@ -355,17 +248,128 @@ static void decayL1_avx(
}
}
#elif defined(__SSE3__)
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
,
b
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
mb0
=
_mm_load_ps
(
b
);
mb1
=
_mm_load_ps
(
b
+
4
);
mb2
=
_mm_load_ps
(
b
+
8
);
mb3
=
_mm_load_ps
(
b
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
a
[
i
]
+=
b
[
i
];
}
static
void
batch_addto_sse
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
mb0
=
_mm_load_ps
(
b
[
i
]);
mb1
=
_mm_load_ps
(
b
[
i
]
+
4
);
mb2
=
_mm_load_ps
(
b
[
i
]
+
8
);
mb3
=
_mm_load_ps
(
b
[
i
]
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
b
[
i
]
+=
16
;
}
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
{
for
(
int
k
=
0
;
k
<
batch
;
k
++
)
a
[
i
]
+=
b
[
k
][
i
];
}
return
;
}
static
void
col_max_sse
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
// first sample, direct copy
for
(
int
d
=
0
;
d
<
dim
;
++
d
)
{
result
[
d
]
=
data
[
d
];
}
int
offset
=
dim
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
// first 16n dims
for
(
int
k
=
0
;
k
<
dim
/
16
;
k
++
,
result
+=
16
,
data
+=
16
)
{
ma0
=
_mm_load_ps
(
result
);
ma1
=
_mm_load_ps
(
result
+
4
);
ma2
=
_mm_load_ps
(
result
+
8
);
ma3
=
_mm_load_ps
(
result
+
12
);
for
(
int
i
=
1
;
i
<
numSamples
;
i
++
)
{
mb0
=
_mm_load_ps
(
data
+
i
*
dim
);
mb1
=
_mm_load_ps
(
data
+
i
*
dim
+
4
);
mb2
=
_mm_load_ps
(
data
+
i
*
dim
+
8
);
mb3
=
_mm_load_ps
(
data
+
i
*
dim
+
12
);
ma0
=
_mm_max_ps
(
ma0
,
mb0
);
ma1
=
_mm_max_ps
(
ma1
,
mb1
);
ma2
=
_mm_max_ps
(
ma2
,
mb2
);
ma3
=
_mm_max_ps
(
ma3
,
mb3
);
}
_mm_store_ps
(
result
,
ma0
);
_mm_store_ps
(
result
+
4
,
ma1
);
_mm_store_ps
(
result
+
8
,
ma2
);
_mm_store_ps
(
result
+
12
,
ma3
);
}
// last dims
for
(
int
d
=
0
;
d
<
offset
;
++
d
)
{
float
sm
=
data
[
d
];
for
(
int
i
=
1
;
i
<
numSamples
;
++
i
)
{
sm
=
std
::
max
(
sm
,
data
[
i
*
dim
+
d
]);
}
result
[
d
]
=
sm
;
}
}
#endif
#ifndef __AVX__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#else
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif
namespace
paddle
{
namespace
simd
{
namespace
internal
{
#ifdef __SSE3__
void
addToImpl
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
SIMD_INVOKE
(
addto
,
a
,
b
,
len
);
}
...
...
@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
void
colMaxImpl
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
SIMD_INVOKE
(
col_max
,
result
,
data
,
dim
,
numSamples
);
}
#endif
#ifdef __AVX__
void
decayL1AvxImpl
(
float
*
dst
,
float
*
src
,
float
lambda
,
size_t
len
)
{
...
...
@@ -385,8 +390,8 @@ void decayL1AvxImpl(
float
*
dst
,
float
*
src
,
float
*
lr
,
float
lambda
,
size_t
len
)
{
decayL1_avx
(
dst
,
src
,
lr
,
lambda
,
len
);
}
#endif
}
// namespace internal
}
// namespace simd
}
// namespace paddle
paddle/math/SIMDFunctions.h
浏览文件 @
ba68ce1a
...
...
@@ -128,17 +128,29 @@ void decayL1AvxImpl(
template
<
>
inline
void
addTo
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
#ifdef __SSE3__
internal
::
addToImpl
(
a
,
b
,
len
);
#else
naive
::
addTo
(
a
,
b
,
len
);
#endif
}
template
<
>
inline
void
batchAddTo
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
#ifdef __SSE3__
internal
::
batchAddToImpl
(
a
,
b
,
batch
,
len
);
#else
naive
::
batchAddTo
(
a
,
b
,
batch
,
len
);
#endif
}
template
<
>
inline
void
colMax
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
#ifdef __SSE3__
internal
::
colMaxImpl
(
result
,
data
,
dim
,
numSamples
);
#else
naive
::
colMax
(
result
,
data
,
dim
,
numSamples
);
#endif
}
template
<
>
...
...
paddle/math/Storage.cpp
浏览文件 @
ba68ce1a
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "Storage.h"
#include "Allocator.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
DEFINE_int32
(
pool_limit_size
,
...
...
@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
}
if
(
gpuAllocator_
[
deviceId
]
==
nullptr
)
{
std
::
string
name
=
"gpu"
+
st
d
::
to_string
(
deviceId
)
+
std
::
string
(
"_pool"
);
"gpu"
+
st
r
::
to_string
(
deviceId
)
+
std
::
string
(
"_pool"
);
gpuAllocator_
[
deviceId
]
=
new
PoolAllocator
(
new
GpuAllocator
(),
FLAGS_pool_limit_size
,
name
);
}
...
...
paddle/pserver/ParameterServer2.cpp
浏览文件 @
ba68ce1a
...
...
@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/utils/Flags.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
DEFINE_int32
(
pserver_num_threads
,
1
,
"number of threads for sync op exec"
);
DEFINE_double
(
async_lagged_ratio_min
,
...
...
@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
callback
(
response
);
/// always defined, barrier slowest node function need it.
statSet_
.
reset
(
new
StatSet
(
"ParameterServer"
+
std
::
to_string
(
serverId_
)));
statSet_
.
reset
(
new
StatSet
(
"ParameterServer"
+
str
::
to_string
(
static_cast
<
int
>
(
serverId_
))));
}
real
bufferSum
(
const
std
::
vector
<
ParameterServer2
::
Buffer
>&
buffers
)
{
...
...
paddle/utils/CpuId.cpp
浏览文件 @
ba68ce1a
...
...
@@ -19,7 +19,7 @@ limitations under the License. */
/// for MSVC
#define CPUID(info, x) __cpuidex(info, x, 0)
#el
se
#el
if !defined(__ANDROID__)
#include <cpuid.h>
...
...
@@ -31,6 +31,7 @@ limitations under the License. */
namespace
paddle
{
SIMDFlags
::
SIMDFlags
()
{
#if !defined(__ANDROID__)
unsigned
int
cpuInfo
[
4
];
// CPUID: https://en.wikipedia.org/wiki/CPUID
// clang-format off
...
...
@@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() {
CPUID
(
cpuInfo
,
0x80000001
);
simd_flags_
|=
cpuInfo
[
2
]
&
(
1
<<
16
)
?
SIMD_FMA4
:
SIMD_NONE
;
// clang-fotmat on
#else
simd_flags_
=
SIMD_NEON
;
#endif
}
SIMDFlags
const
*
SIMDFlags
::
instance
()
{
...
...
paddle/utils/CpuId.h
浏览文件 @
ba68ce1a
...
...
@@ -30,6 +30,7 @@ enum simd_t {
SIMD_AVX
=
1
<<
8
,
///< AVX
SIMD_AVX2
=
1
<<
9
,
///< AVX 2
SIMD_AVX512
=
1
<<
10
,
///< AVX 512
SIMD_NEON
=
1
<<
11
,
/// NEON
};
// clang-format on
...
...
@@ -96,6 +97,7 @@ private:
#define HAS_AVX HAS_SIMD(SIMD_AVX)
#define HAS_AVX2 HAS_SIMD(SIMD_AVX2)
#define HAS_AVX512 HAS_SIMD(SIMD_AVX512)
#define HAS_NEON HAS_SIMD(SIMD_NEON)
// clang-format on
/**
...
...
paddle/utils/Logging.cpp
浏览文件 @
ba68ce1a
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
*/
#include "Logging.h"
#include <cstdlib>
namespace
paddle
{
...
...
paddle/utils/StringUtil.h
浏览文件 @
ba68ce1a
...
...
@@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
return
v
;
}
/**
* Cast type T to string with status.
*
* @param [in] v input value of type T.
* @param [out] ok status, return true if there is no error in casting. Set
* nullptr if user don't care error at all.
* @return result of casting. If error occurred, a empty string will be
* returned.
*/
template
<
class
T
>
inline
std
::
string
toWithStatus
(
const
T
v
,
bool
*
ok
=
nullptr
)
{
std
::
ostringstream
sout
;
sout
<<
v
;
if
(
ok
)
{
*
ok
=
!
sout
.
fail
();
}
return
sout
.
str
();
}
/// Convert string to type T. It makes sure all the characters in s are used.
/// Otherwise it will abort.
///
...
...
@@ -67,6 +86,18 @@ inline T to(const std::string& s) {
return
v
;
}
/// Convert type T to string.
///
/// @tparam T type of input value
/// @param v input value of type T
template
<
class
T
>
std
::
string
to_string
(
T
v
)
{
bool
ok
;
std
::
string
s
=
toWithStatus
<
T
>
(
v
,
&
ok
);
CHECK
(
ok
)
<<
"Cannot convert v("
<<
v
<<
") to type std::string"
;
return
s
;
}
}
// namespace str
#undef DEFINE_STRING_CONVERSION
...
...
paddle/utils/Util.cpp
浏览文件 @
ba68ce1a
...
...
@@ -15,11 +15,16 @@ limitations under the License. */
#include "Util.h"
#include <dirent.h>
#include <pmmintrin.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#include <fstream>
#include <mutex>
...
...
@@ -163,8 +168,12 @@ void initMain(int argc, char** argv) {
installProfilerSwitch
();
#ifdef __SSE__
_MM_SET_FLUSH_ZERO_MODE
(
_MM_FLUSH_ZERO_ON
);
#endif
#ifdef __SSE3__
_MM_SET_DENORMALS_ZERO_MODE
(
_MM_DENORMALS_ZERO_ON
);
#endif
if
(
FLAGS_seed
==
0
)
{
unsigned
int
t
=
time
(
NULL
);
...
...
paddle/utils/arch/linux/Locks.cpp
浏览文件 @
ba68ce1a
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/utils/Locks.h"
#include <semaphore.h>
#include <unistd.h>
#include "paddle/utils/Logging.h"
namespace
paddle
{
class
SemaphorePrivate
{
...
...
@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
sem_init
(
&
m
->
sem
,
0
,
initValue
);
}
Semaphore
::~
Semaphore
()
{
sem_destroy
(
&
m
->
sem
);
}
Semaphore
::~
Semaphore
()
{
sem_destroy
(
&
m
->
sem
);
delete
m
;
}
bool
Semaphore
::
timeWait
(
struct
timespec
*
ts
)
{
return
(
0
==
sem_timedwait
(
&
m
->
sem
,
ts
));
...
...
@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); }
void
Semaphore
::
post
()
{
sem_post
(
&
m
->
sem
);
}
#ifdef PADDLE_USE_PTHREAD_SPINLOCK
class
SpinLockPrivate
{
public:
inline
SpinLockPrivate
()
{
pthread_spin_init
(
&
lock_
,
0
);
}
inline
~
SpinLockPrivate
()
{
pthread_spin_destroy
(
&
lock_
);
}
inline
void
lock
()
{
pthread_spin_lock
(
&
lock_
);
}
inline
void
unlock
()
{
pthread_spin_unlock
(
&
lock_
);
}
pthread_spinlock_t
lock_
;
char
padding_
[
64
-
sizeof
(
pthread_spinlock_t
)];
};
SpinLock
::
SpinLock
()
:
m
(
new
SpinLockPrivate
())
{}
#else
SpinLock
::~
SpinLock
()
{
delete
m
;
}
#include <atomic>
class
SpinLockPrivate
{
public:
inline
void
lock
()
{
while
(
lock_
.
test_and_set
(
std
::
memory_order_acquire
))
{
}
}
inline
void
unlock
()
{
lock_
.
clear
(
std
::
memory_order_release
);
}
std
::
atomic_flag
lock_
=
ATOMIC_FLAG_INIT
;
char
padding_
[
64
-
sizeof
(
lock_
)];
// Padding to cache line size
};
void
SpinLock
::
lock
()
{
pthread_spin_lock
(
&
m
->
lock_
);
}
#endif
void
SpinLock
::
unlock
()
{
pthread_spin_unlock
(
&
m
->
lock_
);
}
SpinLock
::
SpinLock
()
:
m
(
new
SpinLockPrivate
())
{}
SpinLock
::~
SpinLock
()
{
delete
m
;
}
void
SpinLock
::
lock
()
{
m
->
lock
();
}
void
SpinLock
::
unlock
()
{
m
->
unlock
();
}
#ifdef PADDLE_USE_PTHREAD_BARRIER
class
ThreadBarrierPrivate
{
public:
pthread_barrier_t
barrier_
;
inline
explicit
ThreadBarrierPrivate
(
int
count
)
{
pthread_barrier_init
(
&
barrier_
,
nullptr
,
count
);
}
inline
~
ThreadBarrierPrivate
()
{
pthread_barrier_destroy
(
&
barrier_
);
}
inline
void
wait
()
{
pthread_barrier_wait
(
&
barrier_
);
}
};
ThreadBarrier
::
ThreadBarrier
(
int
count
)
:
m
(
new
ThreadBarrierPrivate
())
{
pthread_barrier_init
(
&
m
->
barrier_
,
nullptr
,
count
);
}
#else
ThreadBarrier
::~
ThreadBarrier
()
{
pthread_barrier_destroy
(
&
m
->
barrier_
);
delete
m
;
}
class
ThreadBarrierPrivate
{
public:
pthread_mutex_t
mutex_
;
pthread_cond_t
cond_
;
int
count_
;
int
tripCount_
;
inline
explicit
ThreadBarrierPrivate
(
int
cnt
)
:
count_
(
0
),
tripCount_
(
cnt
)
{
CHECK_NE
(
cnt
,
0
);
CHECK_GE
(
pthread_mutex_init
(
&
mutex_
,
0
),
0
);
CHECK_GE
(
pthread_cond_init
(
&
cond_
,
0
),
0
);
}
inline
~
ThreadBarrierPrivate
()
{
pthread_cond_destroy
(
&
cond_
);
pthread_mutex_destroy
(
&
mutex_
);
}
/**
* @brief wait
* @return true if the last wait
*/
inline
bool
wait
()
{
pthread_mutex_lock
(
&
mutex_
);
++
count_
;
if
(
count_
>=
tripCount_
)
{
count_
=
0
;
pthread_cond_broadcast
(
&
cond_
);
pthread_mutex_unlock
(
&
mutex_
);
return
true
;
}
else
{
pthread_cond_wait
(
&
cond_
,
&
mutex_
);
pthread_mutex_unlock
(
&
mutex_
);
return
false
;
}
}
};
#endif
void
ThreadBarrier
::
wait
()
{
pthread_barrier_wait
(
&
m
->
barrier_
);
}
ThreadBarrier
::
ThreadBarrier
(
int
count
)
:
m
(
new
ThreadBarrierPrivate
(
count
))
{}
ThreadBarrier
::~
ThreadBarrier
()
{
delete
m
;
}
void
ThreadBarrier
::
wait
()
{
m
->
wait
();
}
}
// namespace paddle
paddle/utils/tests/test_CustomStackTrace.cpp
浏览文件 @
ba68ce1a
...
...
@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Locks.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
DEFINE_int32
(
test_thread_num
,
10
,
"testing thread number"
);
...
...
@@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) {
while
(
countDown
--
>
0
)
{
start
.
wait
();
for
(
size_t
i
=
0
;
i
<
layerSize
;
++
i
)
{
tracer
.
push
(
"layer_"
+
std
::
to_string
(
i
));
tracer
.
push
(
"layer_"
+
paddle
::
str
::
to_string
(
i
));
}
tracer
.
pop
(
""
);
for
(
size_t
i
=
0
;
i
<
layerSize
;
++
i
)
{
tracer
.
pop
(
"layer_"
+
std
::
to_string
(
layerSize
-
1
-
i
));
tracer
.
pop
(
"layer_"
+
paddle
::
str
::
to_string
(
layerSize
-
1
-
i
));
}
finish
.
wait
();
}
...
...
@@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) {
while
(
countDown
--
>
0
)
{
start
.
wait
();
for
(
size_t
i
=
0
;
i
<
layerSize
;
++
i
)
{
tracer
.
push
(
"layer_"
+
std
::
to_string
(
i
));
tracer
.
push
(
"layer_"
+
paddle
::
str
::
to_string
(
i
));
}
tracer
.
clear
();
// in forward test, tracer will clear after forward.
finish
.
wait
();
...
...
paddle/utils/tests/test_CustomStackTracePrint.cpp
浏览文件 @
ba68ce1a
...
...
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
int
main
(
int
argc
,
char
**
argv
)
{
paddle
::
initMain
(
argc
,
argv
);
for
(
size_t
i
=
0
;
i
<
1000
;
++
i
)
{
paddle
::
gLayerStackTrace
.
push
(
"layer_"
+
std
::
to_string
(
i
));
paddle
::
gLayerStackTrace
.
push
(
"layer_"
+
paddle
::
str
::
to_string
(
i
));
if
(
i
==
998
)
{
throw
"Unhandle exception"
;
}
...
...
paddle/utils/tests/test_SIMDFlags.cpp
浏览文件 @
ba68ce1a
...
...
@@ -18,7 +18,8 @@ limitations under the License. */
using
namespace
paddle
;
// NOLINT
TEST
(
SIMDFlags
,
gccTest
)
{
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
!defined(__arm__)
// clang-format off
CHECK
(
!
__builtin_cpu_supports
(
"sse"
)
!=
HAS_SSE
);
CHECK
(
!
__builtin_cpu_supports
(
"sse2"
)
!=
HAS_SSE2
);
...
...
@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
LOG
(
INFO
)
<<
"Has AVX: "
<<
std
::
boolalpha
<<
HAS_AVX
;
LOG
(
INFO
)
<<
"Has AVX2: "
<<
std
::
boolalpha
<<
HAS_AVX2
;
LOG
(
INFO
)
<<
"Has AVX512: "
<<
std
::
boolalpha
<<
HAS_AVX512
;
LOG
(
INFO
)
<<
"Has NEON: "
<<
std
::
boolalpha
<<
HAS_NEON
;
}
python/paddle/trainer_config_helpers/layers.py
浏览文件 @
ba68ce1a
...
...
@@ -1940,7 +1940,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
@
layer_support
()
def
hsigmoid
(
input
,
label
,
num_classes
,
num_classes
=
None
,
name
=
None
,
bias_attr
=
None
,
param_attr
=
None
,
...
...
@@ -1956,8 +1956,7 @@ def hsigmoid(input,
.. code-block:: python
cost = hsigmoid(input=[layer1, layer2],
label=data_layer,
num_classes=3)
label=data_layer)
:param input: Input layers. It could be a LayerOutput or list/tuple of
LayerOutput.
...
...
@@ -1965,12 +1964,14 @@ def hsigmoid(input,
:param label: Label layer.
:type label: LayerOutput
:param num_classes: number of classes.
:type num_classes: int
:type num_classes: int
|None
:param name: layer name
:type name: basestring
:param bias_attr: Bias attribute. None means default bias.
False means no bias.
:type bias_attr: ParameterAttribute|False
:param param_attr: Parameter Attribute. None means default parameter.
:type param_attr: ParameterAttribute|None
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
...
...
@@ -1990,6 +1991,11 @@ def hsigmoid(input,
assert
isinstance
(
label
,
LayerOutput
)
assert
label
.
layer_type
==
LayerType
.
DATA
if
num_classes
is
None
:
num_classes
=
label
.
size
if
num_classes
is
None
or
num_classes
<=
2
:
raise
ValueError
(
"hsigmoid label size must larger than 2."
)
ipts_for_layer
=
[]
parents
=
[]
for
each_input
,
each_param_attr
in
zip
(
input
,
param_attr
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录