Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
38fa74ed
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
38fa74ed
编写于
3月 24, 2017
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix cmake error of failing to find UINT64_MAX.
上级
f261dc6a
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
133 addition
and
136 deletion
+133
-136
CMakeLists.txt
CMakeLists.txt
+1
-1
cmake/configure.cmake
cmake/configure.cmake
+11
-5
cmake/simd.cmake
cmake/simd.cmake
+1
-10
paddle/math/SIMDFunctions.cpp
paddle/math/SIMDFunctions.cpp
+117
-117
paddle/math/SIMDFunctions.h
paddle/math/SIMDFunctions.h
+3
-3
未找到文件。
CMakeLists.txt
浏览文件 @
38fa74ed
...
...
@@ -25,6 +25,7 @@ find_package(Git REQUIRED)
find_package
(
Threads REQUIRED
)
include
(
system
)
include
(
simd
)
################################ Configurations #######################################
option
(
WITH_GPU
"Compile PaddlePaddle with NVIDIA GPU"
${
CUDA_FOUND
}
)
...
...
@@ -64,7 +65,6 @@ include(external/openblas) # download, build, install openblas
include
(
external/swig
)
# download, build, install swig
include
(
external/warpctc
)
# download, build, install warpctc
include
(
simd
)
# set simd flag
include
(
package
)
# set paddle packages
include
(
cpplint
)
# set paddle c++ style
include
(
ccache
)
# set ccache for compilation
...
...
cmake/configure.cmake
浏览文件 @
38fa74ed
...
...
@@ -32,6 +32,16 @@ if(NOT WITH_PROFILER)
add_definitions
(
-DPADDLE_DISABLE_PROFILER
)
endif
(
NOT WITH_PROFILER
)
if
(
NEON_FOUND
)
set
(
SIMD_FLAG
${
NEON_FLAG
}
)
else
(
NEON_FOUND
)
if
(
WITH_AVX
)
set
(
SIMD_FLAG
${
AVX_FLAG
}
)
else
(
WITH_AVX
)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
(
WITH_AVX
)
endif
(
NEON_FOUND
)
if
(
NOT WITH_GPU
)
add_definitions
(
-DPADDLE_ONLY_CPU
)
add_definitions
(
-DHPPL_STUB_FUNC
)
...
...
@@ -48,11 +58,7 @@ else()
message
(
FATAL_ERROR
"Paddle need cudnn to compile"
)
endif
()
if
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
AVX_FLAG
}
"
)
else
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SSE3_FLAG
}
"
)
endif
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SIMD_FLAG
}
"
)
# Include cuda and cudnn
include_directories
(
${
CUDNN_INCLUDE_DIR
}
)
...
...
cmake/simd.cmake
浏览文件 @
38fa74ed
...
...
@@ -88,14 +88,5 @@ int main()
return 0;
}"
NEON_FOUND
)
if
(
NEON_FOUND
)
set
(
SIMD_FLAG
${
NEON_FLAG
}
)
else
(
NEON_FOUND
)
if
(
WITH_AVX
)
set
(
SIMD_FLAG
${
AVX_FLAG
}
)
else
(
WITH_AVX
)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
(
WITH_AVX
)
endif
(
NEON_FOUND
)
set
(
CMAKE_REQUIRED_FLAGS
""
)
mark_as_advanced
(
MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND NEON_FOUND
)
paddle/math/SIMDFunctions.cpp
浏览文件 @
38fa74ed
...
...
@@ -13,122 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE__
#ifdef __SSE
3
__
#include <immintrin.h>
#endif
#include <algorithm>
#ifdef __SSE__
// Adds b into a element-wise: a[i] += b[i] for every i in [0, len).
//
// Full groups of 16 floats are processed with four 128-bit SSE registers;
// the remaining len % 16 elements are added by a scalar loop.
//
//   a    destination array, updated in place
//   b    source array, only read
//   len  number of floats to process in both arrays
static void addto_sse(float* a, const float* b, size_t len) {
  const size_t blocks = len / 16;
  const size_t tail = len % 16;

  // The counter is a size_t so it cannot wrap before reaching len / 16.
  for (size_t k = 0; k < blocks; ++k, a += 16, b += 16) {
    // Unaligned load/store intrinsics are used because nothing in this
    // function checks that a and b are 16-byte aligned; the aligned variants
    // fault on a misaligned address.
    const __m128 sum0 = _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b));
    const __m128 sum1 = _mm_add_ps(_mm_loadu_ps(a + 4), _mm_loadu_ps(b + 4));
    const __m128 sum2 = _mm_add_ps(_mm_loadu_ps(a + 8), _mm_loadu_ps(b + 8));
    const __m128 sum3 = _mm_add_ps(_mm_loadu_ps(a + 12), _mm_loadu_ps(b + 12));

    // All sixteen inputs are read before anything is written back.
    _mm_storeu_ps(a, sum0);
    _mm_storeu_ps(a + 4, sum1);
    _mm_storeu_ps(a + 8, sum2);
    _mm_storeu_ps(a + 12, sum3);
  }

  // a and b now point just past the vectorised part of the range.
  for (size_t i = 0; i < tail; ++i) {
    a[i] += b[i];
  }
}
// Accumulates `batch` source arrays into a:
//   a[j] += b[0][j] + b[1][j] + ... + b[batch - 1][j]   for j in [0, len),
// with the additions applied in increasing batch order.
//
// Full groups of 16 floats are processed with SSE registers; the remaining
// len % 16 elements are handled by a scalar loop.
//
//   a      destination array, updated in place
//   b      array of `batch` source pointers
//   batch  number of source arrays
//   len    number of floats to process in every array
//
// Side effect: every b[i] is advanced by 16 floats for each full group that
// was processed, so on return the caller's pointers have moved forward by
// 16 * (len / 16) elements.
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
  const size_t blocks = len / 16;
  const size_t tail = len % 16;

  // The counter is a size_t so it cannot wrap before reaching len / 16.
  for (size_t k = 0; k < blocks; ++k, a += 16) {
    // Unaligned load/store intrinsics are used because nothing in this
    // function checks that the pointers are 16-byte aligned; the aligned
    // variants fault on a misaligned address.
    __m128 acc0 = _mm_loadu_ps(a);
    __m128 acc1 = _mm_loadu_ps(a + 4);
    __m128 acc2 = _mm_loadu_ps(a + 8);
    __m128 acc3 = _mm_loadu_ps(a + 12);

    for (int i = 0; i < batch; ++i) {
      const float* src = b[i];
      acc0 = _mm_add_ps(acc0, _mm_loadu_ps(src));
      acc1 = _mm_add_ps(acc1, _mm_loadu_ps(src + 4));
      acc2 = _mm_add_ps(acc2, _mm_loadu_ps(src + 8));
      acc3 = _mm_add_ps(acc3, _mm_loadu_ps(src + 12));
      // Move this source's cursor to its next group of 16 floats.
      b[i] = src + 16;
    }

    _mm_storeu_ps(a, acc0);
    _mm_storeu_ps(a + 4, acc1);
    _mm_storeu_ps(a + 8, acc2);
    _mm_storeu_ps(a + 12, acc3);
  }

  // a and every b[k] now point just past the vectorised part of the range.
  for (size_t i = 0; i < tail; ++i) {
    for (int k = 0; k < batch; ++k) {
      a[i] += b[k][i];
    }
  }
}
// Column-wise maximum over a row-major block of samples.
//
// data is read as numSamples consecutive rows of dim floats each; on return
// result[d] holds the maximum of data[i * dim + d] over all rows i.
//
//   result      output array of dim floats
//   data        input rows, numSamples * dim floats
//   dim         number of columns (floats per sample)
//   numSamples  number of rows; the first row is always read as the seed
static void col_max_sse(float* result,
                        const float* data,
                        int dim,
                        int numSamples) {
  // Seed every column with the value from the first sample.
  for (int d = 0; d < dim; ++d) {
    result[d] = data[d];
  }

  const int blocks = dim / 16;
  const int tail = dim % 16;

  // Columns in full groups of 16.
  for (int k = 0; k < blocks; ++k, result += 16, data += 16) {
    // Unaligned loads/stores: a row starts at data + i * dim, which is not
    // 16-byte aligned whenever dim is not a multiple of 4, and the aligned
    // intrinsics fault on such addresses.
    __m128 best0 = _mm_loadu_ps(result);
    __m128 best1 = _mm_loadu_ps(result + 4);
    __m128 best2 = _mm_loadu_ps(result + 8);
    __m128 best3 = _mm_loadu_ps(result + 12);

    for (int i = 1; i < numSamples; ++i) {
      // Widen before multiplying so i * dim cannot overflow int.
      const float* row = data + static_cast<size_t>(i) * dim;
      best0 = _mm_max_ps(best0, _mm_loadu_ps(row));
      best1 = _mm_max_ps(best1, _mm_loadu_ps(row + 4));
      best2 = _mm_max_ps(best2, _mm_loadu_ps(row + 8));
      best3 = _mm_max_ps(best3, _mm_loadu_ps(row + 12));
    }

    _mm_storeu_ps(result, best0);
    _mm_storeu_ps(result + 4, best1);
    _mm_storeu_ps(result + 8, best2);
    _mm_storeu_ps(result + 12, best3);
  }

  // Remaining dim % 16 columns; result and data already point at them.
  for (int d = 0; d < tail; ++d) {
    float sm = data[d];
    for (int i = 1; i < numSamples; ++i) {
      sm = std::max(sm, data[static_cast<size_t>(i) * dim + d]);
    }
    result[d] = sm;
  }
}
#elif defined(__AVX__)
#ifdef __AVX__
static
void
addto_avx
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
32
;
...
...
@@ -358,18 +248,128 @@ static void decayL1_avx(
}
}
#elif defined(__SSE3__)
// Adds b into a element-wise: a[i] += b[i] for every i in [0, len).
//
// Full groups of 16 floats are processed with four 128-bit SSE registers;
// the remaining len % 16 elements are added by a scalar loop.
//
//   a    destination array, updated in place
//   b    source array, only read
//   len  number of floats to process in both arrays
static void addto_sse(float* a, const float* b, size_t len) {
  const size_t blocks = len / 16;
  const size_t tail = len % 16;

  // The counter is a size_t so it cannot wrap before reaching len / 16.
  for (size_t k = 0; k < blocks; ++k, a += 16, b += 16) {
    // Unaligned load/store intrinsics are used because nothing in this
    // function checks that a and b are 16-byte aligned; the aligned variants
    // fault on a misaligned address.
    const __m128 sum0 = _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b));
    const __m128 sum1 = _mm_add_ps(_mm_loadu_ps(a + 4), _mm_loadu_ps(b + 4));
    const __m128 sum2 = _mm_add_ps(_mm_loadu_ps(a + 8), _mm_loadu_ps(b + 8));
    const __m128 sum3 = _mm_add_ps(_mm_loadu_ps(a + 12), _mm_loadu_ps(b + 12));

    // All sixteen inputs are read before anything is written back.
    _mm_storeu_ps(a, sum0);
    _mm_storeu_ps(a + 4, sum1);
    _mm_storeu_ps(a + 8, sum2);
    _mm_storeu_ps(a + 12, sum3);
  }

  // a and b now point just past the vectorised part of the range.
  for (size_t i = 0; i < tail; ++i) {
    a[i] += b[i];
  }
}
// Accumulates `batch` source arrays into a:
//   a[j] += b[0][j] + b[1][j] + ... + b[batch - 1][j]   for j in [0, len),
// with the additions applied in increasing batch order.
//
// Full groups of 16 floats are processed with SSE registers; the remaining
// len % 16 elements are handled by a scalar loop.
//
//   a      destination array, updated in place
//   b      array of `batch` source pointers
//   batch  number of source arrays
//   len    number of floats to process in every array
//
// Side effect: every b[i] is advanced by 16 floats for each full group that
// was processed, so on return the caller's pointers have moved forward by
// 16 * (len / 16) elements.
static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
  const size_t blocks = len / 16;
  const size_t tail = len % 16;

  // The counter is a size_t so it cannot wrap before reaching len / 16.
  for (size_t k = 0; k < blocks; ++k, a += 16) {
    // Unaligned load/store intrinsics are used because nothing in this
    // function checks that the pointers are 16-byte aligned; the aligned
    // variants fault on a misaligned address.
    __m128 acc0 = _mm_loadu_ps(a);
    __m128 acc1 = _mm_loadu_ps(a + 4);
    __m128 acc2 = _mm_loadu_ps(a + 8);
    __m128 acc3 = _mm_loadu_ps(a + 12);

    for (int i = 0; i < batch; ++i) {
      const float* src = b[i];
      acc0 = _mm_add_ps(acc0, _mm_loadu_ps(src));
      acc1 = _mm_add_ps(acc1, _mm_loadu_ps(src + 4));
      acc2 = _mm_add_ps(acc2, _mm_loadu_ps(src + 8));
      acc3 = _mm_add_ps(acc3, _mm_loadu_ps(src + 12));
      // Move this source's cursor to its next group of 16 floats.
      b[i] = src + 16;
    }

    _mm_storeu_ps(a, acc0);
    _mm_storeu_ps(a + 4, acc1);
    _mm_storeu_ps(a + 8, acc2);
    _mm_storeu_ps(a + 12, acc3);
  }

  // a and every b[k] now point just past the vectorised part of the range.
  for (size_t i = 0; i < tail; ++i) {
    for (int k = 0; k < batch; ++k) {
      a[i] += b[k][i];
    }
  }
}
// Column-wise maximum over a row-major block of samples.
//
// data is read as numSamples consecutive rows of dim floats each; on return
// result[d] holds the maximum of data[i * dim + d] over all rows i.
//
//   result      output array of dim floats
//   data        input rows, numSamples * dim floats
//   dim         number of columns (floats per sample)
//   numSamples  number of rows; the first row is always read as the seed
static void col_max_sse(float* result,
                        const float* data,
                        int dim,
                        int numSamples) {
  // Seed every column with the value from the first sample.
  for (int d = 0; d < dim; ++d) {
    result[d] = data[d];
  }

  const int blocks = dim / 16;
  const int tail = dim % 16;

  // Columns in full groups of 16.
  for (int k = 0; k < blocks; ++k, result += 16, data += 16) {
    // Unaligned loads/stores: a row starts at data + i * dim, which is not
    // 16-byte aligned whenever dim is not a multiple of 4, and the aligned
    // intrinsics fault on such addresses.
    __m128 best0 = _mm_loadu_ps(result);
    __m128 best1 = _mm_loadu_ps(result + 4);
    __m128 best2 = _mm_loadu_ps(result + 8);
    __m128 best3 = _mm_loadu_ps(result + 12);

    for (int i = 1; i < numSamples; ++i) {
      // Widen before multiplying so i * dim cannot overflow int.
      const float* row = data + static_cast<size_t>(i) * dim;
      best0 = _mm_max_ps(best0, _mm_loadu_ps(row));
      best1 = _mm_max_ps(best1, _mm_loadu_ps(row + 4));
      best2 = _mm_max_ps(best2, _mm_loadu_ps(row + 8));
      best3 = _mm_max_ps(best3, _mm_loadu_ps(row + 12));
    }

    _mm_storeu_ps(result, best0);
    _mm_storeu_ps(result + 4, best1);
    _mm_storeu_ps(result + 8, best2);
    _mm_storeu_ps(result + 12, best3);
  }

  // Remaining dim % 16 columns; result and data already point at them.
  for (int d = 0; d < tail; ++d) {
    float sm = data[d];
    for (int i = 1; i < numSamples; ++i) {
      sm = std::max(sm, data[static_cast<size_t>(i) * dim + d]);
    }
    result[d] = sm;
  }
}
#endif
#ifdef __SSE__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#elif __AVX__
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif
namespace
paddle
{
namespace
simd
{
namespace
internal
{
#ifdef __SSE__
#ifdef __SSE
3
__
// Out-of-line entry point for the element-wise add. It forwards a, b and len
// unchanged to the `addto` kernel through the SIMD_INVOKE macro, which
// token-pastes an instruction-set suffix onto the kernel name.
void
addToImpl
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
SIMD_INVOKE
(
addto
,
a
,
b
,
len
);
}
...
...
@@ -390,8 +390,8 @@ void decayL1AvxImpl(
float
*
dst
,
float
*
src
,
float
*
lr
,
float
lambda
,
size_t
len
)
{
decayL1_avx
(
dst
,
src
,
lr
,
lambda
,
len
);
}
#endif
}
// namespace internal
}
// namespace simd
}
// namespace paddle
paddle/math/SIMDFunctions.h
浏览文件 @
38fa74ed
...
...
@@ -128,7 +128,7 @@ void decayL1AvxImpl(
template
<
>
inline
void
addTo
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
addToImpl
(
a
,
b
,
len
);
#else
naive
::
addTo
(
a
,
b
,
len
);
...
...
@@ -137,7 +137,7 @@ inline void addTo(float* a, const float* b, size_t len) {
template
<
>
inline
void
batchAddTo
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
batchAddToImpl
(
a
,
b
,
batch
,
len
);
#else
naive
::
batchAddTo
(
a
,
b
,
batch
,
len
);
...
...
@@ -146,7 +146,7 @@ inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
template
<
>
inline
void
colMax
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
colMaxImpl
(
result
,
data
,
dim
,
numSamples
);
#else
naive
::
colMax
(
result
,
data
,
dim
,
numSamples
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录