Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
38fa74ed
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
38fa74ed
编写于
3月 24, 2017
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix cmake error of failing to find UINT64_MAX.
上级
f261dc6a
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
133 addition
and
136 deletion
+133
-136
CMakeLists.txt
CMakeLists.txt
+1
-1
cmake/configure.cmake
cmake/configure.cmake
+11
-5
cmake/simd.cmake
cmake/simd.cmake
+1
-10
paddle/math/SIMDFunctions.cpp
paddle/math/SIMDFunctions.cpp
+117
-117
paddle/math/SIMDFunctions.h
paddle/math/SIMDFunctions.h
+3
-3
未找到文件。
CMakeLists.txt
浏览文件 @
38fa74ed
...
...
@@ -25,6 +25,7 @@ find_package(Git REQUIRED)
find_package
(
Threads REQUIRED
)
include
(
system
)
include
(
simd
)
################################ Configurations #######################################
option
(
WITH_GPU
"Compile PaddlePaddle with NVIDIA GPU"
${
CUDA_FOUND
}
)
...
...
@@ -64,7 +65,6 @@ include(external/openblas) # download, build, install openblas
include
(
external/swig
)
# download, build, install swig
include
(
external/warpctc
)
# download, build, install warpctc
include
(
simd
)
# set simd flag
include
(
package
)
# set paddle packages
include
(
cpplint
)
# set paddle c++ style
include
(
ccache
)
# set ccache for compilation
...
...
cmake/configure.cmake
浏览文件 @
38fa74ed
...
...
@@ -32,6 +32,16 @@ if(NOT WITH_PROFILER)
add_definitions
(
-DPADDLE_DISABLE_PROFILER
)
endif
(
NOT WITH_PROFILER
)
if
(
NEON_FOUND
)
set
(
SIMD_FLAG
${
NEON_FLAG
}
)
else
(
NEON_FOUND
)
if
(
WITH_AVX
)
set
(
SIMD_FLAG
${
AVX_FLAG
}
)
else
(
WITH_AVX
)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
(
WITH_AVX
)
endif
(
NEON_FOUND
)
if
(
NOT WITH_GPU
)
add_definitions
(
-DPADDLE_ONLY_CPU
)
add_definitions
(
-DHPPL_STUB_FUNC
)
...
...
@@ -48,11 +58,7 @@ else()
message
(
FATAL_ERROR
"Paddle need cudnn to compile"
)
endif
()
if
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
AVX_FLAG
}
"
)
else
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SSE3_FLAG
}
"
)
endif
(
WITH_AVX
)
set
(
CUDA_NVCC_FLAGS
${
CUDA_NVCC_FLAGS
}
"-Xcompiler
${
SIMD_FLAG
}
"
)
# Include cuda and cudnn
include_directories
(
${
CUDNN_INCLUDE_DIR
}
)
...
...
cmake/simd.cmake
浏览文件 @
38fa74ed
...
...
@@ -88,14 +88,5 @@ int main()
return 0;
}"
NEON_FOUND
)
if
(
NEON_FOUND
)
set
(
SIMD_FLAG
${
NEON_FLAG
}
)
else
(
NEON_FOUND
)
if
(
WITH_AVX
)
set
(
SIMD_FLAG
${
AVX_FLAG
}
)
else
(
WITH_AVX
)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
(
WITH_AVX
)
endif
(
NEON_FOUND
)
set
(
CMAKE_REQUIRED_FLAGS
""
)
mark_as_advanced
(
MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND NEON_FOUND
)
paddle/math/SIMDFunctions.cpp
浏览文件 @
38fa74ed
...
...
@@ -13,122 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "SIMDFunctions.h"
#ifdef __SSE__
#ifdef __SSE
3
__
#include <immintrin.h>
#endif
#include <algorithm>
#ifdef __SSE__
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
,
b
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
mb0
=
_mm_load_ps
(
b
);
mb1
=
_mm_load_ps
(
b
+
4
);
mb2
=
_mm_load_ps
(
b
+
8
);
mb3
=
_mm_load_ps
(
b
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
a
[
i
]
+=
b
[
i
];
}
static
void
batch_addto_sse
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
mb0
=
_mm_load_ps
(
b
[
i
]);
mb1
=
_mm_load_ps
(
b
[
i
]
+
4
);
mb2
=
_mm_load_ps
(
b
[
i
]
+
8
);
mb3
=
_mm_load_ps
(
b
[
i
]
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
b
[
i
]
+=
16
;
}
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
{
for
(
int
k
=
0
;
k
<
batch
;
k
++
)
a
[
i
]
+=
b
[
k
][
i
];
}
return
;
}
static
void
col_max_sse
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
// first sample, direct copy
for
(
int
d
=
0
;
d
<
dim
;
++
d
)
{
result
[
d
]
=
data
[
d
];
}
int
offset
=
dim
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
// first 16n dims
for
(
int
k
=
0
;
k
<
dim
/
16
;
k
++
,
result
+=
16
,
data
+=
16
)
{
ma0
=
_mm_load_ps
(
result
);
ma1
=
_mm_load_ps
(
result
+
4
);
ma2
=
_mm_load_ps
(
result
+
8
);
ma3
=
_mm_load_ps
(
result
+
12
);
for
(
int
i
=
1
;
i
<
numSamples
;
i
++
)
{
mb0
=
_mm_load_ps
(
data
+
i
*
dim
);
mb1
=
_mm_load_ps
(
data
+
i
*
dim
+
4
);
mb2
=
_mm_load_ps
(
data
+
i
*
dim
+
8
);
mb3
=
_mm_load_ps
(
data
+
i
*
dim
+
12
);
ma0
=
_mm_max_ps
(
ma0
,
mb0
);
ma1
=
_mm_max_ps
(
ma1
,
mb1
);
ma2
=
_mm_max_ps
(
ma2
,
mb2
);
ma3
=
_mm_max_ps
(
ma3
,
mb3
);
}
_mm_store_ps
(
result
,
ma0
);
_mm_store_ps
(
result
+
4
,
ma1
);
_mm_store_ps
(
result
+
8
,
ma2
);
_mm_store_ps
(
result
+
12
,
ma3
);
}
// last dims
for
(
int
d
=
0
;
d
<
offset
;
++
d
)
{
float
sm
=
data
[
d
];
for
(
int
i
=
1
;
i
<
numSamples
;
++
i
)
{
sm
=
std
::
max
(
sm
,
data
[
i
*
dim
+
d
]);
}
result
[
d
]
=
sm
;
}
}
#elif defined(__AVX__)
#ifdef __AVX__
static
void
addto_avx
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
32
;
...
...
@@ -358,18 +248,128 @@ static void decayL1_avx(
}
}
#elif defined(__SSE3__)
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
,
b
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
mb0
=
_mm_load_ps
(
b
);
mb1
=
_mm_load_ps
(
b
+
4
);
mb2
=
_mm_load_ps
(
b
+
8
);
mb3
=
_mm_load_ps
(
b
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
a
[
i
]
+=
b
[
i
];
}
static
void
batch_addto_sse
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
for
(
unsigned
int
k
=
0
;
k
<
len
/
16
;
k
++
,
a
+=
16
)
{
ma0
=
_mm_load_ps
(
a
);
ma1
=
_mm_load_ps
(
a
+
4
);
ma2
=
_mm_load_ps
(
a
+
8
);
ma3
=
_mm_load_ps
(
a
+
12
);
for
(
int
i
=
0
;
i
<
batch
;
i
++
)
{
mb0
=
_mm_load_ps
(
b
[
i
]);
mb1
=
_mm_load_ps
(
b
[
i
]
+
4
);
mb2
=
_mm_load_ps
(
b
[
i
]
+
8
);
mb3
=
_mm_load_ps
(
b
[
i
]
+
12
);
ma0
=
_mm_add_ps
(
ma0
,
mb0
);
ma1
=
_mm_add_ps
(
ma1
,
mb1
);
ma2
=
_mm_add_ps
(
ma2
,
mb2
);
ma3
=
_mm_add_ps
(
ma3
,
mb3
);
b
[
i
]
+=
16
;
}
_mm_store_ps
(
a
,
ma0
);
_mm_store_ps
(
a
+
4
,
ma1
);
_mm_store_ps
(
a
+
8
,
ma2
);
_mm_store_ps
(
a
+
12
,
ma3
);
}
for
(
int
i
=
0
;
i
<
offset
;
i
++
)
{
for
(
int
k
=
0
;
k
<
batch
;
k
++
)
a
[
i
]
+=
b
[
k
][
i
];
}
return
;
}
static
void
col_max_sse
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
// first sample, direct copy
for
(
int
d
=
0
;
d
<
dim
;
++
d
)
{
result
[
d
]
=
data
[
d
];
}
int
offset
=
dim
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
mb0
,
mb1
,
mb2
,
mb3
;
// first 16n dims
for
(
int
k
=
0
;
k
<
dim
/
16
;
k
++
,
result
+=
16
,
data
+=
16
)
{
ma0
=
_mm_load_ps
(
result
);
ma1
=
_mm_load_ps
(
result
+
4
);
ma2
=
_mm_load_ps
(
result
+
8
);
ma3
=
_mm_load_ps
(
result
+
12
);
for
(
int
i
=
1
;
i
<
numSamples
;
i
++
)
{
mb0
=
_mm_load_ps
(
data
+
i
*
dim
);
mb1
=
_mm_load_ps
(
data
+
i
*
dim
+
4
);
mb2
=
_mm_load_ps
(
data
+
i
*
dim
+
8
);
mb3
=
_mm_load_ps
(
data
+
i
*
dim
+
12
);
ma0
=
_mm_max_ps
(
ma0
,
mb0
);
ma1
=
_mm_max_ps
(
ma1
,
mb1
);
ma2
=
_mm_max_ps
(
ma2
,
mb2
);
ma3
=
_mm_max_ps
(
ma3
,
mb3
);
}
_mm_store_ps
(
result
,
ma0
);
_mm_store_ps
(
result
+
4
,
ma1
);
_mm_store_ps
(
result
+
8
,
ma2
);
_mm_store_ps
(
result
+
12
,
ma3
);
}
// last dims
for
(
int
d
=
0
;
d
<
offset
;
++
d
)
{
float
sm
=
data
[
d
];
for
(
int
i
=
1
;
i
<
numSamples
;
++
i
)
{
sm
=
std
::
max
(
sm
,
data
[
i
*
dim
+
d
]);
}
result
[
d
]
=
sm
;
}
}
#endif
#ifdef __SSE__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#elif __AVX__
#if defined(__AVX__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#elif defined(__SSE3__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#endif
namespace
paddle
{
namespace
simd
{
namespace
internal
{
#ifdef __SSE__
#ifdef __SSE
3
__
void
addToImpl
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
SIMD_INVOKE
(
addto
,
a
,
b
,
len
);
}
...
...
@@ -390,8 +390,8 @@ void decayL1AvxImpl(
float
*
dst
,
float
*
src
,
float
*
lr
,
float
lambda
,
size_t
len
)
{
decayL1_avx
(
dst
,
src
,
lr
,
lambda
,
len
);
}
#endif
}
// namespace internal
}
// namespace simd
}
// namespace paddle
paddle/math/SIMDFunctions.h
浏览文件 @
38fa74ed
...
...
@@ -128,7 +128,7 @@ void decayL1AvxImpl(
template
<
>
inline
void
addTo
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
addToImpl
(
a
,
b
,
len
);
#else
naive
::
addTo
(
a
,
b
,
len
);
...
...
@@ -137,7 +137,7 @@ inline void addTo(float* a, const float* b, size_t len) {
template
<
>
inline
void
batchAddTo
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
batchAddToImpl
(
a
,
b
,
batch
,
len
);
#else
naive
::
batchAddTo
(
a
,
b
,
batch
,
len
);
...
...
@@ -146,7 +146,7 @@ inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
template
<
>
inline
void
colMax
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
#ifdef __SSE__
#ifdef __SSE
3
__
internal
::
colMaxImpl
(
result
,
data
,
dim
,
numSamples
);
#else
naive
::
colMax
(
result
,
data
,
dim
,
numSamples
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录