Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
2a601e02
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2a601e02
编写于
3月 23, 2017
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Set the simd-related kernels used under arm toolchains.
上级
909cc6f0
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
490 addition
and
5 deletion
+490
-5
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+4
-0
paddle/cuda/include/hl_matrix_base.cuh
paddle/cuda/include/hl_matrix_base.cuh
+2
-0
paddle/cuda/include/hl_matrix_base_neon.cuh
paddle/cuda/include/hl_matrix_base_neon.cuh
+161
-0
paddle/cuda/include/hl_neon_matrix_kernel.cuh
paddle/cuda/include/hl_neon_matrix_kernel.cuh
+299
-0
paddle/math/SIMDFunctions.cpp
paddle/math/SIMDFunctions.cpp
+9
-4
paddle/math/SIMDFunctions.h
paddle/math/SIMDFunctions.h
+12
-0
paddle/utils/tests/test_SIMDFlags.cpp
paddle/utils/tests/test_SIMDFlags.cpp
+3
-1
未找到文件。
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
浏览文件 @
2a601e02
...
@@ -17,7 +17,11 @@ limitations under the License. */
...
@@ -17,7 +17,11 @@ limitations under the License. */
#include <stdio.h>
#include <stdio.h>
#include "hl_base.h"
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#include "hl_sse_matrix_kernel.cuh"
#endif
/**
/**
* @brief cpu element wise unary operator.
* @brief cpu element wise unary operator.
...
...
paddle/cuda/include/hl_matrix_base.cuh
浏览文件 @
2a601e02
...
@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
...
@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
typedef
BaseOp
SSEFirst
;
typedef
BaseOp
SSEFirst
;
typedef
BaseOp
SSESecond
;
typedef
BaseOp
SSESecond
;
typedef
BaseOp
SSEClassificationError
;
typedef
BaseOp
SSEClassificationError
;
#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#else
#include "hl_matrix_base_sse.cuh"
#include "hl_matrix_base_sse.cuh"
#endif
#endif
...
...
paddle/cuda/include/hl_matrix_base_neon.cuh
0 → 100644
浏览文件 @
2a601e02
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_NEON_CUH_
#define HL_MATRIX_BASE_NEON_CUH_
namespace
aggregate
{
class
SSESum
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
}
};
class
SSEMax
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmaxq_f32
(
a
,
b
);
}
};
class
SSEMin
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vminq_f32
(
a
,
b
);
}
};
}
// namespace aggregate
namespace
base
{
namespace
unary
{
class
SSEIdentity
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
)
const
{
return
a
;
}
};
}
// namespace unary
namespace
binary
{
class
SSEAdd
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
}
};
class
SSEAdd2
{
public:
static
const
bool
sse
=
true
;
const
real
p1
;
const
real
p2
;
float32x4_t
mp1
;
float32x4_t
mp2
;
public:
SSEAdd2
(
const
real
s1
,
const
real
s2
)
:
p1
(
s1
),
p2
(
s2
)
{
mp1
=
vdupq_n_f32
(
p1
);
mp2
=
vdupq_n_f32
(
p2
);
}
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp1
,
tmp2
;
tmp1
=
vmulq_f32
(
mp1
,
a
);
tmp2
=
vmulq_f32
(
mp2
,
b
);
return
vaddq_f32
(
tmp1
,
tmp2
);
}
};
class
SSESub
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vsubq_f32
(
a
,
b
);
}
};
class
SSEMul
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmulq_f32
(
a
,
b
);
}
};
class
SSEDiv
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vrecpeq_f32
(
b
);
return
vmulq_f32
(
a
,
tmp
);
}
};
class
SSESquaredDiff
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vsubq_f32
(
a
,
b
);
return
vmulq_f32
(
tmp
,
tmp
);
}
};
class
SSEFirst
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
a
;
}
};
class
SSESecond
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
b
;
}
};
class
SSEClassificationError
{
public:
static
const
bool
sse
=
true
;
const
real
p
;
float32x4_t
mp
;
uint32x4_t
result
;
public:
explicit
SSEClassificationError
(
const
real
s
)
:
p
(
s
)
{
mp
=
vdupq_n_f32
(
p
);
result
=
vdupq_n_u32
(
1
);
}
// TODO: to be check
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
uint32x4_t
tmp1
=
vcgtq_f32
(
a
,
mp
);
uint32x4_t
tmp2
=
vcgtq_f32
(
b
,
mp
);
uint32x4_t
tmp3
=
veorq_u32
(
tmp1
,
tmp2
);
return
vcvtq_f32_u32
(
vandq_u32
(
tmp3
,
result
));
}
};
}
// namespace binary
}
// namespace base
#endif
/* HL_MATRIX_BASE_NEON_CUH_ */
paddle/cuda/include/hl_neon_matrix_kernel.cuh
0 → 100644
浏览文件 @
2a601e02
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
inline
bool
hl_check_align
(
size_t
size
)
{
return
!
(
size
&
(
VECTOR_SIZE
-
1
));
}
inline
bool
hl_check_align
(
void
*
ptr
)
{
return
hl_check_align
(
reinterpret_cast
<
size_t
>
(
ptr
));
}
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
float32x4_t
rev
=
vrev64q_f32
(
mm
);
float32x4_t
tmp1
=
agg
.
vecOp
(
rev
,
rev
);
float32x2_t
lo
=
vget_high_f32
(
rev
);
float32x2_t
hi
=
vget_low_f32
(
rev
);
float32x4_t
tmp2
=
vcombine_f32
(
hi
,
lo
);
float32x4_t
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
return
vgetq_lane_f32
(
ret
,
0
);
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
,
B
+=
ldb
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
vecType
*
b
=
(
vecType
*
)(
B
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
,
b
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
,
*
b
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
real
*
b
=
B
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
],
b
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
);
}
}
/*
* dimN is multiples of VECTOR_LEN
* dimN greater than Step
*/
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
}
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
B
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
,
B
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
#endif
/* HL_NEON_MATRIX_KERNEL_CUH_ */
paddle/math/SIMDFunctions.cpp
浏览文件 @
2a601e02
...
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
...
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "SIMDFunctions.h"
#include "SIMDFunctions.h"
#ifdef __SSE__
#include <immintrin.h>
#include <immintrin.h>
#endif
#include <algorithm>
#include <algorithm>
#if
ndef __AVX
__
#if
def __SSE
__
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
static
void
addto_sse
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
16
;
int
offset
=
len
%
16
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
__m128
ma0
,
ma1
,
ma2
,
ma3
;
...
@@ -125,7 +127,8 @@ static void col_max_sse(float* result,
...
@@ -125,7 +127,8 @@ static void col_max_sse(float* result,
}
}
}
}
#else
#elif defined(__AVX__)
static
void
addto_avx
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
static
void
addto_avx
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
int
offset
=
len
%
32
;
int
offset
=
len
%
32
;
...
@@ -357,15 +360,16 @@ static void decayL1_avx(
...
@@ -357,15 +360,16 @@ static void decayL1_avx(
#endif
#endif
#if
ndef __AVX
__
#if
def __SSE
__
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
#el
se
#el
if __AVX__
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
#endif
#endif
namespace
paddle
{
namespace
paddle
{
namespace
simd
{
namespace
simd
{
namespace
internal
{
namespace
internal
{
#ifdef __SSE__
void
addToImpl
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
void
addToImpl
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
SIMD_INVOKE
(
addto
,
a
,
b
,
len
);
SIMD_INVOKE
(
addto
,
a
,
b
,
len
);
}
}
...
@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
...
@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
void
colMaxImpl
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
void
colMaxImpl
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
SIMD_INVOKE
(
col_max
,
result
,
data
,
dim
,
numSamples
);
SIMD_INVOKE
(
col_max
,
result
,
data
,
dim
,
numSamples
);
}
}
#endif
#ifdef __AVX__
#ifdef __AVX__
void
decayL1AvxImpl
(
float
*
dst
,
float
*
src
,
float
lambda
,
size_t
len
)
{
void
decayL1AvxImpl
(
float
*
dst
,
float
*
src
,
float
lambda
,
size_t
len
)
{
...
...
paddle/math/SIMDFunctions.h
浏览文件 @
2a601e02
...
@@ -128,17 +128,29 @@ void decayL1AvxImpl(
...
@@ -128,17 +128,29 @@ void decayL1AvxImpl(
template
<
>
template
<
>
inline
void
addTo
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
inline
void
addTo
(
float
*
a
,
const
float
*
b
,
size_t
len
)
{
#ifdef __SSE__
internal
::
addToImpl
(
a
,
b
,
len
);
internal
::
addToImpl
(
a
,
b
,
len
);
#else
naive
::
addTo
(
a
,
b
,
len
);
#endif
}
}
template
<
>
template
<
>
inline
void
batchAddTo
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
inline
void
batchAddTo
(
float
*
a
,
const
float
*
b
[],
int
batch
,
size_t
len
)
{
#ifdef __SSE__
internal
::
batchAddToImpl
(
a
,
b
,
batch
,
len
);
internal
::
batchAddToImpl
(
a
,
b
,
batch
,
len
);
#else
naive
::
batchAddTo
(
a
,
b
,
batch
,
len
);
#endif
}
}
template
<
>
template
<
>
inline
void
colMax
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
inline
void
colMax
(
float
*
result
,
const
float
*
data
,
int
dim
,
int
numSamples
)
{
#ifdef __SSE__
internal
::
colMaxImpl
(
result
,
data
,
dim
,
numSamples
);
internal
::
colMaxImpl
(
result
,
data
,
dim
,
numSamples
);
#else
naive
::
colMax
(
result
,
data
,
dim
,
numSamples
);
#endif
}
}
template
<
>
template
<
>
...
...
paddle/utils/tests/test_SIMDFlags.cpp
浏览文件 @
2a601e02
...
@@ -18,7 +18,8 @@ limitations under the License. */
...
@@ -18,7 +18,8 @@ limitations under the License. */
using
namespace
paddle
;
// NOLINT
using
namespace
paddle
;
// NOLINT
TEST
(
SIMDFlags
,
gccTest
)
{
TEST
(
SIMDFlags
,
gccTest
)
{
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
!defined(__arm__)
// clang-format off
// clang-format off
CHECK
(
!
__builtin_cpu_supports
(
"sse"
)
!=
HAS_SSE
);
CHECK
(
!
__builtin_cpu_supports
(
"sse"
)
!=
HAS_SSE
);
CHECK
(
!
__builtin_cpu_supports
(
"sse2"
)
!=
HAS_SSE2
);
CHECK
(
!
__builtin_cpu_supports
(
"sse2"
)
!=
HAS_SSE2
);
...
@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
...
@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
LOG
(
INFO
)
<<
"Has AVX: "
<<
std
::
boolalpha
<<
HAS_AVX
;
LOG
(
INFO
)
<<
"Has AVX: "
<<
std
::
boolalpha
<<
HAS_AVX
;
LOG
(
INFO
)
<<
"Has AVX2: "
<<
std
::
boolalpha
<<
HAS_AVX2
;
LOG
(
INFO
)
<<
"Has AVX2: "
<<
std
::
boolalpha
<<
HAS_AVX2
;
LOG
(
INFO
)
<<
"Has AVX512: "
<<
std
::
boolalpha
<<
HAS_AVX512
;
LOG
(
INFO
)
<<
"Has AVX512: "
<<
std
::
boolalpha
<<
HAS_AVX512
;
LOG
(
INFO
)
<<
"Has NEON: "
<<
std
::
boolalpha
<<
HAS_NEON
;
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录