Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit 430adf43
Authored May 31, 2017 by Liu Yiqun

Move the definition of hl_vec_add/sub/mul/div/max/min to hl_tensor_ops.h

Parent: 8f5d22b0
Showing 6 changed files with 215 additions and 106 deletions (+215 −106).
paddle/cuda/include/hl_cpu_scalar.cuh          +0    -24
paddle/cuda/include/hl_cpu_simd_neon.cuh       +0    -25
paddle/cuda/include/hl_cpu_simd_sse.cuh        +0    -48
paddle/cuda/include/hl_matrix_base_detail.cuh  +11   -9
paddle/cuda/include/hl_matrix_type.cuh         +2    -0
paddle/cuda/include/hl_tensor_ops.h            +202  -0
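The pattern behind the whole commit: the free functions hl_vec_add/sub/mul/div/max/min that each SIMD backend defined separately are folded into hppl::binary functor templates, whose primary templates handle scalars and whose explicit specializations handle SIMD vector types. A minimal self-contained sketch of that functor-template pattern (plain C++; the names mirror the diff, but this snippet is illustrative and not part of the commit):

#include <cstdio>

#define INLINE inline

namespace hppl {
namespace binary {

// Primary template: works for any scalar type (real in Paddle).
template <class T>
class add {
public:
  INLINE T operator()(const T a, const T b) const { return a + b; }
};

// A SIMD backend contributes an explicit specialization, e.g.
// add<__m128> wrapping _mm_add_ps, as the last diff below adds.

}  // namespace binary
}  // namespace hppl

int main() {
  // Default-construct the functor and invoke it immediately --
  // the exact call shape used in hl_matrix_base_detail.cuh.
  std::printf("%g\n", hppl::binary::add<float>()(1.5f, 2.0f));  // prints 3.5
  return 0;
}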
paddle/cuda/include/hl_cpu_scalar.cuh

@@ -40,30 +40,6 @@ INLINE real hl_vec_set(const real r) {
   return r;
 }
 
-INLINE real hl_vec_max(const real a, const real b) {
-  return a > b ? a : b;
-}
-
-INLINE real hl_vec_min(const real a, const real b) {
-  return a > b ? b : a;
-}
-
-INLINE real hl_vec_add(const real a, const real b) {
-  return a + b;
-}
-
-INLINE real hl_vec_sub(const real a, const real b) {
-  return a - b;
-}
-
-INLINE real hl_vec_mul(const real a, const real b) {
-  return a * b;
-}
-
-INLINE real hl_vec_div(const real a, const real b) {
-  return a / b;
-}
-
 INLINE real hl_vec_classification_error(const real a,
                                         const real b,
                                         const real p,
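Note that the deleted scalar hl_vec_max uses a > b ? a : b, while the generic hppl::binary::max in hl_tensor_ops.h (visible as context in the last diff below) uses a < b ? b : a; the two agree for every ordering, including ties, so dropping the scalar copies is behavior-preserving (the same presumably holds for min, whose primary template is not shown in this diff). A quick standalone check, illustrative and not from the commit:

#include <cassert>

int main() {
  // Exhaustive over orderings: a < b, a > b, a == b.
  const double cases[3][2] = {{1.0, 2.0}, {2.0, 1.0}, {1.0, 1.0}};
  for (const auto& c : cases) {
    double a = c[0], b = c[1];
    assert((a > b ? a : b) == (a < b ? b : a));  // old hl_vec_max vs generic max
    assert((a > b ? b : a) == (a < b ? a : b));  // old hl_vec_min vs assumed generic min
  }
  return 0;
}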
paddle/cuda/include/hl_cpu_simd_neon.cuh

@@ -44,31 +44,6 @@ inline float32x4_t hl_vec_set(const real f) {
   return vdupq_n_f32(f);
 }
 
-inline float32x4_t hl_vec_max(const float32x4_t a, const float32x4_t b) {
-  return vmaxq_f32(a, b);
-}
-
-inline float32x4_t hl_vec_min(const float32x4_t a, const float32x4_t b) {
-  return vminq_f32(a, b);
-}
-
-inline float32x4_t hl_vec_add(const float32x4_t a, const float32x4_t b) {
-  return vaddq_f32(a, b);
-}
-
-inline float32x4_t hl_vec_sub(const float32x4_t a, const float32x4_t b) {
-  return vsubq_f32(a, b);
-}
-
-inline float32x4_t hl_vec_mul(const float32x4_t a, const float32x4_t b) {
-  return vmulq_f32(a, b);
-}
-
-inline float32x4_t hl_vec_div(const float32x4_t a, const float32x4_t b) {
-  float32x4_t tmp = vrecpeq_f32(b);
-  return vmulq_f32(a, tmp);
-}
-
 inline float32x4_t hl_vec_classification_error(const float32x4_t a,
                                                const float32x4_t b,
                                                const float32x4_t p,
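One behavioral wrinkle worth noting: the NEON hl_vec_div (and the div<float32x4_t> specialization that replaces it in hl_tensor_ops.h) divides via vrecpeq_f32, which yields only a rough (~8-bit) reciprocal estimate, unlike the exact _mm_div_ps on SSE. A common refinement, shown here as a hypothetical helper and not part of this commit, adds Newton-Raphson steps with vrecpsq_f32:

#include <arm_neon.h>

// Hypothetical refined division; not in the commit. Each vrecpsq_f32 call
// computes (2 - b*r), so r *= (2 - b*r) roughly doubles the precision.
static inline float32x4_t hl_vec_div_refined(float32x4_t a, float32x4_t b) {
  float32x4_t r = vrecpeq_f32(b);       // initial ~8-bit estimate of 1/b
  r = vmulq_f32(r, vrecpsq_f32(b, r));  // first Newton-Raphson step
  r = vmulq_f32(r, vrecpsq_f32(b, r));  // second step: near float precision
  return vmulq_f32(a, r);               // a * (1/b)
}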
paddle/cuda/include/hl_cpu_simd_sse.cuh

@@ -45,30 +45,6 @@ inline __m128 hl_vec_set(const real f) {
   return _mm_set_ps1(f);
 }
 
-inline __m128 hl_vec_max(const __m128 a, const __m128 b) {
-  return _mm_max_ps(a, b);
-}
-
-inline __m128 hl_vec_min(const __m128 a, const __m128 b) {
-  return _mm_min_ps(a, b);
-}
-
-inline __m128 hl_vec_add(const __m128 a, const __m128 b) {
-  return _mm_add_ps(a, b);
-}
-
-inline __m128 hl_vec_sub(const __m128 a, const __m128 b) {
-  return _mm_sub_ps(a, b);
-}
-
-inline __m128 hl_vec_mul(const __m128 a, const __m128 b) {
-  return _mm_mul_ps(a, b);
-}
-
-inline __m128 hl_vec_div(const __m128 a, const __m128 b) {
-  return _mm_div_ps(a, b);
-}
-
 inline __m128 hl_vec_classification_error(const __m128 a,
                                           const __m128 b,
                                           const __m128 p,

@@ -103,30 +79,6 @@ inline __m128d hl_vec_set(const real d) {
 #endif
 }
 
-inline __m128d hl_vec_max(const __m128d a, const __m128d b) {
-  return _mm_max_pd(a, b);
-}
-
-inline __m128d hl_vec_min(const __m128d a, const __m128d b) {
-  return _mm_min_pd(a, b);
-}
-
-inline __m128d hl_vec_add(const __m128d a, const __m128d b) {
-  return _mm_add_pd(a, b);
-}
-
-inline __m128d hl_vec_sub(const __m128d a, const __m128d b) {
-  return _mm_sub_pd(a, b);
-}
-
-inline __m128d hl_vec_mul(const __m128d a, const __m128d b) {
-  return _mm_mul_pd(a, b);
-}
-
-inline __m128d hl_vec_div(const __m128d a, const __m128d b) {
-  return _mm_div_pd(a, b);
-}
-
 inline __m128d hl_vec_classification_error(const __m128d a,
                                            const __m128d b,
                                            const __m128d p,
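Each SSE wrapper is a thin, one-intrinsic function, so nothing is lost by re-expressing them as functor specializations. A small standalone check of the lane-wise semantics (illustrative only; note _mm_set_ps takes lanes in reverse order):

#include <cstdio>
#include <xmmintrin.h>

int main() {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lanes: 1 2 3 4
  __m128 b = _mm_set_ps(1.0f, 5.0f, 2.5f, 0.0f);  // lanes: 0 2.5 5 1
  float out[4];
  _mm_storeu_ps(out, _mm_max_ps(a, b));           // lane-wise maximum
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2.5 5 4
  return 0;
}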
paddle/cuda/include/hl_matrix_base_detail.cuh

@@ -16,13 +16,14 @@ limitations under the License. */
 #define HL_MATRIX_BASE_DETAIL_CUH_
 
 #include "hl_matrix_type.cuh"
+#include "hl_tensor_ops.h"
 
 namespace aggregate {
 class SSESum {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_add(a, b);
+    return hppl::binary::add<vecType>()(a, b);
   }
 };

@@ -30,7 +31,7 @@ class SSEMax {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_max(a, b);
+    return hppl::binary::max<vecType>()(a, b);
   }
 };

@@ -38,7 +39,7 @@ class SSEMin {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_min(a, b);
+    return hppl::binary::min<vecType>()(a, b);
   }
 };
 }  // namespace aggregate

@@ -59,7 +60,7 @@ class SSEAdd {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_add(a, b);
+    return hppl::binary::add<vecType>()(a, b);
   }
 };

@@ -77,7 +78,7 @@ public:
     mp2 = hl_vec_set(p2);
   }
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_add(hl_vec_mul(mp1, a), hl_vec_mul(mp2, b));
+    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
   }
 };

@@ -85,7 +86,7 @@ class SSESub {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_sub(a, b);
+    return hppl::binary::sub<vecType>()(a, b);
   }
 };

@@ -93,7 +94,7 @@ class SSEMul {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_mul(a, b);
+    return hppl::binary::mul<vecType>()(a, b);
   }
 };

@@ -101,7 +102,7 @@ class SSEDiv {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_div(a, b);
+    return hppl::binary::div<vecType>()(a, b);
  }
 };

@@ -109,7 +110,8 @@ class SSESquaredDiff {
 public:
   static const bool sse = VECTOR_SIMD;
   INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_mul(hl_vec_sub(a, b), hl_vec_sub(a, b));
+    vecType tmp = hppl::binary::sub<vecType>()(a, b);
+    return hppl::binary::mul<vecType>()(tmp, tmp);
   }
 };
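These classes expose the same vecOp shape before and after the change; only the body now delegates to hppl::binary. A sketch of how such a functor class is driven over buffers, one vector at a time (apply_binary_op is a hypothetical name, not a Paddle API):

// Sketch only. Op is any class above with a vecOp(a, b) member, e.g. SSESum
// or SSEAdd; VecT stands in for vecType (__m128 / __m128d / float32x4_t / real).
template <class Op, class VecT>
void apply_binary_op(const Op& op, const VecT* a, const VecT* b,
                     VecT* out, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = op.vecOp(a[i], b[i]);  // one SIMD vector (or scalar) per step
  }
}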
paddle/cuda/include/hl_matrix_type.cuh

@@ -38,10 +38,12 @@ typedef double2 vecType;
 #endif
 #elif defined(__SSE3__)
 #include "hl_cpu_simd_sse.cuh"
+#define PADDLE_USE_SSE3
 #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
 // Currently nvcc does not support neon intrinsic.
 // TODO: Extract simd intrinsic implementation from .cu files.
 #include "hl_cpu_simd_neon.cuh"
+#define PADDLE_USE_NEON
 #else
 #include "hl_cpu_scalar.cuh"
 #endif
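The two new macros record which backend hl_matrix_type.cuh selected, so that hl_tensor_ops.h (next diff) can compile the matching specializations. A sketch of code keying off them (kVecLanes is an illustrative name, not from the commit):

// Assumes hl_matrix_type.cuh has been included, defining vecType and real.
#if defined(PADDLE_USE_SSE3)
const int kVecLanes = sizeof(vecType) / sizeof(real);  // 4 floats or 2 doubles
#elif defined(PADDLE_USE_NEON)
const int kVecLanes = 4;  // float32x4_t
#else
const int kVecLanes = 1;  // scalar fallback in hl_cpu_scalar.cuh
#endif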
paddle/cuda/include/hl_tensor_ops.h

@@ -328,6 +328,208 @@ public:
   INLINE T operator()(const T a, const T b) const {
     return a < b ? b : a;
   }
 };
 
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+
+public:
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+
+public:
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vaddq_f32(a, b);
+  }
+};
+
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+
+public:
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2) : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a, const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
 
 }  // namespace binary
 }  // namespace hppl
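With these specializations in place, the same functor expression works for scalars and SIMD vectors alike. A usage sketch for the float SSE build (assumes __SSE3__ is defined and PADDLE_TYPE_DOUBLE is not; illustrative, not from the commit):

#include <xmmintrin.h>
// Assuming the Paddle headers above are on the include path:
// #include "hl_matrix_type.cuh"
// #include "hl_tensor_ops.h"

void example() {
  __m128 x = _mm_set_ps1(2.0f);
  __m128 y = _mm_set_ps1(3.0f);
  __m128 p1 = _mm_set_ps1(0.5f);
  __m128 p2 = _mm_set_ps1(0.25f);

  // Stateless functor: default-construct, then call.
  __m128 s = hppl::binary::add<__m128>()(x, y);              // 5.0 in every lane

  // Stateful functor: scales are captured at construction.
  __m128 w = hppl::binary::add_scale<__m128>(p1, p2)(x, y);  // 0.5*x + 0.25*y = 1.75
  (void)s;
  (void)w;
}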