Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
e36e24d4
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e36e24d4
编写于
6月 05, 2017
作者:
Y
Yiqun Liu
提交者:
GitHub
6月 05, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2299 from Xreki/support_scalar_kernels
Centralize the use of simd intrinsic and implement scalar kernels.
上级
4537b7bc
7fb0684a
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
656 addition
and
654 deletion
+656
-654
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+3
-33
paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+57
-65
paddle/cuda/include/hl_cpu_scalar.cuh
paddle/cuda/include/hl_cpu_scalar.cuh
+50
-0
paddle/cuda/include/hl_cpu_simd_neon.cuh
paddle/cuda/include/hl_cpu_simd_neon.cuh
+73
-0
paddle/cuda/include/hl_cpu_simd_sse.cuh
paddle/cuda/include/hl_cpu_simd_sse.cuh
+94
-0
paddle/cuda/include/hl_matrix_base.cuh
paddle/cuda/include/hl_matrix_base.cuh
+2
-24
paddle/cuda/include/hl_matrix_base_detail.cuh
paddle/cuda/include/hl_matrix_base_detail.cuh
+153
-0
paddle/cuda/include/hl_matrix_base_sse.cuh
paddle/cuda/include/hl_matrix_base_sse.cuh
+0
-211
paddle/cuda/include/hl_matrix_type.cuh
paddle/cuda/include/hl_matrix_type.cuh
+22
-22
paddle/cuda/include/hl_neon_matrix_kernel.cuh
paddle/cuda/include/hl_neon_matrix_kernel.cuh
+0
-299
paddle/cuda/include/hl_tensor_ops.h
paddle/cuda/include/hl_tensor_ops.h
+202
-0
未找到文件。
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
浏览文件 @
e36e24d4
...
...
@@ -17,10 +17,9 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#ifndef __CUDA_ARCH__
#include "hl_cpu_matrix_kernel_detail.cuh"
#endif
/**
...
...
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_cpu_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
...
...
paddle/cuda/include/hl_
sse_matrix_kerne
l.cuh
→
paddle/cuda/include/hl_
cpu_matrix_kernel_detai
l.cuh
浏览文件 @
e36e24d4
...
...
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_
SSE_MATRIX_KERNE
L_CUH_
#define HL_
SSE_MATRIX_KERNE
L_CUH_
#ifndef HL_
MATRIX_KERNEL_DETAI
L_CUH_
#define HL_
MATRIX_KERNEL_DETAI
L_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
#ifndef PADDLE_TYPE_DOUBLE
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
#else
#if defined(__APPLE__) || defined(__OSX__)
#define _mm_set_pd1 _mm_set1_pd
#endif
/* number of double in vector */
#define VECTOR_LEN 2
#define VECTOR_SET _mm_set_pd1
#endif
/**
 * Returns true when none of the bits selected by (VECTOR_SIZE - 1) are set
 * in size, i.e. size is a multiple of VECTOR_SIZE when VECTOR_SIZE is a
 * power of two.
 */
inline bool hl_check_align(size_t size) {
  const size_t lowBits = size & (VECTOR_SIZE - 1);
  return lowBits == 0;
}
...
...
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
return
hl_check_align
(
reinterpret_cast
<
size_t
>
(
ptr
));
}
#ifndef PADDLE_TYPE_DOUBLE
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128
lo
=
_mm_unpacklo_ps
(
mm
,
mm
);
__m128
hi
=
_mm_unpackhi_ps
(
mm
,
mm
);
__m128
tmp1
=
agg
.
vecOp
(
lo
,
hi
);
__m128
tmp2
=
_mm_movehl_ps
(
tmp1
,
tmp1
);
__m128
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
return
_mm_cvtss_f32
(
ret
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
#else
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128d
lo
=
_mm_unpacklo_pd
(
mm
,
mm
);
__m128d
hi
=
_mm_unpackhi_pd
(
mm
,
mm
);
__m128d
ret
=
agg
.
vecOp
(
lo
,
hi
);
return
_mm_cvtsd_f64
(
ret
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
#endif
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
...
...
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
...
...
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
}
}
#endif
/* HL_
SSE_MATRIX_KERNE
L_CUH_ */
#endif
/* HL_
MATRIX_KERNEL_DETAI
L_CUH_ */
paddle/cuda/include/hl_cpu_scalar.cuh
0 → 100644
浏览文件 @
e36e24d4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_CPU_SCALAR_CUH_
#define HL_CPU_SCALAR_CUH_
#define VECTOR_SIMD false
#define VECTOR_SET hl_vec_set
#ifndef PADDLE_TYPE_DOUBLE
/* size of float */
#define VECTOR_SIZE 4
#else
/* size of double */
#define VECTOR_SIZE 8
#endif
typedef
real
vecType
;
/* Consider a real as a vector */
#define VECTOR_LEN 1
/**
 * Scalar counterpart of the vector reduction: vecType is a single real
 * here, so the value is returned unchanged and agg is not consulted.
 */
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
  (void)agg;  // unused in the scalar path
  return mm;
}
/**
 * Scalar "broadcast": with a one-element vector type the value itself is
 * the vector, so r is returned as-is.
 */
INLINE real hl_vec_set(const real r) {
  return r;
}
/**
 * Scalar classification-error indicator.
 *
 * Returns 1.0f when exactly one of a and b is greater than the threshold p,
 * and 0.0f when both comparisons agree. The parameter r is not read.
 */
INLINE real hl_vec_classification_error(const real a,
                                        const real b,
                                        const real p,
                                        const real r) {
  (void)r;  // unused in the scalar path
  const bool aAbove = a > p;
  const bool bAbove = b > p;
  if (aAbove != bAbove) {
    return 1.0f;
  }
  return 0.0f;
}
#endif // HL_CPU_SCALAR_CUH_
paddle/cuda/include/hl_cpu_simd_neon.cuh
0 → 100644
浏览文件 @
e36e24d4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_CPU_SIMD_NEON_CUH_
#define HL_CPU_SIMD_NEON_CUH_
#include <arm_neon.h>
#define VECTOR_SIMD true
#define VECTOR_SIZE 16
#define VECTOR_SET hl_vec_set
#ifndef PADDLE_TYPE_DOUBLE
typedef
float32x4_t
vecType
;
/* number of float in vector */
#define VECTOR_LEN 4
/**
 * Horizontally reduces the four lanes of a NEON vector with agg.vecOp and
 * returns the scalar result taken from lane 0.
 *
 * The previous version combined the lane-swapped vector with itself and
 * then with a copy of itself, so lanes were never mixed (lane 0 ended up
 * holding only m1-derived data). The reduction is now done in two steps:
 *   1. combine each lane with its neighbour inside the 64-bit half;
 *   2. combine the two 64-bit halves.
 *
 * @param agg  aggregator exposing vecOp(float32x4_t, float32x4_t).
 * @param mm   vector [m0, m1, m2, m3] to reduce.
 * @return     (m0 op m1) op (m2 op m3).
 */
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
  // [m1, m0, m3, m2]
  float32x4_t rev = vrev64q_f32(mm);
  // lanes 0/1 hold m0 op m1, lanes 2/3 hold m2 op m3
  float32x4_t pairs = agg.vecOp(mm, rev);
  // swap the 64-bit halves so lane 0 faces the m2 op m3 partial result
  float32x4_t swapped =
      vcombine_f32(vget_high_f32(pairs), vget_low_f32(pairs));
  float32x4_t ret = agg.vecOp(pairs, swapped);
  return vgetq_lane_f32(ret, 0);
}
/**
 * Broadcasts the scalar f into all four lanes of a NEON float vector.
 */
inline float32x4_t hl_vec_set(const real f) {
  const float32x4_t broadcast = vdupq_n_f32(f);
  return broadcast;
}
/**
 * Lane-wise classification-error indicator (NEON, float).
 *
 * A lane is flagged when exactly one of a and b is greater than p. The
 * flag mask is AND-ed with r converted to unsigned integers, and the
 * result is converted back to float.
 *
 * @param a  first operand.
 * @param b  second operand.
 * @param p  per-lane threshold.
 * @param r  per-lane value selected for flagged lanes (converted to u32).
 */
inline float32x4_t hl_vec_classification_error(const float32x4_t a,
                                               const float32x4_t b,
                                               const float32x4_t p,
                                               const float32x4_t r) {
  const uint32x4_t aAbove = vcgtq_f32(a, p);
  const uint32x4_t bAbove = vcgtq_f32(b, p);
  const uint32x4_t mismatch = veorq_u32(aAbove, bAbove);
  const uint32x4_t selected = vandq_u32(mismatch, vcvtq_u32_f32(r));
  return vcvtq_f32_u32(selected);
}
#else
#ifdef __aarch64__
typedef
float64x2_t
vecType
;
/* number of double in vector */
#define VECTOR_LEN 2
#define VECTOR_SET vdupq_n_f64
#error To be implemented
#else
#error NEON instructions does not support double precision
#endif // __aarch64__
#endif
#endif // HL_CPU_SIMD_NEON_CUH_
paddle/cuda/include/hl_cpu_simd_sse.cuh
0 → 100644
浏览文件 @
e36e24d4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_CPU_SIMD_SSE_CUH_
#define HL_CPU_SIMD_SSE_CUH_
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#define VECTOR_SIMD true
#define VECTOR_SIZE 16
#define VECTOR_SET hl_vec_set
#ifndef PADDLE_TYPE_DOUBLE
typedef
__m128
vecType
;
/* number of float in vector */
#define VECTOR_LEN 4
/**
 * Horizontally reduces the four float lanes of mm with agg.vecOp and
 * returns the scalar found in lane 0.
 *
 * With mm = [m0, m1, m2, m3] the result is (m0 op m2) op (m1 op m3).
 */
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
  // [m0, m0, m1, m1] op [m2, m2, m3, m3]
  __m128 lowPairs = _mm_unpacklo_ps(mm, mm);
  __m128 highPairs = _mm_unpackhi_ps(mm, mm);
  __m128 partial = agg.vecOp(lowPairs, highPairs);
  // bring the upper two lanes down and fold them onto the lower two
  __m128 folded = agg.vecOp(partial, _mm_movehl_ps(partial, partial));
  return _mm_cvtss_f32(folded);
}
/**
 * Broadcasts the scalar f into all four lanes of an SSE float vector.
 */
inline __m128 hl_vec_set(const real f) {
  const __m128 broadcast = _mm_set_ps1(f);
  return broadcast;
}
/**
 * Lane-wise classification-error indicator (SSE, float).
 *
 * A lane is flagged when exactly one of a and b is greater than p; the
 * flag mask is AND-ed bitwise with r, so flagged lanes carry r and the
 * others are zero.
 */
inline __m128 hl_vec_classification_error(const __m128 a,
                                          const __m128 b,
                                          const __m128 p,
                                          const __m128 r) {
  const __m128 aAbove = _mm_cmpgt_ps(a, p);
  const __m128 bAbove = _mm_cmpgt_ps(b, p);
  return _mm_and_ps(_mm_xor_ps(aAbove, bAbove), r);
}
#else
typedef
__m128d
vecType
;
/* number of double in vector */
#define VECTOR_LEN 2
/**
 * Horizontally reduces the two double lanes of mm with agg.vecOp and
 * returns the scalar found in lane 0, i.e. m0 op m1 for mm = [m0, m1].
 */
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
  // [m0, m0] op [m1, m1]
  __m128d lowDup = _mm_unpacklo_pd(mm, mm);
  __m128d highDup = _mm_unpackhi_pd(mm, mm);
  return _mm_cvtsd_f64(agg.vecOp(lowDup, highDup));
}
/**
 * Broadcasts the scalar d into both lanes of an SSE2 double vector.
 *
 * _mm_set1_pd is used unconditionally: it is the intrinsic the original
 * code already relied on for Apple toolchains, and _mm_set_pd1 is merely an
 * alias for it, so no platform-specific branch is required.
 */
inline __m128d hl_vec_set(const real d) {
  return _mm_set1_pd(d);
}
/**
 * Lane-wise classification-error indicator (SSE2, double).
 *
 * A lane is flagged when exactly one of a and b is greater than p; the
 * flag mask is AND-ed bitwise with r, so flagged lanes carry r and the
 * others are zero.
 */
inline __m128d hl_vec_classification_error(const __m128d a,
                                           const __m128d b,
                                           const __m128d p,
                                           const __m128d r) {
  const __m128d aAbove = _mm_cmpgt_pd(a, p);
  const __m128d bAbove = _mm_cmpgt_pd(b, p);
  return _mm_and_pd(_mm_xor_pd(aAbove, bAbove), r);
}
#endif
#endif // HL_CPU_SIMD_SSE_CUH_
paddle/cuda/include/hl_matrix_base.cuh
浏览文件 @
e36e24d4
...
...
@@ -18,26 +18,6 @@ limitations under the License. */
#include "hl_matrix_type.cuh"
#ifdef __CUDA_ARCH__
/**
* CUDA kernel inline function
*/
#define INLINE __device__ inline
#else
/**
* CPP inline function
*/
#define INLINE inline
#endif
#ifndef PADDLE_TYPE_DOUBLE
#define DEVICE_FMAX fmaxf
#define DEVICE_FMIN fminf
#else
#define DEVICE_FMAX fmax
#define DEVICE_FMIN fmin
#endif
class
BaseOp
{
public:
static
const
bool
sse
=
false
;
...
...
@@ -66,10 +46,8 @@ typedef BaseOp SSESquaredDiff;
typedef
BaseOp
SSEFirst
;
typedef
BaseOp
SSESecond
;
typedef
BaseOp
SSEClassificationError
;
#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_
sse
.cuh"
#include "hl_matrix_base_
detail
.cuh"
#endif
namespace
aggregate
{
...
...
@@ -124,7 +102,7 @@ public:
add2
(
const
real
s1
,
const
real
s2
)
:
SSEAdd2
(
s1
,
s2
),
p1
(
s1
),
p2
(
s2
)
{}
INLINE
real
operator
()(
const
real
a
,
const
real
b
)
const
{
return
p1
*
a
+
p2
*
b
;
return
p1
*
a
+
p2
*
b
;
}
};
...
...
paddle/cuda/include/hl_matrix_base_
neon
.cuh
→
paddle/cuda/include/hl_matrix_base_
detail
.cuh
浏览文件 @
e36e24d4
...
...
@@ -12,32 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_DETAIL_CUH_
#define HL_MATRIX_BASE_DETAIL_CUH_
#i
fndef HL_MATRIX_BASE_NEON_CUH_
#
define HL_MATRIX_BASE_NEON_CUH_
#i
nclude "hl_matrix_type.cuh"
#
include "hl_tensor_ops.h"
namespace
aggregate
{
class
SSESum
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
add
<
vecType
>
()
(
a
,
b
);
}
};
class
SSEMax
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmaxq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
max
<
vecType
>
()
(
a
,
b
);
}
};
class
SSEMin
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vminq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
min
<
vecType
>
()
(
a
,
b
);
}
};
}
// namespace aggregate
...
...
@@ -46,8 +48,8 @@ namespace base {
namespace
unary
{
class
SSEIdentity
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
)
const
{
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
)
const
{
return
a
;
}
};
...
...
@@ -56,106 +58,96 @@ public:
namespace
binary
{
class
SSEAdd
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vaddq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
add
<
vecType
>
()
(
a
,
b
);
}
};
class
SSEAdd2
{
public:
static
const
bool
sse
=
true
;
static
const
bool
sse
=
VECTOR_SIMD
;
const
real
p1
;
const
real
p2
;
float32x4_t
mp1
;
float32x4_t
mp2
;
vecType
mp1
;
vecType
mp2
;
public:
SSEAdd2
(
const
real
s1
,
const
real
s2
)
:
p1
(
s1
),
p2
(
s2
)
{
mp1
=
vdupq_n_f32
(
p1
);
mp2
=
vdupq_n_f32
(
p2
);
mp1
=
hl_vec_set
(
p1
);
mp2
=
hl_vec_set
(
p2
);
}
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp1
,
tmp2
;
tmp1
=
vmulq_f32
(
mp1
,
a
);
tmp2
=
vmulq_f32
(
mp2
,
b
);
return
vaddq_f32
(
tmp1
,
tmp2
);
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
add_scale
<
vecType
>
(
mp1
,
mp2
)(
a
,
b
);
}
};
class
SSESub
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vsubq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
sub
<
vecType
>
()
(
a
,
b
);
}
};
class
SSEMul
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
return
vmulq_f32
(
a
,
b
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
mul
<
vecType
>
()
(
a
,
b
);
}
};
class
SSEDiv
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vrecpeq_f32
(
b
);
return
vmulq_f32
(
a
,
tmp
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hppl
::
binary
::
div
<
vecType
>
()(
a
,
b
);
}
};
class
SSESquaredDiff
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
float32x4_t
tmp
;
tmp
=
vsubq_f32
(
a
,
b
);
return
vmulq_f32
(
tmp
,
tmp
);
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
vecType
tmp
=
hppl
::
binary
::
sub
<
vecType
>
()(
a
,
b
);
return
hppl
::
binary
::
mul
<
vecType
>
()(
tmp
,
tmp
);
}
};
class
SSEFirst
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
a
;
}
};
class
SSESecond
{
public:
static
const
bool
sse
=
true
;
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
static
const
bool
sse
=
VECTOR_SIMD
;
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
b
;
}
};
class
SSEClassificationError
{
public:
static
const
bool
sse
=
true
;
static
const
bool
sse
=
VECTOR_SIMD
;
const
real
p
;
float32x4_t
mp
;
uint32x4_t
result
;
vecType
mp
;
vecType
result
;
public:
explicit
SSEClassificationError
(
const
real
s
)
:
p
(
s
)
{
mp
=
vdupq_n_f32
(
p
);
result
=
vdupq_n_u32
(
1
);
mp
=
hl_vec_set
(
p
);
result
=
hl_vec_set
(
1.0
f
);
}
// TODO: to be check
INLINE
float32x4_t
vecOp
(
const
float32x4_t
a
,
const
float32x4_t
b
)
const
{
uint32x4_t
tmp1
=
vcgtq_f32
(
a
,
mp
);
uint32x4_t
tmp2
=
vcgtq_f32
(
b
,
mp
);
uint32x4_t
tmp3
=
veorq_u32
(
tmp1
,
tmp2
);
return
vcvtq_f32_u32
(
vandq_u32
(
tmp3
,
result
));
INLINE
vecType
vecOp
(
const
vecType
a
,
const
vecType
b
)
const
{
return
hl_vec_classification_error
(
a
,
b
,
mp
,
result
);
}
};
}
// namespace binary
}
// namespace base
#endif
/* HL_MATRIX_BASE_
NEON
_CUH_ */
#endif
/* HL_MATRIX_BASE_
DETAIL
_CUH_ */
paddle/cuda/include/hl_matrix_base_sse.cuh
已删除
100644 → 0
浏览文件 @
4537b7bc
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_BASE_SSE_CUH_
#define HL_MATRIX_BASE_SSE_CUH_
namespace
aggregate
{
/**
 * Vector sum aggregate: lane-wise addition for float (__m128) and
 * double (__m128d) vectors.
 */
class SSESum {
public:
  static const bool sse = true;

  /** Lane-wise lhs + rhs on four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_add_ps(lhs, rhs);
  }

  /** Lane-wise lhs + rhs on two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_add_pd(lhs, rhs);
  }
};
/**
 * Vector max aggregate: lane-wise maximum for float (__m128) and
 * double (__m128d) vectors.
 */
class SSEMax {
public:
  static const bool sse = true;

  /** Lane-wise maximum of four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_max_ps(lhs, rhs);
  }

  /** Lane-wise maximum of two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_max_pd(lhs, rhs);
  }
};
/**
 * Vector min aggregate: lane-wise minimum for float (__m128) and
 * double (__m128d) vectors.
 */
class SSEMin {
public:
  static const bool sse = true;

  /** Lane-wise minimum of four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_min_ps(lhs, rhs);
  }

  /** Lane-wise minimum of two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_min_pd(lhs, rhs);
  }
};
}
// namespace aggregate
namespace
base
{
namespace
unary
{
/**
 * Identity unary operator: returns its vector argument unchanged, for both
 * float (__m128) and double (__m128d) vectors.
 */
class SSEIdentity {
public:
  static const bool sse = true;

  /** Returns the float vector as-is. */
  INLINE __m128 vecOp(const __m128 value) const {
    return value;
  }

  /** Returns the double vector as-is. */
  INLINE __m128d vecOp(const __m128d value) const {
    return value;
  }
};
}
// namespace unary
namespace
binary
{
/**
 * Binary add operator: lane-wise lhs + rhs for float (__m128) and
 * double (__m128d) vectors.
 */
class SSEAdd {
public:
  static const bool sse = true;

  /** Lane-wise lhs + rhs on four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_add_ps(lhs, rhs);
  }

  /** Lane-wise lhs + rhs on two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_add_pd(lhs, rhs);
  }
};
/**
 * Scaled binary add: computes p1 * a + p2 * b lane-wise.
 *
 * The two scale factors are kept both as scalars (p1, p2) and as
 * pre-broadcast vectors (mp1, mp2). Only the union member matching the
 * width of real is initialised by the constructor.
 */
class SSEAdd2 {
public:
  static const bool sse = true;
  const real p1;
  const real p2;
  union {__m128 f; __m128d d;} mp1;
  union {__m128 f; __m128d d;} mp2;

public:
  /** Stores s1/s2 and broadcasts them into the float or double vectors. */
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
    const bool singlePrecision = (sizeof(real) == sizeof(float));
    if (!singlePrecision) {
      mp1.d = _mm_set1_pd(p1);
      mp2.d = _mm_set1_pd(p2);
    } else {
      mp1.f = _mm_set1_ps(p1);
      mp2.f = _mm_set1_ps(p2);
    }
  }

  /** p1 * a + p2 * b on four floats. */
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    const __m128 scaledA = _mm_mul_ps(mp1.f, a);
    const __m128 scaledB = _mm_mul_ps(mp2.f, b);
    return _mm_add_ps(scaledA, scaledB);
  }

  /** p1 * a + p2 * b on two doubles. */
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    const __m128d scaledA = _mm_mul_pd(mp1.d, a);
    const __m128d scaledB = _mm_mul_pd(mp2.d, b);
    return _mm_add_pd(scaledA, scaledB);
  }
};
/**
 * Binary subtract operator: lane-wise lhs - rhs for float (__m128) and
 * double (__m128d) vectors.
 */
class SSESub {
public:
  static const bool sse = true;

  /** Lane-wise lhs - rhs on four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_sub_ps(lhs, rhs);
  }

  /** Lane-wise lhs - rhs on two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_sub_pd(lhs, rhs);
  }
};
/**
 * Binary multiply operator: lane-wise lhs * rhs for float (__m128) and
 * double (__m128d) vectors.
 */
class SSEMul {
public:
  static const bool sse = true;

  /** Lane-wise lhs * rhs on four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_mul_ps(lhs, rhs);
  }

  /** Lane-wise lhs * rhs on two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_mul_pd(lhs, rhs);
  }
};
/**
 * Binary divide operator: lane-wise lhs / rhs for float (__m128) and
 * double (__m128d) vectors.
 */
class SSEDiv {
public:
  static const bool sse = true;

  /** Lane-wise lhs / rhs on four floats. */
  INLINE __m128 vecOp(const __m128 lhs, const __m128 rhs) const {
    return _mm_div_ps(lhs, rhs);
  }

  /** Lane-wise lhs / rhs on two doubles. */
  INLINE __m128d vecOp(const __m128d lhs, const __m128d rhs) const {
    return _mm_div_pd(lhs, rhs);
  }
};
/**
 * Squared difference operator: lane-wise (a - b) * (a - b) for float
 * (__m128) and double (__m128d) vectors.
 */
class SSESquaredDiff {
public:
  static const bool sse = true;

  /** (a - b)^2 on four floats. */
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    const __m128 diff = _mm_sub_ps(a, b);
    return _mm_mul_ps(diff, diff);
  }

  /** (a - b)^2 on two doubles. */
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    const __m128d diff = _mm_sub_pd(a, b);
    return _mm_mul_pd(diff, diff);
  }
};
/**
 * Binary selector returning its first operand, for float (__m128) and
 * double (__m128d) vectors. The second operand is ignored.
 */
class SSEFirst {
public:
  static const bool sse = true;

  /** Returns first; second is unused. */
  INLINE __m128 vecOp(const __m128 first, const __m128 second) const {
    return first;
  }

  /** Returns first; second is unused. */
  INLINE __m128d vecOp(const __m128d first, const __m128d second) const {
    return first;
  }
};
/**
 * Binary selector returning its second operand, for float (__m128) and
 * double (__m128d) vectors. The first operand is ignored.
 */
class SSESecond {
public:
  static const bool sse = true;

  /** Returns second; first is unused. */
  INLINE __m128 vecOp(const __m128 first, const __m128 second) const {
    return second;
  }

  /** Returns second; first is unused. */
  INLINE __m128d vecOp(const __m128d first, const __m128d second) const {
    return second;
  }
};
/**
 * Classification-error operator.
 *
 * A lane is flagged when exactly one of a and b is greater than the
 * threshold p; flagged lanes yield the broadcast constant held in result
 * (1.0), other lanes yield zero. Only the union member matching the width
 * of real is initialised by the constructor.
 */
class SSEClassificationError {
public:
  static const bool sse = true;
  const real p;
  union {__m128 f; __m128d d;} mp;
  union {__m128 f; __m128d d;} result;

public:
  /** Stores the threshold s and broadcasts it and the constant 1. */
  explicit SSEClassificationError(const real s) : p(s) {
    const bool singlePrecision = (sizeof(real) == sizeof(float));
    if (!singlePrecision) {
      mp.d = _mm_set1_pd(p);
      result.d = _mm_set1_pd(1.0);
    } else {
      mp.f = _mm_set1_ps(p);
      result.f = _mm_set1_ps(1.0f);
    }
  }

  /** Float variant: (a > p) xor (b > p), masked with result. */
  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
    const __m128 aAbove = _mm_cmpgt_ps(a, mp.f);
    const __m128 bAbove = _mm_cmpgt_ps(b, mp.f);
    return _mm_and_ps(_mm_xor_ps(aAbove, bAbove), result.f);
  }

  /** Double variant: (a > p) xor (b > p), masked with result. */
  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
    const __m128d aAbove = _mm_cmpgt_pd(a, mp.d);
    const __m128d bAbove = _mm_cmpgt_pd(b, mp.d);
    return _mm_and_pd(_mm_xor_pd(aAbove, bAbove), result.d);
  }
};
}
// namespace binary
}
// namespace base
#endif
/* HL_MATRIX_BASE_SSE_CUH_ */
paddle/cuda/include/hl_matrix_type.cuh
浏览文件 @
e36e24d4
...
...
@@ -17,35 +17,35 @@ limitations under the License. */
#include "hl_base.h"
#if defined(__CUDA_ARCH__)
#ifdef __CUDA_ARCH__
/**
* CUDA kernel inline function
*/
#define INLINE __device__ inline
#else
/**
* CPP inline function
*/
#define INLINE inline
#endif
#ifdef __CUDA_ARCH__
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float4
vecType
;
#else
typedef
double2
vecType
;
#endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float32x4_t
vecType
;
#else
#error NEON instructions does not support double precision
#endif
#elif defined(__SSE3__)
#include "hl_cpu_simd_sse.cuh"
#define PADDLE_USE_SSE3
#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
// Currently nvcc does not support neon intrinsic.
// TODO: Extract simd intrinsic implementation from .cu files.
#include "hl_cpu_simd_neon.cuh"
#define PADDLE_USE_NEON
#else
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
__m128
vecType
;
#else
typedef
__m128d
vecType
;
#endif
#endif
#ifdef __CUDA_ARCH__
#define INLINE __device__ inline
#else
#define INLINE inline
#include "hl_cpu_scalar.cuh"
#endif
#endif // HL_MATRIX_TYPE_CUH_
paddle/cuda/include/hl_neon_matrix_kernel.cuh
已删除
100644 → 0
浏览文件 @
4537b7bc
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
/**
 * Returns true when none of the bits selected by (VECTOR_SIZE - 1) are set
 * in size, i.e. size is a multiple of VECTOR_SIZE when VECTOR_SIZE is a
 * power of two.
 */
inline bool hl_check_align(size_t size) {
  const size_t lowBits = size & (VECTOR_SIZE - 1);
  return lowBits == 0;
}
/**
 * Pointer overload: reinterprets the address as an integer and forwards it
 * to hl_check_align(size_t).
 */
inline bool hl_check_align(void *ptr) {
  const size_t address = reinterpret_cast<size_t>(ptr);
  return hl_check_align(address);
}
/**
 * Horizontally reduces the four lanes of a NEON vector with agg.vecOp and
 * returns the scalar result taken from lane 0.
 *
 * The previous version combined the lane-swapped vector with itself and
 * then with a copy of itself, so lanes were never mixed (lane 0 ended up
 * holding only m1-derived data). The reduction is now done in two steps:
 *   1. combine each lane with its neighbour inside the 64-bit half;
 *   2. combine the two 64-bit halves.
 *
 * @param agg  aggregator exposing vecOp(float32x4_t, float32x4_t).
 * @param mm   vector [m0, m1, m2, m3] to reduce.
 * @return     (m0 op m1) op (m2 op m3).
 */
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
  // [m1, m0, m3, m2]
  float32x4_t rev = vrev64q_f32(mm);
  // lanes 0/1 hold m0 op m1, lanes 2/3 hold m2 op m3
  float32x4_t pairs = agg.vecOp(mm, rev);
  // swap the 64-bit halves so lane 0 faces the m2 op m3 partial result
  float32x4_t swapped =
      vcombine_f32(vget_high_f32(pairs), vget_low_f32(pairs));
  float32x4_t ret = agg.vecOp(pairs, swapped);
  return vgetq_lane_f32(ret, 0);
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
,
B
+=
ldb
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
vecType
*
b
=
(
vecType
*
)(
B
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
,
b
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
,
*
b
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
real
*
b
=
B
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
],
b
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
);
}
}
/*
* dimN is multiples of VECTOR_LEN
* dimN greater than Step
*/
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
);
}
}
/*
 * Entry point for the single-input column aggregation: selects the strip
 * width (the Step template argument) from the matrix dimensions and forwards
 * all arguments unchanged.
 *
 *   dimN <= 16                   -> Step = 16
 *   dimN <= 32                   -> Step = 32
 *   dimN <= 1024 || dimM <= 512  -> Step = 64
 *   otherwise                    -> Step = 1024
 */
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(
    Agg agg, Op op, Saver sv, int dimM, int dimN, real* dst, real* A, int lda) {
  if (dimN <= 16) {
    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
    return;
  }
  if (dimN <= 32) {
    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
    return;
  }

  const bool useMediumStrip = (dimN <= 1024) || (dimM <= 512);
  if (useMediumStrip) {
    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
  } else {
    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
  }
}
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
B
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
,
B
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
/*
 * Entry point for the two-input column aggregation: selects the strip width
 * (the Step template argument) from the matrix dimensions and forwards all
 * arguments unchanged.
 *
 *   dimN <= 16                   -> Step = 16
 *   dimN <= 32                   -> Step = 32
 *   dimN <= 1024 || dimM <= 512  -> Step = 64
 *   otherwise                    -> Step = 1024
 */
template <class Agg, class Op, class Saver>
void hl_sse_matrix_column_op(Agg agg,
                             Op op,
                             Saver sv,
                             int dimM,
                             int dimN,
                             real* dst,
                             real* A,
                             int lda,
                             real* B,
                             int ldb) {
  if (dimN <= 16) {
    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
    return;
  }
  if (dimN <= 32) {
    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
    return;
  }

  const bool useMediumStrip = (dimN <= 1024) || (dimM <= 512);
  if (useMediumStrip) {
    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
  } else {
    hl_sse_matrix_column_op<1024>(
        agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
  }
}
#endif
/* HL_NEON_MATRIX_KERNEL_CUH_ */
paddle/cuda/include/hl_tensor_ops.h
浏览文件 @
e36e24d4
...
...
@@ -328,6 +328,208 @@ public:
INLINE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
<
b
?
b
:
a
;
}
};
#ifdef PADDLE_USE_SSE3
#ifndef PADDLE_TYPE_DOUBLE
// SSE single-precision specialization of the binary add functor:
// returns _mm_add_ps(lhs, rhs).
template <>
class add<__m128> {
public:
  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 sum = _mm_add_ps(lhs, rhs);
    return sum;
  }
};
// SSE single-precision specialization of the scaled-add functor:
// computes p1 * lhs + p2 * rhs, where p1 and p2 are the vectors captured at
// construction time.
template <>
class add_scale<__m128> {
private:
  const __m128 p1;
  const __m128 p2;

public:
  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}

  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 scaledLhs = _mm_mul_ps(p1, lhs);
    const __m128 scaledRhs = _mm_mul_ps(p2, rhs);
    return _mm_add_ps(scaledLhs, scaledRhs);
  }
};
// SSE single-precision specialization of the binary sub functor:
// returns _mm_sub_ps(lhs, rhs), i.e. lhs - rhs.
template <>
class sub<__m128> {
public:
  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 difference = _mm_sub_ps(lhs, rhs);
    return difference;
  }
};
// SSE single-precision specialization of the binary mul functor:
// returns _mm_mul_ps(lhs, rhs).
template <>
class mul<__m128> {
public:
  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 product = _mm_mul_ps(lhs, rhs);
    return product;
  }
};
// SSE single-precision specialization of the binary div functor:
// returns _mm_div_ps(numer, denom), i.e. numer / denom.
template <>
class div<__m128> {
public:
  INLINE __m128 operator()(const __m128 numer, const __m128 denom) const {
    const __m128 quotient = _mm_div_ps(numer, denom);
    return quotient;
  }
};
// SSE single-precision specialization of the binary min functor:
// returns _mm_min_ps(lhs, rhs); the operand order is kept as given.
template <>
class min<__m128> {
public:
  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 smaller = _mm_min_ps(lhs, rhs);
    return smaller;
  }
};
// SSE single-precision specialization of the binary max functor:
// returns _mm_max_ps(lhs, rhs); the operand order is kept as given.
template <>
class max<__m128> {
public:
  INLINE __m128 operator()(const __m128 lhs, const __m128 rhs) const {
    const __m128 larger = _mm_max_ps(lhs, rhs);
    return larger;
  }
};
#else
// SSE double-precision specialization of the binary add functor:
// returns _mm_add_pd(lhs, rhs).
template <>
class add<__m128d> {
public:
  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d sum = _mm_add_pd(lhs, rhs);
    return sum;
  }
};
// SSE double-precision specialization of the scaled-add functor:
// computes p1 * lhs + p2 * rhs, where p1 and p2 are the vectors captured at
// construction time.
template <>
class add_scale<__m128d> {
private:
  const __m128d p1;
  const __m128d p2;

public:
  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}

  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d scaledLhs = _mm_mul_pd(p1, lhs);
    const __m128d scaledRhs = _mm_mul_pd(p2, rhs);
    return _mm_add_pd(scaledLhs, scaledRhs);
  }
};
// SSE double-precision specialization of the binary sub functor:
// returns _mm_sub_pd(lhs, rhs), i.e. lhs - rhs.
template <>
class sub<__m128d> {
public:
  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d difference = _mm_sub_pd(lhs, rhs);
    return difference;
  }
};
// SSE double-precision specialization of the binary mul functor:
// returns _mm_mul_pd(lhs, rhs).
template <>
class mul<__m128d> {
public:
  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d product = _mm_mul_pd(lhs, rhs);
    return product;
  }
};
// SSE double-precision specialization of the binary div functor:
// returns _mm_div_pd(numer, denom), i.e. numer / denom.
template <>
class div<__m128d> {
public:
  INLINE __m128d operator()(const __m128d numer, const __m128d denom) const {
    const __m128d quotient = _mm_div_pd(numer, denom);
    return quotient;
  }
};
// SSE double-precision specialization of the binary min functor:
// returns _mm_min_pd(lhs, rhs); the operand order is kept as given.
template <>
class min<__m128d> {
public:
  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d smaller = _mm_min_pd(lhs, rhs);
    return smaller;
  }
};
// SSE double-precision specialization of the binary max functor:
// returns _mm_max_pd(lhs, rhs); the operand order is kept as given.
template <>
class max<__m128d> {
public:
  INLINE __m128d operator()(const __m128d lhs, const __m128d rhs) const {
    const __m128d larger = _mm_max_pd(lhs, rhs);
    return larger;
  }
};
#endif // PADDLE_TYPE_DOUBLE
#endif // PADDLE_USE_SSE3
#ifdef PADDLE_USE_NEON
#ifndef PADDLE_TYPE_DOUBLE
// NEON single-precision specialization of the binary add functor.
//
// Returns the sum of the two vectors via vaddq_f32. The previous body called
// vmulq_f32, which made this "add" functor multiply its operands and disagree
// with the SSE specializations of add in this file.
template <>
class add<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    return vaddq_f32(a, b);
  }
};
// NEON single-precision specialization of the scaled-add functor:
// computes p1 * lhs + p2 * rhs, where p1 and p2 are the vectors captured at
// construction time.
template <>
class add_scale<float32x4_t> {
private:
  const float32x4_t p1;
  const float32x4_t p2;

public:
  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
      : p1(s1), p2(s2) {}

  INLINE float32x4_t operator()(const float32x4_t lhs,
                                const float32x4_t rhs) const {
    const float32x4_t scaledLhs = vmulq_f32(p1, lhs);
    const float32x4_t scaledRhs = vmulq_f32(p2, rhs);
    return vaddq_f32(scaledLhs, scaledRhs);
  }
};
// NEON single-precision specialization of the binary sub functor:
// returns vsubq_f32(lhs, rhs), i.e. lhs - rhs.
template <>
class sub<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t lhs,
                                const float32x4_t rhs) const {
    const float32x4_t difference = vsubq_f32(lhs, rhs);
    return difference;
  }
};
// NEON single-precision specialization of the binary mul functor:
// returns vmulq_f32(lhs, rhs).
template <>
class mul<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t lhs,
                                const float32x4_t rhs) const {
    const float32x4_t product = vmulq_f32(lhs, rhs);
    return product;
  }
};
// NEON single-precision specialization of the binary div functor.
//
// Computes a / b as a * (1 / b). vrecpeq_f32 only returns a coarse estimate
// of the reciprocal, so the estimate is sharpened with two Newton-Raphson
// steps before the multiplication: vrecpsq_f32(b, x) evaluates (2 - b * x),
// and x * (2 - b * x) is the next iterate. Without the refinement the
// quotient is noticeably less accurate than the scalar and SSE div functors.
// The result is still an approximation, not a correctly rounded division.
template <>
class div<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
    float32x4_t recip = vrecpeq_f32(b);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
    return vmulq_f32(a, recip);
  }
};
// NEON single-precision specialization of the binary min functor:
// returns vminq_f32(lhs, rhs).
template <>
class min<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t lhs,
                                const float32x4_t rhs) const {
    const float32x4_t smaller = vminq_f32(lhs, rhs);
    return smaller;
  }
};
// NEON single-precision specialization of the binary max functor:
// returns vmaxq_f32(lhs, rhs).
template <>
class max<float32x4_t> {
public:
  INLINE float32x4_t operator()(const float32x4_t lhs,
                                const float32x4_t rhs) const {
    const float32x4_t larger = vmaxq_f32(lhs, rhs);
    return larger;
  }
};
#else
#error To be implemented
#endif // PADDLE_TYPE_DOUBLE
#endif // PADDLE_USE_NEON
}
// namespace binary
}
// namespace hppl
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录