Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
32a85081
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
32a85081
编写于
5月 25, 2017
作者:
J
Jiangtao Hu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Support scalar computing.
上级
21be601b
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
233 addition
and
418 deletion
+233
-418
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+3
-33
paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+57
-65
paddle/cuda/include/hl_cpu_scalar.cuh
paddle/cuda/include/hl_cpu_scalar.cuh
+39
-0
paddle/cuda/include/hl_cpu_simd_neon.cuh
paddle/cuda/include/hl_cpu_simd_neon.cuh
+58
-0
paddle/cuda/include/hl_cpu_simd_sse.cuh
paddle/cuda/include/hl_cpu_simd_sse.cuh
+65
-0
paddle/cuda/include/hl_matrix_base.cuh
paddle/cuda/include/hl_matrix_base.cuh
+5
-5
paddle/cuda/include/hl_matrix_type.cuh
paddle/cuda/include/hl_matrix_type.cuh
+6
-16
paddle/cuda/include/hl_neon_matrix_kernel.cuh
paddle/cuda/include/hl_neon_matrix_kernel.cuh
+0
-299
未找到文件。
paddle/cuda/include/hl_cpu_matrix_kernel.cuh
浏览文件 @
32a85081
...
...
@@ -17,10 +17,9 @@ limitations under the License. */
#include <stdio.h>
#include "hl_base.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "hl_neon_matrix_kernel.cuh"
#else
#include "hl_sse_matrix_kernel.cuh"
#ifndef __CUDA_ARCH__
#include "hl_cpu_matrix_kernel_detail.cuh"
#endif
/**
...
...
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_cpu_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
...
...
paddle/cuda/include/hl_
sse_matrix_kerne
l.cuh
→
paddle/cuda/include/hl_
cpu_matrix_kernel_detai
l.cuh
浏览文件 @
32a85081
...
...
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_
SSE_MATRIX_KERNE
L_CUH_
#define HL_
SSE_MATRIX_KERNE
L_CUH_
#ifndef HL_
MATRIX_KERNEL_DETAI
L_CUH_
#define HL_
MATRIX_KERNEL_DETAI
L_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
#ifndef PADDLE_TYPE_DOUBLE
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
#else
#if defined(__APPLE__) || defined(__OSX__)
#define _mm_set_pd1 _mm_set1_pd
#endif
/* number of double in vector */
#define VECTOR_LEN 2
#define VECTOR_SET _mm_set_pd1
#endif
inline
bool
hl_check_align
(
size_t
size
)
{
return
!
(
size
&
(
VECTOR_SIZE
-
1
));
}
...
...
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
return
hl_check_align
(
reinterpret_cast
<
size_t
>
(
ptr
));
}
#ifndef PADDLE_TYPE_DOUBLE
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128
lo
=
_mm_unpacklo_ps
(
mm
,
mm
);
__m128
hi
=
_mm_unpackhi_ps
(
mm
,
mm
);
__m128
tmp1
=
agg
.
vecOp
(
lo
,
hi
);
__m128
tmp2
=
_mm_movehl_ps
(
tmp1
,
tmp1
);
__m128
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
return
_mm_cvtss_f32
(
ret
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
}
#else
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128d
lo
=
_mm_unpacklo_pd
(
mm
,
mm
);
__m128d
hi
=
_mm_unpackhi_pd
(
mm
,
mm
);
__m128d
ret
=
agg
.
vecOp
(
lo
,
hi
);
return
_mm_cvtsd_f64
(
ret
);
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
#endif
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
...
...
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
...
...
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
}
}
#endif
/* HL_
SSE_MATRIX_KERNE
L_CUH_ */
#endif
/* HL_
MATRIX_KERNEL_DETAI
L_CUH_ */
paddle/cuda/include/hl_cpu_scalar.cuh
0 → 100644
浏览文件 @
32a85081
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_CPU_SCALAR_CUH_
#define HL_CPU_SCALAR_CUH_
#ifndef PADDLE_TYPE_DOUBLE
/* size of float */
#define VECTOR_SIZE 4
#else
/* size of double */
#define VECTOR_SIZE 8
#endif
typedef
real
vecType
;
inline
void
set_zero
(
vecType
&
mm
)
{
mm
=
(
vecType
)
0.0
f
;
}
/* Consider a real as a vector */
#define VECTOR_LEN 1
#define VECTOR_SET set_zero
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
return
mm
;
}
#endif // HL_CPU_SCALAR_CUH_
paddle/cuda/include/hl_cpu_simd_neon.cuh
0 → 100644
浏览文件 @
32a85081
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_CPU_SIMD_NEON_CUH_
#define HL_CPU_SIMD_NEON_CUH_
#include <arm_neon.h>
#define VECTOR_SIZE 16
#ifndef PADDLE_TYPE_DOUBLE
typedef
float32x4_t
vecType
;
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
float32x4_t
rev
=
vrev64q_f32
(
mm
);
float32x4_t
tmp1
=
agg
.
vecOp
(
rev
,
rev
);
float32x2_t
lo
=
vget_high_f32
(
rev
);
float32x2_t
hi
=
vget_low_f32
(
rev
);
float32x4_t
tmp2
=
vcombine_f32
(
hi
,
lo
);
float32x4_t
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
return
vgetq_lane_f32
(
ret
,
0
);
}
#else
#ifdef __aarch64__
typedef
float64x2_t
vecType
;
/* number of float in vector */
#define VECTOR_LEN 2
#define VECTOR_SET vdupq_n_f64
#error To be implemented
#else
#error NEON instructions does not support double precision
#endif
#endif
#endif // HL_CPU_SIMD_NEON_CUH_
paddle/cuda/include/hl_cpu_simd_sse.cuh
0 → 100644
浏览文件 @
32a85081
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_SIMD_SSE_CUH_
#define HL_SIMD_SSE_CUH_
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#define VECTOR_SIZE 16
#ifndef PADDLE_TYPE_DOUBLE
typedef
__m128
vecType
;
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128
lo
=
_mm_unpacklo_ps
(
mm
,
mm
);
__m128
hi
=
_mm_unpackhi_ps
(
mm
,
mm
);
__m128
tmp1
=
agg
.
vecOp
(
lo
,
hi
);
__m128
tmp2
=
_mm_movehl_ps
(
tmp1
,
tmp1
);
__m128
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
return
_mm_cvtss_f32
(
ret
);
}
#else
typedef
__m128d
vecType
;
/* number of double in vector */
#define VECTOR_LEN 2
#if defined(__APPLE__) || defined(__OSX__)
#define _mm_set_pd1 _mm_set1_pd
#endif
#define VECTOR_SET _mm_set_pd1
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
__m128d
lo
=
_mm_unpacklo_pd
(
mm
,
mm
);
__m128d
hi
=
_mm_unpackhi_pd
(
mm
,
mm
);
__m128d
ret
=
agg
.
vecOp
(
lo
,
hi
);
return
_mm_cvtsd_f64
(
ret
);
}
#endif
#endif // HL_SIMD_SSE_CUH_
paddle/cuda/include/hl_matrix_base.cuh
浏览文件 @
32a85081
...
...
@@ -52,7 +52,11 @@ public:
}
};
#ifdef __CUDA_ARCH__
#if defined(__SSE3__)
#include "hl_matrix_base_sse.cuh"
#elif (defined(__ARM__NEON__) || defined(__ARM_NEON))
#include "hl_matrix_base_neon.cuh"
#else
typedef
BaseOp
SSESum
;
typedef
BaseOp
SSEMax
;
typedef
BaseOp
SSEMin
;
...
...
@@ -66,10 +70,6 @@ typedef BaseOp SSESquaredDiff;
typedef
BaseOp
SSEFirst
;
typedef
BaseOp
SSESecond
;
typedef
BaseOp
SSEClassificationError
;
#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
#include "hl_matrix_base_neon.cuh"
#else
#include "hl_matrix_base_sse.cuh"
#endif
namespace
aggregate
{
...
...
paddle/cuda/include/hl_matrix_type.cuh
浏览文件 @
32a85081
...
...
@@ -17,29 +17,19 @@ limitations under the License. */
#include "hl_base.h"
#if
defined(__CUDA_ARCH__)
#if
def __CUDA_ARCH__
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float4
vecType
;
#else
typedef
double2
vecType
;
#endif
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#include <arm_neon.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
float32x4_t
vecType
;
#else
#error NEON instructions does not support double precision
#endif
#elif defined(__SSE3__)
#include "hl_cpu_simd_sse.cuh"
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include "hl_cpu_simd_neon.cuh"
#else
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef
__m128
vecType
;
#else
typedef
__m128d
vecType
;
#endif
#include "hl_cpu_scalar.cuh"
#endif
#ifdef __CUDA_ARCH__
...
...
paddle/cuda/include/hl_neon_matrix_kernel.cuh
已删除
100644 → 0
浏览文件 @
21be601b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_NEON_MATRIX_KERNEL_CUH_
#define HL_NEON_MATRIX_KERNEL_CUH_
#include "hl_matrix_type.cuh"
#define VECTOR_SIZE 16
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET vdupq_n_f32
inline
bool
hl_check_align
(
size_t
size
)
{
return
!
(
size
&
(
VECTOR_SIZE
-
1
));
}
inline
bool
hl_check_align
(
void
*
ptr
)
{
return
hl_check_align
(
reinterpret_cast
<
size_t
>
(
ptr
));
}
template
<
class
Agg
>
inline
real
hl_agg_op
(
Agg
agg
,
vecType
mm
)
{
float32x4_t
rev
=
vrev64q_f32
(
mm
);
float32x4_t
tmp1
=
agg
.
vecOp
(
rev
,
rev
);
float32x2_t
lo
=
vget_high_f32
(
rev
);
float32x2_t
hi
=
vget_low_f32
(
rev
);
float32x4_t
tmp2
=
vcombine_f32
(
hi
,
lo
);
float32x4_t
ret
=
agg
.
vecOp
(
tmp1
,
tmp2
);
return
vgetq_lane_f32
(
ret
,
0
);
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_row_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
int
ld
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
i
=
0
;
i
<
dimM
;
i
++
,
A
+=
lda
,
B
+=
ldb
)
{
vecType
mm
=
VECTOR_SET
(
agg
.
init
());
vecType
*
a
=
(
vecType
*
)(
A
);
vecType
*
b
=
(
vecType
*
)(
B
);
for
(
int
j
=
0
;
j
<
dimN
/
VECTOR_LEN
;
j
++
,
a
++
,
b
++
)
{
mm
=
agg
.
vecOp
(
mm
,
op
.
vecOp
(
*
a
,
*
b
));
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
real
tmp
=
hl_agg_op
(
agg
,
mm
);
real
*
a
=
A
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
real
*
b
=
B
+
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
for
(
int
j
=
0
;
j
<
rem
;
j
++
)
{
tmp
=
agg
(
tmp
,
op
(
a
[
j
],
b
[
j
]));
}
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
tmp
);
}
else
{
dst
[
i
*
ld
]
=
sv
(
dst
[
i
*
ld
],
hl_agg_op
(
agg
,
mm
));
}
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
;
j
++
)
{
real
tmp
=
agg
.
init
();
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
tmp
=
agg
(
tmp
,
op
(
A
[
i
*
lda
+
j
],
B
[
i
*
ldb
+
j
]));
}
dst
[
j
]
=
sv
(
dst
[
j
],
tmp
);
}
}
/*
* MaxRow greater than or equal dimN
* dimN is multiples of VECTOR_LEN
* so rem <= MaxRow / VECTOR_LEN
*/
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
);
}
}
/*
* dimN is multiples of VECTOR_LEN
* dimN greater than Step
*/
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
);
}
}
template
<
int
MaxRow
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_column_op_with_rem
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
vecType
mm
[
MaxRow
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
MaxRow
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
dimN
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
int
rem
=
dimN
%
VECTOR_LEN
;
if
(
rem
)
{
A
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
B
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
dst
+=
(
dimN
/
VECTOR_LEN
)
*
VECTOR_LEN
;
hl_matrix_column_op
(
agg
,
op
,
sv
,
dimM
,
rem
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
int
Step
,
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
for
(
int
j
=
0
;
j
<
dimN
/
Step
;
j
++
,
dst
+=
Step
,
A
+=
Step
,
B
+=
Step
)
{
vecType
mm
[
Step
/
VECTOR_LEN
];
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
VECTOR_SET
(
agg
.
init
());
}
for
(
int
i
=
0
;
i
<
dimM
;
i
++
)
{
vecType
*
a
=
(
vecType
*
)(
A
+
i
*
lda
);
vecType
*
b
=
(
vecType
*
)(
B
+
i
*
ldb
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
mm
[
n
]
=
agg
.
vecOp
(
mm
[
n
],
op
.
vecOp
(
a
[
n
],
b
[
n
]));
}
}
vecType
*
result
=
(
vecType
*
)(
dst
);
for
(
int
n
=
0
;
n
<
Step
/
VECTOR_LEN
;
n
++
)
{
result
[
n
]
=
sv
.
vecOp
(
result
[
n
],
mm
[
n
]);
}
}
int
remRow
=
dimN
%
Step
;
if
(
remRow
)
{
hl_sse_column_op_with_rem
<
Step
>
(
agg
,
op
,
sv
,
dimM
,
remRow
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
template
<
class
Agg
,
class
Op
,
class
Saver
>
void
hl_sse_matrix_column_op
(
Agg
agg
,
Op
op
,
Saver
sv
,
int
dimM
,
int
dimN
,
real
*
dst
,
real
*
A
,
int
lda
,
real
*
B
,
int
ldb
)
{
if
(
dimN
<=
16
)
{
hl_sse_matrix_column_op
<
16
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
32
)
{
hl_sse_matrix_column_op
<
32
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
if
(
dimN
<=
1024
||
dimM
<=
512
)
{
hl_sse_matrix_column_op
<
64
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
else
{
hl_sse_matrix_column_op
<
1024
>
(
agg
,
op
,
sv
,
dimM
,
dimN
,
dst
,
A
,
lda
,
B
,
ldb
);
}
}
#endif
/* HL_NEON_MATRIX_KERNEL_CUH_ */
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录