Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
26592a86
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
26592a86
编写于
9月 08, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Integrate gemm to matmul
上级
ba6972ec
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
423 addition
and
209 deletion
+423
-209
.gitlab-ci.yml
.gitlab-ci.yml
+1
-1
mace/core/tensor.h
mace/core/tensor.h
+5
-1
mace/core/testing/test_benchmark_main.cc
mace/core/testing/test_benchmark_main.cc
+2
-1
mace/kernels/gemm_test.cc
mace/kernels/gemm_test.cc
+37
-25
mace/kernels/matmul.h
mace/kernels/matmul.h
+22
-25
mace/kernels/matmul_benchmark.cc
mace/kernels/matmul_benchmark.cc
+5
-3
mace/kernels/sgemm.cc
mace/kernels/sgemm.cc
+250
-92
mace/kernels/sgemm.h
mace/kernels/sgemm.h
+68
-42
mace/kernels/sgemm_pack_test.cc
mace/kernels/sgemm_pack_test.cc
+13
-12
mace/ops/matmul.h
mace/ops/matmul.h
+5
-1
mace/ops/matmul_benchmark.cc
mace/ops/matmul_benchmark.cc
+13
-4
mace/ops/winograd_transform_benchmark.cc
mace/ops/winograd_transform_benchmark.cc
+2
-2
未找到文件。
.gitlab-ci.yml
浏览文件 @
26592a86
...
@@ -85,7 +85,7 @@ ndk_versions_compatible_tests:
...
@@ -85,7 +85,7 @@ ndk_versions_compatible_tests:
-
DEFAULT_NDK_PATH=$ANDROID_NDK_HOME
-
DEFAULT_NDK_PATH=$ANDROID_NDK_HOME
-
prefix_path=${DEFAULT_NDK_PATH%android-ndk-*}
-
prefix_path=${DEFAULT_NDK_PATH%android-ndk-*}
-
>
-
>
for ndk in android-ndk-r1
2b android-ndk-r1
5c android-ndk-r16 android-ndk-r17b;
for ndk in android-ndk-r15c android-ndk-r16 android-ndk-r17b;
do
do
new_ndk_path=${prefix_path}${ndk};
new_ndk_path=${prefix_path}${ndk};
if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
if [ "$new_ndk_path" != "$DEFAULT_NDK_PATH" ]; then
...
...
mace/core/tensor.h
浏览文件 @
26592a86
...
@@ -399,6 +399,10 @@ class Tensor {
...
@@ -399,6 +399,10 @@ class Tensor {
zero_point_
=
zero_point
;
zero_point_
=
zero_point
;
}
}
inline
void
SetIsWeight
(
bool
is_weight
)
{
is_weight_
=
is_weight
;
}
private:
private:
Allocator
*
allocator_
;
Allocator
*
allocator_
;
DataType
dtype_
;
DataType
dtype_
;
...
@@ -409,7 +413,7 @@ class Tensor {
...
@@ -409,7 +413,7 @@ class Tensor {
bool
is_buffer_owner_
;
bool
is_buffer_owner_
;
bool
unused_
;
bool
unused_
;
std
::
string
name_
;
std
::
string
name_
;
const
bool
is_weight_
;
bool
is_weight_
;
float
scale_
;
float
scale_
;
int32_t
zero_point_
;
int32_t
zero_point_
;
...
...
mace/core/testing/test_benchmark_main.cc
浏览文件 @
26592a86
...
@@ -33,7 +33,8 @@ int main(int argc, char **argv) {
...
@@ -33,7 +33,8 @@ int main(int argc, char **argv) {
// config runtime
// config runtime
mace
::
MaceStatus
status
=
mace
::
SetOpenMPThreadsAndAffinityPolicy
(
mace
::
MaceStatus
status
=
mace
::
SetOpenMPThreadsAndAffinityPolicy
(
FLAGS_omp_num_threads
,
FLAGS_omp_num_threads
,
static_cast
<
mace
::
CPUAffinityPolicy
>
(
FLAGS_cpu_affinity_policy
));
static_cast
<
mace
::
CPUAffinityPolicy
>
(
FLAGS_cpu_affinity_policy
),
true
);
if
(
status
!=
mace
::
MACE_SUCCESS
)
{
if
(
status
!=
mace
::
MACE_SUCCESS
)
{
LOG
(
WARNING
)
<<
"Set openmp or cpu affinity failed."
;
LOG
(
WARNING
)
<<
"Set openmp or cpu affinity failed."
;
}
}
...
...
mace/kernels/gemm_test.cc
浏览文件 @
26592a86
...
@@ -74,25 +74,26 @@ void GemvTest(index_t batch, index_t N, index_t M) {
...
@@ -74,25 +74,26 @@ void GemvTest(index_t batch, index_t N, index_t M) {
}
}
}
}
void
SGemmTest
(
index_t
N
,
void
SGemmTest
(
index_t
batch
,
index_t
N
,
index_t
K
,
index_t
K
,
index_t
M
,
index_t
M
,
bool
transpose_a
,
bool
transpose_a
,
bool
transpose_b
)
{
bool
transpose_b
)
{
std
::
unique_ptr
<
float
[]
>
A
(
new
float
[
N
*
K
]);
std
::
unique_ptr
<
float
[]
>
A
(
new
float
[
batch
*
N
*
K
]);
std
::
unique_ptr
<
float
[]
>
B
(
new
float
[
K
*
M
]);
std
::
unique_ptr
<
float
[]
>
B
(
new
float
[
batch
*
K
*
M
]);
std
::
unique_ptr
<
float
[]
>
C
(
new
float
[
N
*
M
]);
std
::
unique_ptr
<
float
[]
>
C
(
new
float
[
batch
*
N
*
M
]);
std
::
unique_ptr
<
float
[]
>
C_ref
(
new
float
[
N
*
M
]);
std
::
unique_ptr
<
float
[]
>
C_ref
(
new
float
[
batch
*
N
*
M
]);
std
::
random_device
rd
;
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
A
.
get
(),
A
.
get
()
+
N
*
K
,
std
::
generate
(
A
.
get
(),
A
.
get
()
+
batch
*
N
*
K
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
.
get
(),
B
.
get
()
+
K
*
M
,
std
::
generate
(
B
.
get
(),
B
.
get
()
+
batch
*
K
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
kernels
::
GemmRef
(
A
.
get
(),
B
.
get
(),
1
,
N
,
K
,
M
,
C_ref
.
get
(),
transpose_a
,
kernels
::
GemmRef
(
A
.
get
(),
B
.
get
(),
batch
,
N
,
K
,
M
,
C_ref
.
get
(),
transpose_a
,
transpose_b
);
transpose_b
);
kernels
::
MatrixMap
<
const
float
>
matrix_a
;
kernels
::
MatrixMap
<
const
float
>
matrix_a
;
...
@@ -100,22 +101,38 @@ void SGemmTest(index_t N,
...
@@ -100,22 +101,38 @@ void SGemmTest(index_t N,
if
(
!
transpose_a
)
{
if
(
!
transpose_a
)
{
matrix_a
=
matrix_a
=
kernels
::
MatrixMap
<
const
float
>
(
N
,
K
,
kernels
::
RowMajor
,
A
.
get
());
kernels
::
MatrixMap
<
const
float
>
(
batch
,
N
,
K
,
kernels
::
RowMajor
,
A
.
get
());
}
else
{
}
else
{
matrix_a
=
matrix_a
=
kernels
::
MatrixMap
<
const
float
>
(
K
,
N
,
kernels
::
RowMajor
,
A
.
get
());
kernels
::
MatrixMap
<
const
float
>
(
batch
,
K
,
N
,
kernels
::
RowMajor
,
A
.
get
());
matrix_a
=
matrix_a
.
transpose
();
matrix_a
=
matrix_a
.
transpose
();
}
}
if
(
!
transpose_b
)
{
if
(
!
transpose_b
)
{
matrix_b
=
matrix_b
=
kernels
::
MatrixMap
<
const
float
>
(
K
,
M
,
kernels
::
RowMajor
,
B
.
get
());
kernels
::
MatrixMap
<
const
float
>
(
batch
,
K
,
M
,
kernels
::
RowMajor
,
B
.
get
());
}
else
{
}
else
{
matrix_b
=
matrix_b
=
kernels
::
MatrixMap
<
const
float
>
(
M
,
K
,
kernels
::
RowMajor
,
B
.
get
());
kernels
::
MatrixMap
<
const
float
>
(
batch
,
M
,
K
,
kernels
::
RowMajor
,
B
.
get
());
matrix_b
=
matrix_b
.
transpose
();
matrix_b
=
matrix_b
.
transpose
();
}
}
kernels
::
MatrixMap
<
float
>
matrix_c
(
N
,
M
,
kernels
::
RowMajor
,
C
.
get
());
kernels
::
MatrixMap
<
float
>
matrix_c
(
batch
,
N
,
M
,
kernels
::
RowMajor
,
C
.
get
());
kernels
::
SGemm
sgemm
;
kernels
::
SGemm
sgemm
;
sgemm
(
matrix_a
,
matrix_b
,
&
matrix_c
);
sgemm
(
matrix_a
,
matrix_b
,
&
matrix_c
);
...
@@ -168,26 +185,21 @@ TEST(GEMMTest, gemv) {
...
@@ -168,26 +185,21 @@ TEST(GEMMTest, gemv) {
}
}
namespace
{
namespace
{
void
TestSGemmTranspose
(
index_t
N
,
index_t
K
,
index_t
M
)
{
void
TestSGemmTranspose
(
index_t
batch
,
index_t
N
,
index_t
K
,
index_t
M
)
{
SGemmTest
(
N
,
K
,
M
,
false
,
false
);
SGemmTest
(
batch
,
N
,
K
,
M
,
false
,
false
);
SGemmTest
(
N
,
K
,
M
,
true
,
false
);
SGemmTest
(
batch
,
N
,
K
,
M
,
true
,
false
);
SGemmTest
(
N
,
K
,
M
,
false
,
true
);
SGemmTest
(
batch
,
N
,
K
,
M
,
false
,
true
);
SGemmTest
(
N
,
K
,
M
,
true
,
true
);
SGemmTest
(
batch
,
N
,
K
,
M
,
true
,
true
);
}
}
}
}
TEST
(
SGEMMTest
,
AlignedWithoutBatch
)
{
TestSGemmTranspose
(
4
,
4
,
4
);
TestSGemmTranspose
(
8
,
8
,
8
);
TestSGemmTranspose
(
16
,
16
,
16
);
}
TEST
(
SGEMMTest
,
UnalignedWithoutBatch
)
{
TEST
(
SGEMMTest
,
UnalignedWithoutBatch
)
{
std
::
vector
<
index_t
>
tests
{
1
,
5
,
14
,
31
,
47
};
std
::
vector
<
index_t
>
tests
{
1
,
5
,
14
,
31
,
47
};
for
(
index_t
N
:
tests
)
{
for
(
index_t
N
:
tests
)
{
for
(
index_t
K
:
tests
)
{
for
(
index_t
K
:
tests
)
{
for
(
index_t
M
:
tests
)
{
for
(
index_t
M
:
tests
)
{
TestSGemmTranspose
(
N
,
K
,
M
);
TestSGemmTranspose
(
1
,
N
,
K
,
M
);
TestSGemmTranspose
(
16
,
N
,
K
,
M
);
}
}
}
}
}
}
...
...
mace/kernels/matmul.h
浏览文件 @
26592a86
...
@@ -32,6 +32,7 @@
...
@@ -32,6 +32,7 @@
#include "mace/kernels/kernel.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/utils.h"
#include "mace/utils/utils.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
#ifdef MACE_ENABLE_OPENCL
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/cl2_header.h"
...
@@ -83,39 +84,34 @@ struct MatMulFunctor : OpKernel {
...
@@ -83,39 +84,34 @@ struct MatMulFunctor : OpKernel {
const
T
*
b_ptr_base
=
B
->
data
<
T
>
();
const
T
*
b_ptr_base
=
B
->
data
<
T
>
();
T
*
c_ptr_base
=
C
->
mutable_data
<
T
>
();
T
*
c_ptr_base
=
C
->
mutable_data
<
T
>
();
memset
(
c_ptr_base
,
0
,
batch
*
height
*
width
*
sizeof
(
T
));
const
index_t
height_a
=
A
->
dim
(
rank
-
2
);
const
index_t
width_a
=
A
->
dim
(
rank
-
1
);
if
(
height
==
1
&&
width
>
1
&&
B
->
is_weight
())
{
const
index_t
height_b
=
B
->
dim
(
rank
-
2
);
// A * B = (B^T * A^T)^T
const
index_t
width_b
=
B
->
dim
(
rank
-
1
);
if
(
!
transpose_b
)
{
if
(
B_transpose_
.
get
()
==
nullptr
)
{
sgemm_
.
Run
(
a_ptr_base
,
B_transpose_
.
reset
(
new
Tensor
(
context_
->
device
()
->
allocator
(),
b_ptr_base
,
DataTypeToEnum
<
T
>::
v
()));
batch
,
B_transpose_
->
Resize
({
batch
,
width
,
K
});
height_a
,
Tensor
::
MappingGuard
guardbt
(
B_transpose_
.
get
());
width_a
,
T
*
bt_ptr_base
=
B_transpose_
->
mutable_data
<
T
>
();
height_b
,
Transpose
(
b_ptr_base
,
K
,
width
,
width
,
bt_ptr_base
);
width_b
,
}
transpose_a
,
Tensor
::
MappingGuard
guardbt
(
B_transpose_
.
get
());
transpose_b
,
T
*
bt_ptr_base
=
B_transpose_
->
mutable_data
<
T
>
();
A
->
is_weight
(),
Gemv
(
bt_ptr_base
,
a_ptr_base
,
batch
,
K
,
width
,
c_ptr_base
);
B
->
is_weight
(),
}
else
{
c_ptr_base
,
Gemv
(
b_ptr_base
,
a_ptr_base
,
batch
,
K
,
width
,
c_ptr_base
);
context_
->
workspace
()
->
GetScratchBuffer
(
D
));
}
}
else
{
Gemm
(
a_ptr_base
,
b_ptr_base
,
batch
,
height
,
K
,
width
,
c_ptr_base
,
transpose_a
,
transpose_b
);
}
return
MACE_SUCCESS
;
return
MACE_SUCCESS
;
}
}
std
::
unique_ptr
<
Tensor
>
B_transpose
_
;
SGemm
sgemm
_
;
};
};
template
<
>
template
<
>
struct
MatMulFunctor
<
CPU
,
uint8_t
>
:
OpKernel
{
struct
MatMulFunctor
<
CPU
,
uint8_t
>
:
OpKernel
{
explicit
MatMulFunctor
(
OpKernelContext
*
context
)
:
OpKernel
(
context
)
{}
explicit
MatMulFunctor
(
OpKernelContext
*
context
)
:
OpKernel
(
context
)
{}
template
<
gemmlowp
::
MapOrder
AOrder
,
gemmlowp
::
MapOrder
BOrder
>
template
<
gemmlowp
::
MapOrder
AOrder
,
gemmlowp
::
MapOrder
BOrder
>
void
MatMulImpl
(
const
Tensor
*
A
,
void
MatMulImpl
(
const
Tensor
*
A
,
const
Tensor
*
B
,
const
Tensor
*
B
,
...
@@ -213,6 +209,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
...
@@ -213,6 +209,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
template
<
typename
T
>
template
<
typename
T
>
struct
MatMulFunctor
<
DeviceType
::
GPU
,
T
>
:
OpKernel
{
struct
MatMulFunctor
<
DeviceType
::
GPU
,
T
>
:
OpKernel
{
explicit
MatMulFunctor
(
OpKernelContext
*
context
)
:
OpKernel
(
context
)
{}
explicit
MatMulFunctor
(
OpKernelContext
*
context
)
:
OpKernel
(
context
)
{}
MaceStatus
operator
()(
const
Tensor
*
A
,
MaceStatus
operator
()(
const
Tensor
*
A
,
const
Tensor
*
B
,
const
Tensor
*
B
,
Tensor
*
C
,
Tensor
*
C
,
...
...
mace/kernels/matmul_benchmark.cc
浏览文件 @
26592a86
...
@@ -114,9 +114,11 @@ void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
...
@@ -114,9 +114,11 @@ void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
std
::
vector
<
float
>
rhs
(
k
*
n
);
std
::
vector
<
float
>
rhs
(
k
*
n
);
std
::
vector
<
float
>
result
(
m
*
n
);
std
::
vector
<
float
>
result
(
m
*
n
);
kernels
::
MatrixMap
<
const
float
>
matrix_lhs
(
m
,
k
,
RowMajor
,
lhs
.
data
(),
true
);
kernels
::
MatrixMap
<
const
float
>
matrix_lhs
(
1
,
m
,
k
,
RowMajor
,
lhs
.
data
(),
kernels
::
MatrixMap
<
const
float
>
matrix_rhs
(
k
,
n
,
RowMajor
,
rhs
.
data
(),
true
);
true
);
kernels
::
MatrixMap
<
float
>
matrix_result
(
m
,
n
,
RowMajor
,
result
.
data
());
kernels
::
MatrixMap
<
const
float
>
matrix_rhs
(
1
,
k
,
n
,
RowMajor
,
rhs
.
data
(),
true
);
kernels
::
MatrixMap
<
float
>
matrix_result
(
1
,
m
,
n
,
RowMajor
,
result
.
data
());
kernels
::
SGemm
sgemm
;
kernels
::
SGemm
sgemm
;
...
...
mace/kernels/sgemm.cc
浏览文件 @
26592a86
...
@@ -12,9 +12,11 @@
...
@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <memory>
#include "mace/kernels/sgemm.h"
#include "mace/kernels/sgemm.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include <memory>
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#include <arm_neon.h>
...
@@ -29,29 +31,118 @@ namespace kernels {
...
@@ -29,29 +31,118 @@ namespace kernels {
void
SGemm
::
operator
()(
const
MatrixMap
<
const
float
>
&
lhs
,
void
SGemm
::
operator
()(
const
MatrixMap
<
const
float
>
&
lhs
,
const
MatrixMap
<
const
float
>
&
rhs
,
const
MatrixMap
<
const
float
>
&
rhs
,
MatrixMap
<
float
>
*
result
)
{
MatrixMap
<
float
>
*
result
,
ScratchBuffer
*
scratch_buffer
)
{
if
(
rhs
.
col
()
<
lhs
.
row
())
{
if
(
rhs
.
col
()
<
lhs
.
row
())
{
MatrixMap
<
const
float
>
lhs_transpose
=
lhs
.
transpose
();
MatrixMap
<
const
float
>
lhs_transpose
=
lhs
.
transpose
();
MatrixMap
<
const
float
>
rhs_transpose
=
rhs
.
transpose
();
MatrixMap
<
const
float
>
rhs_transpose
=
rhs
.
transpose
();
MatrixMap
<
float
>
result_transpose
=
result
->
transpose
();
MatrixMap
<
float
>
result_transpose
=
result
->
transpose
();
return
operator
()(
rhs_transpose
,
lhs_transpose
,
&
result_transpose
);
return
operator
()(
rhs_transpose
,
lhs_transpose
,
&
result_transpose
,
scratch_buffer
);
}
}
if
(
!
packed_
||
!
lhs
.
is_const
())
{
if
(
scratch_buffer
!=
nullptr
)
{
PackLhs
(
lhs
,
&
packed_lhs_
);
scratch_buffer
->
Rewind
();
index_t
total_size
=
result
->
size
();
if
(
!
lhs
.
is_const
())
{
total_size
+=
lhs
.
size
();
}
if
(
!
rhs
.
is_const
())
{
total_size
+=
rhs
.
size
();
}
scratch_buffer
->
GrowSize
(
total_size
*
sizeof
(
float
));
scratch_buffer
->
Rewind
();
if
(
!
lhs
.
is_const
())
{
packed_lhs_
.
reset
(
new
Tensor
(
scratch_buffer
->
Scratch
(
lhs
.
size
()
*
sizeof
(
float
)),
DT_FLOAT
));
}
if
(
!
rhs
.
is_const
())
{
packed_lhs_
.
reset
(
new
Tensor
(
scratch_buffer
->
Scratch
(
rhs
.
size
()
*
sizeof
(
float
)),
DT_FLOAT
));
}
packed_result_
.
reset
(
new
Tensor
(
scratch_buffer
->
Scratch
(
result
->
size
()
*
sizeof
(
float
)),
DT_FLOAT
));
}
if
(
packed_lhs_
.
get
()
==
nullptr
)
{
packed_lhs_
.
reset
(
new
Tensor
(
GetCPUAllocator
(),
DT_FLOAT
));
packed_lhs_
->
Resize
({
lhs
.
size
()});
}
if
(
packed_rhs_
.
get
()
==
nullptr
)
{
packed_rhs_
.
reset
(
new
Tensor
(
GetCPUAllocator
(),
DT_FLOAT
));
packed_rhs_
->
Resize
({
rhs
.
size
()});
}
}
if
(
!
packed_
||
!
rhs
.
is_const
())
{
if
(
packed_result_
.
get
()
==
nullptr
)
{
PackRhs
(
rhs
,
&
packed_rhs_
);
packed_result_
.
reset
(
new
Tensor
(
GetCPUAllocator
(),
DT_FLOAT
));
packed_result_
->
Resize
({
result
->
size
()});
}
if
(
!
lhs
.
is_const
()
||
!
packed_
)
{
PackLhs
(
lhs
,
packed_lhs_
.
get
());
}
if
(
!
rhs
.
is_const
()
||
!
packed_
)
{
PackRhs
(
rhs
,
packed_rhs_
.
get
());
}
}
packed_
=
true
;
packed_
=
true
;
operator
()(
packed_lhs_
,
RunInternal
(
*
packed_lhs_
,
packed_rhs_
,
*
packed_rhs_
,
lhs
.
row
(),
lhs
.
batch
(),
lhs
.
col
(),
lhs
.
row
(),
rhs
.
col
(),
lhs
.
col
(),
&
packed_result_
);
rhs
.
col
(),
UnPack
(
packed_result_
,
result
);
packed_result_
.
get
());
UnPack
(
*
packed_result_
,
result
);
}
void
SGemm
::
Run
(
const
float
*
A
,
const
float
*
B
,
const
index_t
batch
,
const
index_t
height_a
,
const
index_t
width_a
,
const
index_t
height_b
,
const
index_t
width_b
,
const
bool
transpose_a
,
const
bool
transpose_b
,
const
bool
is_a_weight
,
const
bool
is_b_weight
,
float
*
C
,
ScratchBuffer
*
scratch_buffer
)
{
index_t
height_c
=
height_a
;
index_t
width_c
=
width_b
;
if
(
transpose_a
)
{
height_c
=
width_a
;
}
if
(
transpose_b
)
{
width_c
=
height_b
;
}
MatrixMap
<
const
float
>
matrix_a
=
MatrixMap
<
const
float
>
(
batch
,
height_a
,
width_a
,
kernels
::
RowMajor
,
A
,
is_a_weight
);
MatrixMap
<
const
float
>
matrix_b
=
kernels
::
MatrixMap
<
const
float
>
(
batch
,
height_b
,
width_b
,
kernels
::
RowMajor
,
B
,
is_b_weight
);
if
(
transpose_a
)
{
matrix_a
=
matrix_a
.
transpose
();
}
if
(
transpose_b
)
{
matrix_b
=
matrix_b
.
transpose
();
}
MatrixMap
<
float
>
matrix_c
(
batch
,
height_c
,
width_c
,
kernels
::
RowMajor
,
C
);
operator
()(
matrix_a
,
matrix_b
,
&
matrix_c
,
scratch_buffer
);
}
}
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
...
@@ -141,17 +232,43 @@ void SGemm::operator()(const MatrixMap<const float> &lhs,
...
@@ -141,17 +232,43 @@ void SGemm::operator()(const MatrixMap<const float> &lhs,
#endif // __aarch64__
#endif // __aarch64__
#endif // MACE_ENABLE_NEON
#endif // MACE_ENABLE_NEON
void
SGemm
::
operator
()(
const
PackedBlock
<
float
>
&
lhs
,
void
SGemm
::
RunInternal
(
const
PackedBlock
&
lhs
,
const
PackedBlock
<
float
>
&
rhs
,
const
PackedBlock
&
rhs
,
const
index_t
height
,
const
index_t
batch
,
const
index_t
depth
,
const
index_t
height
,
const
index_t
width
,
const
index_t
depth
,
PackedBlock
<
float
>
*
result
)
{
const
index_t
width
,
result
->
tensor
()
->
Resize
({
height
*
width
});
PackedBlock
*
result
)
{
const
float
*
lhs_data
=
lhs
.
data
();
const
float
*
lhs_data
=
lhs
.
data
<
float
>
();
const
float
*
rhs_data
=
rhs
.
data
();
const
float
*
rhs_data
=
rhs
.
data
<
float
>
();
float
*
result_data
=
result
->
mutable_data
();
float
*
result_data
=
result
->
mutable_data
<
float
>
();
#define MACE_SGEMM_RUN_PER_BATCH \
for (index_t b = 0; b < batch; ++b) { \
RunPerBatch(lhs_data + b * height * depth, \
rhs_data + b * depth * width, \
height, \
depth, \
width, \
result_data + b * height * width); \
}
if
(
batch
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
MACE_SGEMM_RUN_PER_BATCH
}
else
{
MACE_SGEMM_RUN_PER_BATCH
}
#undef MACE_SGEMM_RUN_PER_BATCH
}
void
SGemm
::
RunPerBatch
(
const
float
*
lhs_data
,
const
float
*
rhs_data
,
const
index_t
height
,
const
index_t
depth
,
const
index_t
width
,
float
*
result_data
)
{
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
const
index_t
block_w
=
width
>>
2
;
const
index_t
block_w
=
width
>>
2
;
const
index_t
remain_w
=
width
-
(
block_w
<<
2
);
const
index_t
remain_w
=
width
-
(
block_w
<<
2
);
...
@@ -508,11 +625,10 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
...
@@ -508,11 +625,10 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
float32x4_t
c0
=
vdupq_n_f32
(
0.
f
);
float32x4_t
c0
=
vdupq_n_f32
(
0.
f
);
#if defined(__aarch64__)
// d: 8
block_d
=
remain_d
>>
3
;
block_d
=
remain_d
>>
3
;
remain_d
-=
(
block_d
<<
3
);
remain_d
-=
(
block_d
<<
3
);
// d: 8
for
(
index_t
bd
=
0
;
bd
<
block_d
;
++
bd
)
{
for
(
index_t
bd
=
0
;
bd
<
block_d
;
++
bd
)
{
// 1.8.4
// 1.8.4
float32x4_t
a0
,
a1
;
float32x4_t
a0
,
a1
;
...
@@ -535,7 +651,6 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
...
@@ -535,7 +651,6 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
lhs_ptr
+=
8
;
lhs_ptr
+=
8
;
rhs_ptr
+=
32
;
rhs_ptr
+=
32
;
}
}
#endif // __aarch64__
block_d
=
remain_d
>>
2
;
block_d
=
remain_d
>>
2
;
remain_d
-=
(
block_d
<<
2
);
remain_d
-=
(
block_d
<<
2
);
...
@@ -608,7 +723,7 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
...
@@ -608,7 +723,7 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
index_t
remain_d
=
depth
;
index_t
remain_d
=
depth
;
float32x4_t
c0
,
c1
,
c2
,
c3
,
c4
,
c5
,
c6
,
c7
;
float32x4_t
c0
,
c1
;
c0
=
vdupq_n_f32
(
0.
f
);
c0
=
vdupq_n_f32
(
0.
f
);
c1
=
vdupq_n_f32
(
0.
f
);
c1
=
vdupq_n_f32
(
0.
f
);
...
@@ -787,93 +902,74 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
...
@@ -787,93 +902,74 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
}
}
void
SGemm
::
PackLhs
(
const
MatrixMap
<
const
float
>
&
lhs
,
void
SGemm
::
PackLhs
(
const
MatrixMap
<
const
float
>
&
lhs
,
PackedBlock
<
float
>
*
packed_block
)
{
PackedBlock
*
packed_block
)
{
Pack
(
lhs
,
PackOrder
::
ColMajor
,
packed_block
);
Pack
(
lhs
,
PackOrder
::
ColMajor
,
packed_block
);
}
}
void
SGemm
::
PackRhs
(
const
MatrixMap
<
const
float
>
&
rhs
,
void
SGemm
::
PackRhs
(
const
MatrixMap
<
const
float
>
&
rhs
,
PackedBlock
<
float
>
*
packed_block
)
{
PackedBlock
*
packed_block
)
{
Pack
(
rhs
,
PackOrder
::
RowMajor
,
packed_block
);
Pack
(
rhs
,
PackOrder
::
RowMajor
,
packed_block
);
}
}
void
SGemm
::
UnPack
(
const
PackedBlock
<
float
>
&
packed_result
,
void
SGemm
::
Pack
(
const
MatrixMap
<
const
float
>
&
src
,
const
PackOrder
order
,
PackedBlock
*
packed_block
)
{
MACE_CHECK_NOTNULL
(
packed_block
);
const
index_t
height
=
src
.
row
();
const
index_t
width
=
src
.
col
();
auto
packed_data
=
packed_block
->
mutable_data
<
float
>
();
#define MACE_SGEMM_PACK_PER_BATCH \
for (index_t b = 0; b < src.batch(); ++b) { \
PackPerBatch(src, order, b, packed_data + b * height * width); \
}
if
(
src
.
batch
()
>=
MaceOpenMPThreadCount
)
{
#pragma omp parallel for
MACE_SGEMM_PACK_PER_BATCH
}
else
{
MACE_SGEMM_PACK_PER_BATCH
}
#undef MACE_SGEMM_PACK_PER_BATCH
}
void
SGemm
::
UnPack
(
const
PackedBlock
&
packed_result
,
MatrixMap
<
float
>
*
matrix_map
)
{
MatrixMap
<
float
>
*
matrix_map
)
{
MACE_CHECK_NOTNULL
(
matrix_map
);
MACE_CHECK_NOTNULL
(
matrix_map
);
const
index_t
height
=
matrix_map
->
row
();
const
index_t
height
=
matrix_map
->
row
();
const
index_t
width
=
matrix_map
->
col
();
const
index_t
width
=
matrix_map
->
col
();
auto
packed_data
=
packed_result
.
data
();
auto
packed_data
=
packed_result
.
data
<
float
>
();
auto
unpacked_data
=
matrix_map
->
data
();
if
(
matrix_map
->
major
()
==
Major
::
RowMajor
)
{
#define MACE_SGEMM_UNPACK_PER_BATCH \
// This is for non-transposed result
for (index_t b = 0; b < matrix_map->batch(); ++b) { \
index_t
w
=
0
;
UnPackPerBatch(packed_data + b * height * width, b, matrix_map); \
#if defined(MACE_ENABLE_NEON)
}
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
if
(
matrix_map
->
batch
()
>=
MaceOpenMPThreadCount
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
const
index_t
packed_offset
=
h
*
4
;
const
index_t
unpacked_offset
=
h
*
width
;
float32x4_t
vs
=
vld1q_f32
(
packed_data_ptr
+
packed_offset
);
vst1q_f32
(
unpacked_data_ptr
+
unpacked_offset
,
vs
);
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
MACE_SGEMM_UNPACK_PER_BATCH
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
unpacked_data_ptr
[
h
*
width
]
=
packed_data_ptr
[
h
];
}
}
}
else
{
}
else
{
// This is for transposed result
MACE_SGEMM_UNPACK_PER_BATCH
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
*
height
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
const
index_t
packed_offset
=
h
*
4
;
const
index_t
unpacked_offset
=
h
;
float32x4_t
vs
=
vld1q_f32
(
packed_data_ptr
+
packed_offset
);
unpacked_data_ptr
[
unpacked_offset
]
=
vs
[
0
];
unpacked_data_ptr
[
unpacked_offset
+
height
]
=
vs
[
1
];
unpacked_data_ptr
[
unpacked_offset
+
2
*
height
]
=
vs
[
2
];
unpacked_data_ptr
[
unpacked_offset
+
3
*
height
]
=
vs
[
3
];
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
packed_data
+
iw
*
height
,
height
,
unpacked_data
+
iw
*
height
);
}
}
}
#undef MACE_SGEMM_UNPACK_PER_BATCH
}
}
void
SGemm
::
Pack
(
const
MatrixMap
<
const
float
>
&
src
,
void
SGemm
::
PackPerBatch
(
const
MatrixMap
<
const
float
>
&
src
,
const
PackOrder
order
,
const
PackOrder
order
,
PackedBlock
<
float
>
*
packed_block
)
{
const
index_t
batch_index
,
MACE_CHECK_NOTNULL
(
packed_block
);
float
*
packed_data
)
{
MACE_CHECK_NOTNULL
(
packed_data
);
const
index_t
height
=
src
.
row
();
const
index_t
height
=
src
.
row
();
const
index_t
width
=
src
.
col
();
const
index_t
width
=
src
.
col
();
packed_block
->
tensor
()
->
Resize
({
height
*
width
});
auto
src_data
=
src
.
batch_data
(
batch_index
);
auto
src_data
=
src
.
data
();
auto
packed_data
=
packed_block
->
mutable_data
();
if
(
src
.
major
()
==
Major
::
RowMajor
&&
order
==
PackOrder
::
ColMajor
)
{
if
(
src
.
major
()
==
Major
::
RowMajor
&&
order
==
PackOrder
::
ColMajor
)
{
// This is for packing no-transpose lhs.
// This is for packing no-transpose lhs.
index_t
h
=
0
;
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
const
float
*
src_data_ptr
=
src_data
+
ih
*
width
;
...
@@ -919,7 +1015,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
...
@@ -919,7 +1015,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
// This is for packing transpose-needed lhs.
// This is for packing transpose-needed lhs.
index_t
h
=
0
;
index_t
h
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
for
(
index_t
ih
=
h
;
ih
<=
height
-
8
;
ih
+=
8
)
{
const
float
*
src_data_ptr
=
src_data
+
ih
;
const
float
*
src_data_ptr
=
src_data
+
ih
;
...
@@ -960,7 +1056,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
...
@@ -960,7 +1056,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
// This is for packing no-transpose rhs.
// This is for packing no-transpose rhs.
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
;
const
float
*
src_data_ptr
=
src_data
+
iw
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
@@ -985,7 +1081,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
...
@@ -985,7 +1081,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
// This is for packing transpose-needed rhs.
// This is for packing transpose-needed rhs.
index_t
w
=
0
;
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
src_data_ptr
=
src_data
+
iw
*
height
;
const
float
*
src_data_ptr
=
src_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
...
@@ -1008,5 +1104,67 @@ void SGemm::Pack(const MatrixMap<const float> &src,
...
@@ -1008,5 +1104,67 @@ void SGemm::Pack(const MatrixMap<const float> &src,
}
}
}
}
void
SGemm
::
UnPackPerBatch
(
const
float
*
packed_data
,
const
index_t
batch_index
,
MatrixMap
<
float
>
*
matrix_map
)
{
MACE_CHECK_NOTNULL
(
matrix_map
);
const
index_t
height
=
matrix_map
->
row
();
const
index_t
width
=
matrix_map
->
col
();
auto
unpacked_data
=
matrix_map
->
batch_data
(
batch_index
);
if
(
matrix_map
->
major
()
==
Major
::
RowMajor
)
{
// This is for non-transposed result
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
const
index_t
packed_offset
=
h
*
4
;
const
index_t
unpacked_offset
=
h
*
width
;
float32x4_t
vs
=
vld1q_f32
(
packed_data_ptr
+
packed_offset
);
vst1q_f32
(
unpacked_data_ptr
+
unpacked_offset
,
vs
);
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
unpacked_data_ptr
[
h
*
width
]
=
packed_data_ptr
[
h
];
}
}
}
else
{
// This is for transposed result
index_t
w
=
0
;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<=
width
-
4
;
iw
+=
4
)
{
const
float
*
packed_data_ptr
=
packed_data
+
iw
*
height
;
float
*
unpacked_data_ptr
=
unpacked_data
+
iw
*
height
;
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
const
index_t
packed_offset
=
h
*
4
;
const
index_t
unpacked_offset
=
h
;
float32x4_t
vs
=
vld1q_f32
(
packed_data_ptr
+
packed_offset
);
unpacked_data_ptr
[
unpacked_offset
]
=
vs
[
0
];
unpacked_data_ptr
[
unpacked_offset
+
height
]
=
vs
[
1
];
unpacked_data_ptr
[
unpacked_offset
+
2
*
height
]
=
vs
[
2
];
unpacked_data_ptr
[
unpacked_offset
+
3
*
height
]
=
vs
[
3
];
}
}
w
+=
(
width
-
w
)
/
4
*
4
;
#endif
#pragma omp parallel for
for
(
index_t
iw
=
w
;
iw
<
width
;
++
iw
)
{
std
::
copy_n
(
packed_data
+
iw
*
height
,
height
,
unpacked_data
+
iw
*
height
);
}
}
}
}
// namespace kernels
}
// namespace kernels
}
// namespace mace
}
// namespace mace
mace/kernels/sgemm.h
浏览文件 @
26592a86
...
@@ -39,11 +39,13 @@ class MatrixMap {
...
@@ -39,11 +39,13 @@ class MatrixMap {
public:
public:
MatrixMap
()
{}
MatrixMap
()
{}
MatrixMap
(
const
index_t
row
,
MatrixMap
(
const
index_t
batch
,
const
index_t
row
,
const
index_t
col
,
const
index_t
col
,
const
Major
major
,
const
Major
major
,
T
*
data
,
T
*
data
,
const
bool
is_const
=
false
)
:
const
bool
is_const
=
false
)
:
batch_
(
batch
),
row_
(
row
),
row_
(
row
),
col_
(
col
),
col_
(
col
),
stride_
(
major
==
RowMajor
?
col
:
row
),
stride_
(
major
==
RowMajor
?
col
:
row
),
...
@@ -53,7 +55,11 @@ class MatrixMap {
...
@@ -53,7 +55,11 @@ class MatrixMap {
MatrixMap
transpose
()
const
{
MatrixMap
transpose
()
const
{
Major
transpose_major
=
major_
==
RowMajor
?
ColMajor
:
RowMajor
;
Major
transpose_major
=
major_
==
RowMajor
?
ColMajor
:
RowMajor
;
return
MatrixMap
(
col_
,
row_
,
transpose_major
,
data_
,
is_const_
);
return
MatrixMap
(
batch_
,
col_
,
row_
,
transpose_major
,
data_
,
is_const_
);
}
index_t
batch
()
const
{
return
batch_
;
}
}
index_t
row
()
const
{
index_t
row
()
const
{
...
@@ -76,8 +82,12 @@ class MatrixMap {
...
@@ -76,8 +82,12 @@ class MatrixMap {
return
data_
;
return
data_
;
}
}
T
*
data
(
int
row
,
int
col
)
const
{
T
*
batch_data
(
index_t
batch
)
const
{
return
data_
+
row
*
stride_
+
col
;
return
data_
+
batch
*
row_
*
col_
;
}
index_t
size
()
const
{
return
batch_
*
row_
*
col_
;
}
}
bool
is_const
()
const
{
bool
is_const
()
const
{
...
@@ -85,6 +95,7 @@ class MatrixMap {
...
@@ -85,6 +95,7 @@ class MatrixMap {
}
}
private:
private:
index_t
batch_
;
index_t
row_
;
index_t
row_
;
index_t
col_
;
index_t
col_
;
index_t
stride_
;
index_t
stride_
;
...
@@ -94,61 +105,76 @@ class MatrixMap {
...
@@ -94,61 +105,76 @@ class MatrixMap {
};
};
typedef
Major
PackOrder
;
typedef
Major
PackOrder
;
typedef
Tensor
PackedBlock
;
template
<
typename
T
>
class
PackedBlock
{
public:
PackedBlock
()
:
data_tensor_
(
GetCPUAllocator
(),
DataTypeToEnum
<
T
>::
v
())
{}
const
T
*
data
()
const
{
return
data_tensor_
.
data
<
T
>
();
}
T
*
mutable_data
()
{
return
data_tensor_
.
mutable_data
<
T
>
();
}
Tensor
*
tensor
()
{
return
&
data_tensor_
;
}
private:
Tensor
data_tensor_
;
};
class
SGemm
{
class
SGemm
{
public:
public:
SGemm
()
:
packed_
(
false
)
{}
SGemm
()
:
packed_lhs_
(
nullptr
),
packed_rhs_
(
nullptr
),
packed_
(
false
)
{}
void
operator
()(
const
MatrixMap
<
const
float
>
&
lhs
,
void
operator
()(
const
MatrixMap
<
const
float
>
&
lhs
,
const
MatrixMap
<
const
float
>
&
rhs
,
const
MatrixMap
<
const
float
>
&
rhs
,
MatrixMap
<
float
>
*
result
);
MatrixMap
<
float
>
*
result
,
ScratchBuffer
*
scratch_buffer
=
nullptr
);
void
operator
()(
const
PackedBlock
<
float
>
&
lhs
,
const
PackedBlock
<
float
>
&
rhs
,
void
Run
(
const
float
*
A
,
const
index_t
height
,
const
float
*
B
,
const
index_t
depth
,
const
index_t
batch
,
const
index_t
width
,
const
index_t
height_a
,
PackedBlock
<
float
>
*
result
);
const
index_t
width_a
,
const
index_t
height_b
,
const
index_t
width_b
,
const
bool
transpose_a
,
const
bool
transpose_b
,
const
bool
is_a_weight
,
const
bool
is_b_weight
,
float
*
C
,
ScratchBuffer
*
scratch_buffer
=
nullptr
);
void
PackLhs
(
const
MatrixMap
<
const
float
>
&
lhs
,
void
PackLhs
(
const
MatrixMap
<
const
float
>
&
lhs
,
PackedBlock
<
float
>
*
packed_block
);
PackedBlock
*
packed_block
);
void
PackRhs
(
const
MatrixMap
<
const
float
>
&
rhs
,
void
PackRhs
(
const
MatrixMap
<
const
float
>
&
rhs
,
PackedBlock
<
float
>
*
packed_block
);
PackedBlock
*
packed_block
);
void
UnPack
(
const
PackedBlock
<
float
>
&
packed_result
,
void
UnPack
(
const
PackedBlock
&
packed_result
,
MatrixMap
<
float
>
*
matrix_map
);
MatrixMap
<
float
>
*
matrix_map
);
private:
private:
void
Pack
(
const
MatrixMap
<
const
float
>
&
src
,
void
Pack
(
const
MatrixMap
<
const
float
>
&
src
,
const
PackOrder
order
,
const
PackOrder
order
,
PackedBlock
<
float
>
*
packed_block
);
PackedBlock
*
packed_block
);
void
PackPerBatch
(
const
MatrixMap
<
const
float
>
&
src
,
const
PackOrder
order
,
const
index_t
batch_index
,
float
*
packed_data
);
void
UnPackPerBatch
(
const
float
*
packed_data
,
const
index_t
batch_index
,
MatrixMap
<
float
>
*
matrix_map
);
void
RunInternal
(
const
PackedBlock
&
lhs
,
const
PackedBlock
&
rhs
,
const
index_t
batch
,
const
index_t
height
,
const
index_t
depth
,
const
index_t
width
,
PackedBlock
*
result
);
void
RunPerBatch
(
const
float
*
lhs
,
const
float
*
rhs
,
const
index_t
height
,
const
index_t
depth
,
const
index_t
width
,
float
*
result
);
std
::
unique_ptr
<
Tensor
>
packed_lhs_
;
std
::
unique_ptr
<
Tensor
>
packed_rhs_
;
std
::
unique_ptr
<
Tensor
>
packed_result_
;
PackedBlock
<
float
>
packed_lhs_
;
PackedBlock
<
float
>
packed_rhs_
;
PackedBlock
<
float
>
packed_result_
;
bool
packed_
;
bool
packed_
;
};
};
...
...
mace/kernels/sgemm_test.cc
→
mace/kernels/sgemm_
pack_
test.cc
浏览文件 @
26592a86
...
@@ -31,17 +31,17 @@ void TestPack(const std::vector<float> &data,
...
@@ -31,17 +31,17 @@ void TestPack(const std::vector<float> &data,
Major
src_order
,
Major
src_order
,
PackOrder
pack_order
)
{
PackOrder
pack_order
)
{
SGemm
sg
;
SGemm
sg
;
MatrixMap
<
const
float
>
src_matrix
(
height
,
width
,
src_order
,
data
.
data
());
MatrixMap
<
const
float
>
src_matrix
(
1
,
height
,
width
,
src_order
,
data
.
data
());
PackedBlock
<
float
>
packed
;
PackedBlock
packed
;
packed
.
tensor
()
->
Resize
({
height
,
width
});
packed
.
Resize
({
height
,
width
});
if
(
pack_order
==
PackOrder
::
ColMajor
)
{
if
(
pack_order
==
PackOrder
::
ColMajor
)
{
sg
.
PackLhs
(
src_matrix
,
&
packed
);
sg
.
PackLhs
(
src_matrix
,
&
packed
);
}
else
{
}
else
{
sg
.
PackRhs
(
src_matrix
,
&
packed
);
sg
.
PackRhs
(
src_matrix
,
&
packed
);
}
}
auto
packed_data
=
packed
.
data
();
auto
packed_data
=
packed
.
data
<
float
>
();
for
(
index_t
i
=
0
;
i
<
packed
.
tensor
()
->
size
();
++
i
)
{
for
(
index_t
i
=
0
;
i
<
packed
.
size
();
++
i
)
{
EXPECT_EQ
(
expected_data
[
i
],
packed_data
[
i
]);
EXPECT_EQ
(
expected_data
[
i
],
packed_data
[
i
]);
}
}
}
}
...
@@ -57,9 +57,9 @@ void TestUnPack(const index_t height,
...
@@ -57,9 +57,9 @@ void TestUnPack(const index_t height,
data
[
i
]
=
rand_r
(
&
seed
);
data
[
i
]
=
rand_r
(
&
seed
);
}
}
MatrixMap
<
const
float
>
src_matrix
(
height
,
width
,
src_order
,
data
.
data
());
MatrixMap
<
const
float
>
src_matrix
(
1
,
height
,
width
,
src_order
,
data
.
data
());
PackedBlock
<
float
>
packed
;
PackedBlock
packed
;
packed
.
tensor
()
->
Resize
({
height
,
width
});
packed
.
Resize
({
height
,
width
});
SGemm
sg
;
SGemm
sg
;
if
(
pack_order
==
PackOrder
::
ColMajor
)
{
if
(
pack_order
==
PackOrder
::
ColMajor
)
{
sg
.
PackLhs
(
src_matrix
,
&
packed
);
sg
.
PackLhs
(
src_matrix
,
&
packed
);
...
@@ -68,17 +68,18 @@ void TestUnPack(const index_t height,
...
@@ -68,17 +68,18 @@ void TestUnPack(const index_t height,
}
}
std
::
vector
<
float
>
unpacked
(
matrix_size
);
std
::
vector
<
float
>
unpacked
(
matrix_size
);
MatrixMap
<
float
>
unpacked_matrix
(
height
,
width
,
src_order
,
unpacked
.
data
());
MatrixMap
<
float
>
unpacked_matrix
(
1
,
height
,
width
,
src_order
,
unpacked
.
data
());
sg
.
UnPack
(
packed
,
&
unpacked_matrix
);
sg
.
UnPack
(
packed
,
&
unpacked_matrix
);
auto
unpacked_data
=
unpacked
.
data
();
auto
unpacked_data
=
unpacked
.
data
();
for
(
index_t
i
=
0
;
i
<
packed
.
tensor
()
->
size
();
++
i
)
{
for
(
index_t
i
=
0
;
i
<
packed
.
size
();
++
i
)
{
EXPECT_EQ
(
data
[
i
],
unpacked_data
[
i
]);
EXPECT_EQ
(
data
[
i
],
unpacked_data
[
i
]);
}
}
}
}
}
// namespace
}
// namespace
TEST
(
SGemmTest
,
Pack
)
{
TEST
(
SGemm
Pack
Test
,
Pack
)
{
std
::
vector
<
float
>
data
=
std
::
vector
<
float
>
data
=
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
,
32
,
33
,
34
,
35
,
36
};
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
,
32
,
33
,
34
,
35
,
36
};
...
@@ -149,7 +150,7 @@ TEST(SGemmTest, Pack) {
...
@@ -149,7 +150,7 @@ TEST(SGemmTest, Pack) {
#endif
#endif
}
}
TEST
(
SGemmTest
,
UnPack
)
{
TEST
(
SGemm
Pack
Test
,
UnPack
)
{
TestUnPack
(
4
,
3
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
TestUnPack
(
4
,
3
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
TestUnPack
(
4
,
4
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
TestUnPack
(
4
,
4
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
TestUnPack
(
4
,
5
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
TestUnPack
(
4
,
5
,
Major
::
RowMajor
,
PackOrder
::
RowMajor
);
...
...
mace/ops/matmul.h
浏览文件 @
26592a86
...
@@ -40,7 +40,11 @@ class MatMulOp : public Operator<D, T> {
...
@@ -40,7 +40,11 @@ class MatMulOp : public Operator<D, T> {
"than or equal to 2"
);
"than or equal to 2"
);
index_t
rank
=
A
->
dim_size
();
index_t
rank
=
A
->
dim_size
();
for
(
index_t
i
=
0
;
i
<
rank
-
2
;
++
i
)
{
for
(
index_t
i
=
0
;
i
<
rank
-
2
;
++
i
)
{
MACE_CHECK
(
A
->
dim
(
i
)
==
B
->
dim
(
i
),
"batch dimensions are not equal"
);
MACE_CHECK
(
A
->
dim
(
i
)
==
B
->
dim
(
i
),
"batch dimensions are not equal: "
,
A
->
dim
(
i
),
" vs. "
,
B
->
dim
(
i
));
}
}
index_t
ak
=
transpose_a_
?
A
->
dim
(
rank
-
2
)
:
A
->
dim
(
rank
-
1
);
index_t
ak
=
transpose_a_
?
A
->
dim
(
rank
-
2
)
:
A
->
dim
(
rank
-
1
);
index_t
bk
=
transpose_b_
?
B
->
dim
(
rank
-
1
)
:
B
->
dim
(
rank
-
2
);
index_t
bk
=
transpose_b_
?
B
->
dim
(
rank
-
1
)
:
B
->
dim
(
rank
-
2
);
...
...
mace/ops/matmul_benchmark.cc
浏览文件 @
26592a86
...
@@ -33,13 +33,15 @@ void MatMulBenchmark(
...
@@ -33,13 +33,15 @@ void MatMulBenchmark(
// Add input data
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
channels
,
out_width
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
channels
,
out_width
});
net
.
GetTensor
(
"A"
)
->
SetIsWeight
(
true
);
net
.
GetTensor
(
"B"
)
->
SetIsWeight
(
true
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
net
.
GetTensor
(
"A"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"A"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"B"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"B"
)
->
SetScale
(
0.1
);
}
}
if
(
D
==
DeviceType
::
GPU
)
{
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
BufferToImage
<
D
,
T
>
(
&
net
,
"B"
,
"BImage"
,
BufferToImage
<
D
,
T
>
(
&
net
,
"B"
,
"BImage"
,
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
...
@@ -71,7 +73,7 @@ void MatMulBenchmark(
...
@@ -71,7 +73,7 @@ void MatMulBenchmark(
mace
::
testing
::
StartTiming
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
while
(
iters
--
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
}
net
.
Sync
();
net
.
Sync
();
}
}
...
@@ -86,6 +88,8 @@ void MatMulTransposeBenchmark(
...
@@ -86,6 +88,8 @@ void MatMulTransposeBenchmark(
// Add input data
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
out_width
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
out_width
,
channels
});
net
.
GetTensor
(
"A"
)
->
SetIsWeight
(
true
);
net
.
GetTensor
(
"B"
)
->
SetIsWeight
(
true
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
net
.
GetTensor
(
"A"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"A"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"B"
)
->
SetScale
(
0.1
);
net
.
GetTensor
(
"B"
)
->
SetScale
(
0.1
);
...
@@ -116,7 +120,7 @@ void MatMulTransposeBenchmark(
...
@@ -116,7 +120,7 @@ void MatMulTransposeBenchmark(
mace
::
testing
::
StartTiming
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
while
(
iters
--
)
{
net
.
Run
Op
(
D
);
net
.
Run
(
);
}
}
net
.
Sync
();
net
.
Sync
();
}
}
...
@@ -154,10 +158,15 @@ void MatMulTransposeBenchmark(
...
@@ -154,10 +158,15 @@ void MatMulTransposeBenchmark(
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU);
MACE_BM_MATMUL
(
1
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
2
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
3
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
4
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
3969
);
MACE_BM_MATMUL
(
16
,
32
,
128
,
3969
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
49
);
MACE_BM_MATMUL
(
16
,
49
,
128
,
128
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
961
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
3969
);
MACE_BM_MATMUL
(
16
,
128
,
128
,
3969
);
...
...
mace/ops/winograd_transform_benchmark.cc
浏览文件 @
26592a86
...
@@ -211,8 +211,8 @@ void WinoMatMulBenchmark(
...
@@ -211,8 +211,8 @@ void WinoMatMulBenchmark(
const
index_t
round_w
=
(
width
+
block_size
-
1
)
/
block_size
;
const
index_t
round_w
=
(
width
+
block_size
-
1
)
/
block_size
;
const
index_t
out_width
=
round_h
*
round_w
;
const
index_t
out_width
=
round_h
*
round_w
;
// Add input data
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
out_channels
,
in_channels
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
out_channels
,
in_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
in_channels
,
out_width
,
1
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
in_channels
,
out_width
});
if
(
D
==
DeviceType
::
GPU
)
{
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
BufferToImage
<
D
,
T
>
(
&
net
,
"A"
,
"AImage"
,
kernels
::
BufferType
::
IN_OUT_WIDTH
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录