Xiaomi / Mace
Commit 5f2001d2
Authored May 18, 2018 by 李寅

Merge branch 'conv1x1' into 'master'

Optimize conv1x1

See merge request !498

Parents: e22a337d 67b7b36a

Showing 5 changed files with 181 additions and 170 deletions
mace/core/buffer.h                 +17  -2
mace/core/tensor.h                 +1   -1
mace/kernels/conv_2d.h             +5   -3
mace/kernels/conv_pool_2d_util.cc  +1   -0
mace/kernels/gemm.cc               +157 -164
mace/core/buffer.h
@@ -61,6 +61,8 @@ class BufferBase {
   virtual void Clear() = 0;

+  virtual void Clear(index_t size) = 0;
+
   virtual index_t offset() const { return 0; }

   template <typename T>
@@ -198,7 +200,11 @@ class Buffer : public BufferBase {
   bool OnHost() const { return allocator_->OnHost(); }

   void Clear() {
-    memset(reinterpret_cast<char *>(raw_mutable_data()), 0, size_);
+    Clear(size_);
+  }
+
+  void Clear(index_t size) {
+    memset(reinterpret_cast<char *>(raw_mutable_data()), 0, size);
   }

  protected:
@@ -312,6 +318,11 @@ class Image : public BufferBase {
     MACE_NOT_IMPLEMENTED;
   }

+  void Clear(index_t size) {
+    MACE_UNUSED(size);
+    MACE_NOT_IMPLEMENTED;
+  }
+
  private:
  Allocator *allocator_;
  std::vector<size_t> shape_;
@@ -431,7 +442,11 @@ class BufferSlice : public BufferBase {
   bool OnHost() const { return buffer_->OnHost(); }

   void Clear() {
-    memset(raw_mutable_data(), 0, size_);
+    Clear(size_);
+  }
+
+  void Clear(index_t size) {
+    memset(raw_mutable_data(), 0, size);
   }

  private:
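The buffer.h hunks above all introduce the same pattern: the whole-buffer Clear() now delegates to a new size-limited Clear(index_t) overload, so callers can zero only the bytes they actually use. A minimal standalone sketch of that pattern (not MACE code; DemoBuffer and its members are invented for illustration):

#include <cstddef>
#include <cstring>
#include <vector>

// Illustrative stand-in for the Buffer pattern introduced above.
class DemoBuffer {
 public:
  explicit DemoBuffer(std::size_t size) : data_(size) {}
  void Clear() { Clear(data_.size()); }  // whole-buffer clear forwards ...
  void Clear(std::size_t size) {         // ... to the size-limited clear
    std::memset(data_.data(), 0, size);
  }

 private:
  std::vector<char> data_;
};

int main() {
  DemoBuffer buf(1024);
  buf.Clear(256);  // zero only the first 256 bytes
  buf.Clear();     // zero the full allocation
  return 0;
}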
mace/core/tensor.h
@@ -208,7 +208,7 @@ class Tensor {
   inline void Clear() {
     MACE_CHECK_NOTNULL(buffer_);
-    buffer_->Clear();
+    buffer_->Clear(raw_size());
   }

   inline void Reshape(const std::vector<index_t> &shape) {
mace/kernels/conv_2d.h
@@ -297,7 +297,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                      output_shape.data());
     }
     output->Resize(output_shape);
-    output->Clear();

     index_t batch = output->dim(0);
     index_t channels = output->dim(1);
@@ -415,7 +414,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       if (extra_input_width != padded_input_width) {
         pad_right += (extra_input_width - padded_input_width);
       }
-    } else {
+    } else if (!use_neon_1x1_s1) {
       extra_output_height = height;
       extra_input_height =
           std::max(padded_input_height, (extra_output_height - 1) * stride_h
@@ -602,7 +601,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     const Tensor *pad_input_ptr = input;
     if (extra_input_height != input_height || extra_input_width != input_width) {
-      padded_input.Clear();
       ConstructNCHWInputWithSpecificPadding(input, pad_top, pad_bottom,
@@ -612,13 +610,17 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
       pad_input_ptr = &padded_input;
     }

+    // TODO(libin): don't need clear after bias is integrated in each conv
     Tensor *pad_output_ptr = output;
     if (extra_output_height != height || extra_output_width != width) {
       padded_output.Reshape({batch, channels, extra_output_height,
                              extra_output_width});
+      padded_output.Clear();
       pad_output_ptr = &padded_output;
-    }
+    } else if (!use_neon_1x1_s1) {
+      output->Clear();
+    }

     const float *pad_input_data = pad_input_ptr->data<float>();
     float *pad_output_data = pad_output_ptr->mutable_data<float>();
mace/kernels/conv_pool_2d_util.cc
@@ -377,6 +377,7 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
   std::vector<index_t> output_shape(
       {batch, channels, height + pad_height, width + pad_width});
   output_tensor->Resize(output_shape);
+  output_tensor->Clear();
   Tensor::MappingGuard padded_output_mapper(output_tensor);
   float *output_data = output_tensor->mutable_data<float>();
mace/kernels/gemm.cc
@@ -140,8 +140,8 @@ inline void GemmTile(const float *A,
 #endif
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
-  for (h = 0; h + 7 < height; h += 8) {
-    for (k = 0; k + 7 < K; k += 8) {
+  for (h = 0; h < height - 7; h += 8) {
+    for (k = 0; k < K - 7; k += 8) {
       const float *a_ptr = A + (h * stride_k + k);
 #ifdef __clang__
       int nw = width >> 2;
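The new loop bounds above still walk the output in 8-row tiles; the inline assembly in the next hunk, which this commit rewrites, is an unrolled NEON version of roughly the following scalar reference. This is a hedged sketch, not the MACE implementation: it assumes row-major A, B and C with leading dimensions stride_k (for A) and stride_w (for B and C), and the a_ptr/b_ptr/c_ptr setup sits outside the hunks shown, so treat the indexing as illustrative.

#include <cstddef>

// Scalar reference for one 8-row tile: C[h][:] += sum over k of A[h][k] * B[k][:].
// The NEON kernel keeps the eight C rows in registers, streams B four floats
// at a time, and unrolls k by 8.
inline void GemmTile8RowsRef(const float *A, const float *B, float *C,
                             std::size_t K, std::size_t width,
                             std::size_t stride_k, std::size_t stride_w) {
  for (std::size_t h = 0; h < 8; ++h) {
    for (std::size_t k = 0; k < K; ++k) {
      const float a = A[h * stride_k + k];    // one scalar of A
      const float *b_row = B + k * stride_w;  // one row of B
      float *c_row = C + h * stride_w;        // accumulator row of C
      for (std::size_t w = 0; w < width; ++w) {
        c_row[w] += a * b_row[w];
      }
    }
  }
}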
@@ -185,153 +185,147 @@ inline void GemmTile(const float *A,
      float *c_ptr7 = C + (h + 7) * stride_w;

      asm volatile(
          "0:                                 \n"
          "prfm pldl1keep, [%1, #128]         \n"
          "ld1 {v24.4s}, [%1]                 \n"

          // load b: 0-7
          "prfm pldl1keep, [%9, #128]         \n"
          "ld1 {v16.4s}, [%9], #16            \n"
          "prfm pldl1keep, [%10, #128]        \n"
          "ld1 {v17.4s}, [%10], #16           \n"
          "prfm pldl1keep, [%11, #128]        \n"
          "ld1 {v18.4s}, [%11], #16           \n"
          "prfm pldl1keep, [%12, #128]        \n"
          "ld1 {v19.4s}, [%12], #16           \n"
          "prfm pldl1keep, [%1, #128]         \n"
          "ld1 {v18.4s}, [%1]                 \n"
          "prfm pldl1keep, [%2, #128]         \n"
          "ld1 {v25.4s}, [%2]                 \n"
          "prfm pldl1keep, [%13, #128]        \n"
          "ld1 {v20.4s}, [%13], #16           \n"
          "prfm pldl1keep, [%14, #128]        \n"
          "ld1 {v21.4s}, [%14], #16           \n"
          "ld1 {v19.4s}, [%2]                 \n"
          "prfm pldl1keep, [%15, #128]        \n"
          "ld1 {v22.4s}, [%15], #16           \n"
          "prfm pldl1keep, [%16, #128]        \n"
          "ld1 {v23.4s}, [%16], #16           \n"
          "0:                                 \n"
          "prfm pldl1keep, [%3, #128]         \n"
          "ld1 {v26.4s}, [%3]                 \n"
          "fmla v24.4s, v16.4s, %34.s[0]      \n"
          "fmla v24.4s, v17.4s, %34.s[1]      \n"
          "fmla v24.4s, v18.4s, %34.s[2]      \n"
          "fmla v24.4s, v19.4s, %34.s[3]      \n"
          "fmla v24.4s, v20.4s, %35.s[0]      \n"
          "fmla v24.4s, v21.4s, %35.s[1]      \n"
          "fmla v24.4s, v22.4s, %35.s[2]      \n"
          "fmla v24.4s, v23.4s, %35.s[3]      \n"
          "st1 {v24.4s}, [%1], #16            \n"
          "fmla v25.4s, v16.4s, %36.s[0]      \n"
          "fmla v25.4s, v17.4s, %36.s[1]      \n"
          "fmla v25.4s, v18.4s, %36.s[2]      \n"
          "fmla v25.4s, v19.4s, %36.s[3]      \n"
          "fmla v25.4s, v20.4s, %37.s[0]      \n"
          "fmla v25.4s, v21.4s, %37.s[1]      \n"
          "fmla v25.4s, v22.4s, %37.s[2]      \n"
          "fmla v25.4s, v23.4s, %37.s[3]      \n"
          "ld1 {v20.4s}, [%3]                 \n"
          "prfm pldl1keep, [%4, #128]         \n"
          "ld1 {v24.4s}, [%4]                 \n"
          "st1 {v25.4s}, [%2], #16            \n"
          "fmla v26.4s, v16.4s, %38.s[0]      \n"
          "fmla v26.4s, v17.4s, %38.s[1]      \n"
          "fmla v26.4s, v18.4s, %38.s[2]      \n"
          "fmla v26.4s, v19.4s, %38.s[3]      \n"
          "fmla v26.4s, v20.4s, %39.s[0]      \n"
          "fmla v26.4s, v21.4s, %39.s[1]      \n"
          "fmla v26.4s, v22.4s, %39.s[2]      \n"
          "fmla v26.4s, v23.4s, %39.s[3]      \n"
          "ld1 {v21.4s}, [%4]                 \n"
          "prfm pldl1keep, [%5, #128]         \n"
          "ld1 {v25.4s}, [%5]                 \n"
          "ld1 {v22.4s}, [%5]                 \n"
          "prfm pldl1keep, [%6, #128]         \n"
          "ld1 {v23.4s}, [%6]                 \n"
          "prfm pldl1keep, [%7, #128]         \n"
          "ld1 {v24.4s}, [%7]                 \n"
          "prfm pldl1keep, [%8, #128]         \n"
          "ld1 {v25.4s}, [%8]                 \n"
          "prfm pldl1keep, [%10, #128]        \n"
          "ld1 {v17.4s}, [%10], #16           \n"
          "st1 {v26.4s}, [%3], #16            \n"
          "fmla v18.4s, v16.4s, %34.s[0]      \n"
          "fmla v19.4s, v16.4s, %35.s[0]      \n"
          "fmla v20.4s, v16.4s, %36.s[0]      \n"
          "fmla v21.4s, v16.4s, %37.s[0]      \n"
          "fmla v22.4s, v16.4s, %38.s[0]      \n"
          "fmla v23.4s, v16.4s, %39.s[0]      \n"
          "fmla v24.4s, v16.4s, %40.s[0]      \n"
          "fmla v24.4s, v17.4s, %40.s[1]      \n"
          "fmla v24.4s, v18.4s, %40.s[2]      \n"
          "fmla v24.4s, v19.4s, %40.s[3]      \n"
          "fmla v25.4s, v16.4s, %41.s[0]      \n"
          "fmla v24.4s, v20.4s, %41.s[0]      \n"
          "fmla v24.4s, v21.4s, %41.s[1]      \n"
          "fmla v24.4s, v22.4s, %41.s[2]      \n"
          "fmla v24.4s, v23.4s, %41.s[3]      \n"
          "fmla v18.4s, v17.4s, %34.s[1]      \n"
          "fmla v19.4s, v17.4s, %35.s[1]      \n"
          "fmla v20.4s, v17.4s, %36.s[1]      \n"
          "fmla v21.4s, v17.4s, %37.s[1]      \n"
          "prfm pldl1keep, [%6, #128]         \n"
          "ld1 {v26.4s}, [%6]                 \n"
          "prfm pldl1keep, [%11, #128]        \n"
          "ld1 {v16.4s}, [%11], #16           \n"
          "st1 {v24.4s}, [%4], #16            \n"
          "fmla v22.4s, v17.4s, %38.s[1]      \n"
          "fmla v23.4s, v17.4s, %39.s[1]      \n"
          "fmla v24.4s, v17.4s, %40.s[1]      \n"
          "fmla v25.4s, v17.4s, %41.s[1]      \n"
          "fmla v25.4s, v16.4s, %42.s[0]      \n"
          "fmla v25.4s, v17.4s, %42.s[1]      \n"
          "fmla v25.4s, v18.4s, %42.s[2]      \n"
          "fmla v25.4s, v19.4s, %42.s[3]      \n"
          "fmla v18.4s, v16.4s, %34.s[2]      \n"
          "fmla v19.4s, v16.4s, %35.s[2]      \n"
          "fmla v20.4s, v16.4s, %36.s[2]      \n"
          "fmla v21.4s, v16.4s, %37.s[2]      \n"
          "fmla v25.4s, v20.4s, %43.s[0]      \n"
          "fmla v25.4s, v21.4s, %43.s[1]      \n"
          "fmla v25.4s, v22.4s, %43.s[2]      \n"
          "fmla v25.4s, v23.4s, %43.s[3]      \n"
          "prfm pldl1keep, [%12, #128]        \n"
          "ld1 {v17.4s}, [%12], #16           \n"
          "prfm pldl1keep, [%7, #128]         \n"
          "ld1 {v24.4s}, [%7]                 \n"
          "fmla v22.4s, v16.4s, %38.s[2]      \n"
          "fmla v23.4s, v16.4s, %39.s[2]      \n"
          "fmla v24.4s, v16.4s, %40.s[2]      \n"
          "fmla v25.4s, v16.4s, %41.s[2]      \n"
          "st1 {v25.4s}, [%5], #16            \n"
          "fmla v18.4s, v17.4s, %34.s[3]      \n"
          "fmla v19.4s, v17.4s, %35.s[3]      \n"
          "fmla v20.4s, v17.4s, %36.s[3]      \n"
          "fmla v21.4s, v17.4s, %37.s[3]      \n"
          "fmla v26.4s, v16.4s, %44.s[0]      \n"
          "fmla v26.4s, v17.4s, %44.s[1]      \n"
          "fmla v26.4s, v18.4s, %44.s[2]      \n"
          "fmla v26.4s, v19.4s, %44.s[3]      \n"
          "prfm pldl1keep, [%13, #128]        \n"
          "ld1 {v16.4s}, [%13], #16           \n"
          "fmla v26.4s, v20.4s, %45.s[0]      \n"
          "fmla v26.4s, v21.4s, %45.s[1]      \n"
          "fmla v26.4s, v22.4s, %45.s[2]      \n"
          "fmla v26.4s, v23.4s, %45.s[3]      \n"
          "fmla v22.4s, v17.4s, %38.s[3]      \n"
          "fmla v23.4s, v17.4s, %39.s[3]      \n"
          "fmla v24.4s, v17.4s, %40.s[3]      \n"
          "fmla v25.4s, v17.4s, %41.s[3]      \n"
          "prfm pldl1keep, [%8, #128]         \n"
          "ld1 {v25.4s}, [%8]                 \n"
          "fmla v18.4s, v16.4s, %42.s[0]      \n"
          "fmla v19.4s, v16.4s, %43.s[0]      \n"
          "fmla v20.4s, v16.4s, %44.s[0]      \n"
          "fmla v21.4s, v16.4s, %45.s[0]      \n"
          "st1 {v26.4s}, [%6], #16            \n"
          "prfm pldl1keep, [%14, #128]        \n"
          "ld1 {v17.4s}, [%14], #16           \n"
          "fmla v24.4s, v16.4s, %46.s[0]      \n"
          "fmla v24.4s, v17.4s, %46.s[1]      \n"
          "fmla v24.4s, v18.4s, %46.s[2]      \n"
          "fmla v24.4s, v19.4s, %46.s[3]      \n"
          "fmla v22.4s, v16.4s, %46.s[0]      \n"
          "fmla v23.4s, v16.4s, %47.s[0]      \n"
          "fmla v24.4s, v16.4s, %48.s[0]      \n"
          "fmla v25.4s, v16.4s, %49.s[0]      \n"
          "fmla v24.4s, v20.4s, %47.s[0]      \n"
          "fmla v24.4s, v21.4s, %47.s[1]      \n"
          "fmla v24.4s, v22.4s, %47.s[2]      \n"
          "fmla v24.4s, v23.4s, %47.s[3]      \n"
          "fmla v18.4s, v17.4s, %42.s[1]      \n"
          "fmla v19.4s, v17.4s, %43.s[1]      \n"
          "fmla v20.4s, v17.4s, %44.s[1]      \n"
          "fmla v21.4s, v17.4s, %45.s[1]      \n"
          "st1 {v24.4s}, [%7], #16            \n"
          "prfm pldl1keep, [%15, #128]        \n"
          "ld1 {v16.4s}, [%15], #16           \n"
          "fmla v25.4s, v16.4s, %48.s[0]      \n"
          "fmla v25.4s, v17.4s, %48.s[1]      \n"
          "fmla v25.4s, v18.4s, %48.s[2]      \n"
          "fmla v25.4s, v19.4s, %48.s[3]      \n"
          "fmla v22.4s, v17.4s, %46.s[1]      \n"
          "fmla v23.4s, v17.4s, %47.s[1]      \n"
          "fmla v24.4s, v17.4s, %48.s[1]      \n"
          "fmla v25.4s, v17.4s, %49.s[1]      \n"
          "fmla v25.4s, v20.4s, %49.s[0]      \n"
          "fmla v25.4s, v21.4s, %49.s[1]      \n"
          "fmla v25.4s, v22.4s, %49.s[2]      \n"
          "fmla v25.4s, v23.4s, %49.s[3]      \n"
          "fmla v18.4s, v16.4s, %42.s[2]      \n"
          "fmla v19.4s, v16.4s, %43.s[2]      \n"
          "fmla v20.4s, v16.4s, %44.s[2]      \n"
          "fmla v21.4s, v16.4s, %45.s[2]      \n"
          "prfm pldl1keep, [%16, #128]        \n"
          "ld1 {v17.4s}, [%16], #16           \n"
          "fmla v22.4s, v16.4s, %46.s[2]      \n"
          "fmla v23.4s, v16.4s, %47.s[2]      \n"
          "fmla v24.4s, v16.4s, %48.s[2]      \n"
          "fmla v25.4s, v16.4s, %49.s[2]      \n"
          "fmla v18.4s, v17.4s, %42.s[3]      \n"
          "fmla v19.4s, v17.4s, %43.s[3]      \n"
          "fmla v20.4s, v17.4s, %44.s[3]      \n"
          "fmla v21.4s, v17.4s, %45.s[3]      \n"
          "st1 {v18.4s}, [%1], #16            \n"
          "st1 {v19.4s}, [%2], #16            \n"
          "st1 {v20.4s}, [%3], #16            \n"
          "st1 {v21.4s}, [%4], #16            \n"
          "fmla v22.4s, v17.4s, %46.s[3]      \n"
          "fmla v23.4s, v17.4s, %47.s[3]      \n"
          "fmla v24.4s, v17.4s, %48.s[3]      \n"
          "fmla v25.4s, v17.4s, %49.s[3]      \n"
          "st1 {v22.4s}, [%5], #16            \n"
          "st1 {v23.4s}, [%6], #16            \n"
          "st1 {v24.4s}, [%7], #16            \n"
          "st1 {v25.4s}, [%8], #16            \n"
          "prfm pldl1keep, [%9, #128]         \n"
          "ld1 {v16.4s}, [%9], #16            \n"
          "prfm pldl1keep, [%1, #128]         \n"
          "ld1 {v18.4s}, [%1]                 \n"
          "prfm pldl1keep, [%2, #128]         \n"
          "ld1 {v19.4s}, [%2]                 \n"
          "subs %w0, %w0, #1                  \n"
          "bne 0b                             \n"
          : "=r"(nw),     // 0
@@ -369,20 +363,20 @@ inline void GemmTile(const float *A,
            "15"(b_ptr6),  // 32
            "16"(b_ptr7),  // 33
            "w"(a0),       // 34
-           "w"(a1),       // 35
-           "w"(a2),       // 36
-           "w"(a3),       // 37
-           "w"(a4),       // 38
-           "w"(a5),       // 39
-           "w"(a6),       // 40
-           "w"(a7),       // 41
-           "w"(a8),       // 42
-           "w"(a9),       // 43
-           "w"(a10),      // 44
-           "w"(a11),      // 45
-           "w"(a12),      // 46
-           "w"(a13),      // 47
-           "w"(a14),      // 48
+           "w"(a2),       // 35
+           "w"(a4),       // 36
+           "w"(a6),       // 37
+           "w"(a8),       // 38
+           "w"(a10),      // 39
+           "w"(a12),      // 40
+           "w"(a14),      // 41
+           "w"(a1),       // 42
+           "w"(a3),       // 43
+           "w"(a5),       // 44
+           "w"(a7),       // 45
+           "w"(a9),       // 46
+           "w"(a11),      // 47
+           "w"(a13),      // 48
            "w"(a15)       // 49
          : "cc", "memory", "v16",
@@ -585,7 +579,6 @@ void Gemm(const float *A,
   }
-  memset(C, 0, sizeof(float) * batch * height * width);

   // It is better to use large block size if it fits for fast cache.
   // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
   // the block size should be sqrt(32k / sizeof(T) / 3).
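As a quick check of the heuristic in the comment above (a worked example, not code from the repository): with the assumed 32 KB L1 cache and float elements, sqrt(32768 / 4 / 3) is about 52, so square blocks of roughly 52 x 52 floats keep one A, one B and one C block resident at once.

#include <cmath>
#include <cstdio>

int main() {
  const double l1_bytes = 32 * 1024;  // assumed L1 data cache size
  const double resident_blocks = 3;   // one block each of A, B and C
  const double elem_bytes = sizeof(float);
  // block_size = sqrt(32k / sizeof(T) / 3), roughly 52 for float
  std::printf("block size ~= %.1f elements per side\n",
              std::sqrt(l1_bytes / elem_bytes / resident_blocks));
  return 0;
}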