Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
冰之2023
Mace
提交
9bbac6ea
Mace
项目概览
冰之2023
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
9bbac6ea
编写于
4月 10, 2018
作者:
W
wuchenghui
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
implement fc with gemv and convert global conv to fc
上级
6cfa7ab0
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
207 addition
and
33 deletion
+207
-33
mace/kernels/arm/conv_winograd_test.cc
mace/kernels/arm/conv_winograd_test.cc
+13
-17
mace/kernels/arm/fully_connected.cc
mace/kernels/arm/fully_connected.cc
+1
-1
mace/kernels/gemm.cc
mace/kernels/gemm.cc
+110
-0
mace/kernels/gemm.h
mace/kernels/gemm.h
+12
-0
mace/kernels/gemm_test.cc
mace/kernels/gemm_test.cc
+37
-11
mace/ops/fully_connected_test.cc
mace/ops/fully_connected_test.cc
+1
-1
mace/python/tools/caffe_converter_lib.py
mace/python/tools/caffe_converter_lib.py
+19
-2
mace/python/tools/tf_converter_lib.py
mace/python/tools/tf_converter_lib.py
+13
-0
mace/tools/validation/mace_run.cc
mace/tools/validation/mace_run.cc
+1
-1
未找到文件。
mace/kernels/arm/conv_winograd_test.cc
浏览文件 @
9bbac6ea
...
...
@@ -5,6 +5,7 @@
#include <gtest/gtest.h>
#include <random>
#include <algorithm>
#include <memory>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
...
...
@@ -25,51 +26,46 @@ TEST(ConvWinogradTest, winograd) {
index_t
filter_size
=
3
*
3
*
in_channels
*
out_channels
;
index_t
output_size
=
batch
*
out_channels
*
out_height
*
out_width
;
float
*
input_data
=
new
float
[
input_size
]
;
float
*
filter_data
=
new
float
[
filter_size
]
;
float
*
output_data
=
new
float
[
output_size
]
;
float
*
output_data_ref
=
new
float
[
output_size
]
;
std
::
unique_ptr
<
float
[]
>
input_data
(
new
float
[
input_size
])
;
std
::
unique_ptr
<
float
[]
>
filter_data
(
new
float
[
filter_size
])
;
std
::
unique_ptr
<
float
[]
>
output_data
(
new
float
[
output_size
])
;
std
::
unique_ptr
<
float
[]
>
output_data_ref
(
new
float
[
output_size
])
;
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
input_data
,
input_data
+
input_size
,
std
::
generate
(
input_data
.
get
(),
input_data
.
get
()
+
input_size
,
[
&
gen
,
&
nd
]
{
return
std
::
max
(
-
1.0
f
,
std
::
min
(
1.0
f
,
nd
(
gen
)));
});
std
::
generate
(
filter_data
,
filter_data
+
filter_size
,
std
::
generate
(
filter_data
.
get
(),
filter_data
.
get
()
+
filter_size
,
[
&
gen
,
&
nd
]
{
return
std
::
max
(
-
1.0
f
,
std
::
min
(
1.0
f
,
nd
(
gen
)));
});
kernels
::
ConvRef3x3s1
(
input_data
,
filter_data
,
kernels
::
ConvRef3x3s1
(
input_data
.
get
()
,
filter_data
.
get
()
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
output_data_ref
);
output_data_ref
.
get
()
);
kernels
::
WinoGradConv3x3s1
(
input_data
,
filter_data
,
kernels
::
WinoGradConv3x3s1
(
input_data
.
get
()
,
filter_data
.
get
()
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
6
,
output_data
);
output_data
.
get
()
);
// test
for
(
index_t
i
=
0
;
i
<
output_size
;
++
i
)
{
EXPECT_NEAR
(
output_data_ref
[
i
],
output_data
[
i
],
0.1
)
<<
" with index "
<<
i
;
}
delete
[]
input_data
;
delete
[]
filter_data
;
delete
[]
output_data
;
delete
[]
output_data_ref
;
}
}
// namespace kernels
...
...
mace/kernels/arm/fully_connected.cc
浏览文件 @
9bbac6ea
...
...
@@ -25,7 +25,7 @@ void FullyConnectedFunctor<DeviceType::NEON,
float
*
output_ptr
=
output
->
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
Gem
m
(
weight_ptr
,
input_ptr
,
1
,
output_size
,
input_size
,
1
,
output_ptr
);
Gem
v
(
weight_ptr
,
input_ptr
,
input_size
,
output_size
,
output_ptr
);
for
(
int
j
=
0
;
j
<
output_size
;
++
j
)
{
output_ptr
[
j
]
+=
bias_ptr
[
j
];
}
...
...
mace/kernels/gemm.cc
浏览文件 @
9bbac6ea
...
...
@@ -514,5 +514,115 @@ void GemmRef(const float *A,
}
}
void
GemvRef
(
const
float
*
m_ptr
,
const
float
*
v_ptr
,
const
index_t
width
,
const
index_t
height
,
float
*
out_ptr
)
{
memset
(
out_ptr
,
0
,
sizeof
(
float
)
*
height
);
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
out_ptr
[
h
]
+=
v_ptr
[
w
]
*
m_ptr
[
h
*
width
+
w
];
}
}
}
void
Gemv
(
const
float
*
m_ptr
,
const
float
*
v_ptr
,
const
index_t
width
,
const
index_t
height
,
float
*
out_ptr
)
{
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
index_t
height_d4
=
height
>>
2
;
index_t
width_d4
=
width
>>
2
;
index_t
remain_w
=
width
-
(
width_d4
<<
2
);
index_t
remain_h
=
height
-
(
height_d4
<<
2
);
#pragma omp parallel for
for
(
index_t
h
=
0
;
h
<
height_d4
;
++
h
)
{
const
float
*
m_ptr0
=
m_ptr
+
h
*
width
*
4
;
const
float
*
m_ptr1
=
m_ptr0
+
width
;
const
float
*
m_ptr2
=
m_ptr1
+
width
;
const
float
*
m_ptr3
=
m_ptr2
+
width
;
const
float
*
v_ptr0
=
v_ptr
;
float
*
out_ptr0
=
out_ptr
+
h
*
4
;
float32x4_t
vm0
,
vm1
,
vm2
,
vm3
;
float32x4_t
vv
;
float32x4_t
vsum0
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum2
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum3
=
vdupq_n_f32
(
0.
f
);
for
(
index_t
w
=
0
;
w
<
width_d4
;
++
w
)
{
vm0
=
vld1q_f32
(
m_ptr0
);
vm1
=
vld1q_f32
(
m_ptr1
);
vm2
=
vld1q_f32
(
m_ptr2
);
vm3
=
vld1q_f32
(
m_ptr3
);
vv
=
vld1q_f32
(
v_ptr0
);
vsum0
=
vmlaq_f32
(
vsum0
,
vm0
,
vv
);
vsum1
=
vmlaq_f32
(
vsum1
,
vm1
,
vv
);
vsum2
=
vmlaq_f32
(
vsum2
,
vm2
,
vv
);
vsum3
=
vmlaq_f32
(
vsum3
,
vm3
,
vv
);
m_ptr0
+=
4
;
m_ptr1
+=
4
;
m_ptr2
+=
4
;
m_ptr3
+=
4
;
v_ptr0
+=
4
;
}
float
sum0
=
vaddvq_f32
(
vsum0
);
float
sum1
=
vaddvq_f32
(
vsum1
);
float
sum2
=
vaddvq_f32
(
vsum2
);
float
sum3
=
vaddvq_f32
(
vsum3
);
// handle remaining w
for
(
index_t
w
=
0
;
w
<
remain_w
;
++
w
)
{
sum0
+=
m_ptr0
[
0
]
*
v_ptr0
[
0
];
sum1
+=
m_ptr1
[
0
]
*
v_ptr0
[
0
];
sum2
+=
m_ptr2
[
0
]
*
v_ptr0
[
0
];
sum3
+=
m_ptr3
[
0
]
*
v_ptr0
[
0
];
m_ptr0
++
;
m_ptr1
++
;
m_ptr2
++
;
m_ptr3
++
;
v_ptr0
++
;
}
*
out_ptr0
++
=
sum0
;
*
out_ptr0
++
=
sum1
;
*
out_ptr0
++
=
sum2
;
*
out_ptr0
++
=
sum3
;
}
// handle remaining h
index_t
remain_start_height
=
height_d4
<<
2
;
#pragma omp parallel for
for
(
index_t
h
=
0
;
h
<
remain_h
;
++
h
)
{
float32x4_t
vsum0
=
vdupq_n_f32
(
0.
f
);
const
float
*
m_ptr0
=
m_ptr
+
(
h
+
remain_start_height
)
*
width
;
const
float
*
v_ptr0
=
v_ptr
;
for
(
index_t
w
=
0
;
w
<
width_d4
;
++
w
)
{
float32x4_t
vm
=
vld1q_f32
(
m_ptr0
);
float32x4_t
vv
=
vld1q_f32
(
v_ptr0
);
vsum0
=
vmlaq_f32
(
vsum0
,
vm
,
vv
);
m_ptr0
+=
4
;
v_ptr0
+=
4
;
}
float
sum
=
vaddvq_f32
(
vsum0
);
for
(
index_t
w
=
0
;
w
<
remain_w
;
++
w
)
{
sum
+=
m_ptr0
[
0
]
*
v_ptr0
[
0
];
m_ptr0
++
;
v_ptr0
++
;
}
out_ptr
[
remain_start_height
+
h
]
=
sum
;
}
#else
GemvRef
(
m_ptr
,
v_ptr
,
width
,
height
,
out_ptr
);
#endif
}
}
// namespace kernels
}
// namespace mace
mace/kernels/gemm.h
浏览文件 @
9bbac6ea
...
...
@@ -29,6 +29,18 @@ void GemmRef(const float *A,
const
index_t
width
,
float
*
C
);
void
Gemv
(
const
float
*
m_ptr
,
const
float
*
v_ptr
,
const
index_t
width
,
const
index_t
height
,
float
*
out_ptr
);
void
GemvRef
(
const
float
*
m_ptr
,
const
float
*
v_ptr
,
const
index_t
width
,
const
index_t
height
,
float
*
out_ptr
);
}
// namespace kernels
}
// namespace mace
...
...
mace/kernels/gemm_test.cc
浏览文件 @
9bbac6ea
...
...
@@ -4,6 +4,7 @@
#include <gtest/gtest.h>
#include <random>
#include <memory>
#include "mace/kernels/gemm.h"
#include "mace/core/types.h"
...
...
@@ -14,33 +15,58 @@ TEST(GEMMTest, gemm) {
index_t
N
=
17
;
index_t
M
=
33
;
index_t
K
=
64
;
float
*
A
=
new
float
[
N
*
K
]
;
float
*
B
=
new
float
[
K
*
M
]
;
float
*
C
=
new
float
[
N
*
M
]
;
float
*
C_ref
=
new
float
[
N
*
M
]
;
std
::
unique_ptr
<
float
[]
>
A
(
new
float
[
N
*
K
])
;
std
::
unique_ptr
<
float
[]
>
B
(
new
float
[
K
*
M
])
;
std
::
unique_ptr
<
float
[]
>
C
(
new
float
[
N
*
M
])
;
std
::
unique_ptr
<
float
[]
>
C_ref
(
new
float
[
N
*
M
])
;
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
A
,
A
+
N
*
K
,
std
::
generate
(
A
.
get
(),
A
.
get
()
+
N
*
K
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
,
B
+
K
*
M
,
std
::
generate
(
B
.
get
(),
B
.
get
()
+
K
*
M
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
kernels
::
Gemm
(
A
,
B
,
1
,
N
,
K
,
M
,
C
);
kernels
::
GemmRef
(
A
,
B
,
N
,
K
,
M
,
C_ref
);
kernels
::
Gemm
(
A
.
get
(),
B
.
get
(),
1
,
N
,
K
,
M
,
C
.
get
()
);
kernels
::
GemmRef
(
A
.
get
(),
B
.
get
(),
N
,
K
,
M
,
C_ref
.
get
()
);
for
(
int
i
=
0
;
i
<
N
*
M
;
++
i
)
{
EXPECT_NEAR
(
C_ref
[
i
],
C
[
i
],
0.1
);
}
}
TEST
(
GEMMTest
,
gemv
)
{
index_t
N
=
17
;
index_t
K
=
63
;
std
::
unique_ptr
<
float
[]
>
A
(
new
float
[
N
*
K
]);
std
::
unique_ptr
<
float
[]
>
B
(
new
float
[
K
]);
std
::
unique_ptr
<
float
[]
>
C
(
new
float
[
N
]);
std
::
unique_ptr
<
float
[]
>
C_ref
(
new
float
[
N
]);
delete
[]
A
;
delete
[]
B
;
delete
[]
C
;
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
A
.
get
(),
A
.
get
()
+
N
*
K
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
std
::
generate
(
B
.
get
(),
B
.
get
()
+
K
,
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
kernels
::
Gemv
(
A
.
get
(),
B
.
get
(),
K
,
N
,
C
.
get
());
kernels
::
GemvRef
(
A
.
get
(),
B
.
get
(),
K
,
N
,
C_ref
.
get
());
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
EXPECT_NEAR
(
C_ref
[
i
],
C
[
i
],
0.1
);
}
}
}
// namespace mace
mace/ops/fully_connected_test.cc
浏览文件 @
9bbac6ea
...
...
@@ -308,7 +308,7 @@ void FullyConnectedTestNEON(const index_t batch,
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"OutputExptected"
),
*
net
.
GetOutput
(
"OutputNeon"
),
0.0
0
1
);
0.01
);
}
TEST_F
(
FullyConnectedOpTest
,
TestNEON
)
{
...
...
mace/python/tools/caffe_converter_lib.py
浏览文件 @
9bbac6ea
...
...
@@ -101,8 +101,13 @@ class Shapes(object):
return
output_shape
@
staticmethod
def
fully_connected_shape
(
input_shape
,
weight_shape
):
return
[
input_shape
[
0
],
1
,
1
,
weight_shape
[
0
]]
def
fully_connected_shape
(
input_shape
,
weight_shape
,
input_format
=
'NHWC'
):
if
input_format
==
'NHWC'
:
return
[
input_shape
[
0
],
1
,
1
,
weight_shape
[
0
]]
elif
input_format
==
'NCHW'
:
return
[
input_shape
[
0
],
weight_shape
[
0
],
1
,
1
]
else
:
raise
Exception
(
"format %s is not supported"
%
input_format
)
@
staticmethod
def
concat_shape
(
input_shapes
,
axis
):
...
...
@@ -445,6 +450,18 @@ class CaffeConverter(object):
final_op
.
output_shape_map
[
final_op
.
layer
.
top
[
0
]]
=
output_shape
self
.
resolved_ops
.
add
(
activation_op
.
name
)
if
op_def
.
type
in
(
"Conv2D"
,
"FusedConv2D"
)
and
\
output_shape
[
2
]
==
1
and
\
((
input_format
==
'NCHW'
and
output_shape
[
3
]
==
1
)
or
(
input_format
==
'NHWC'
and
output_shape
[
1
]
==
1
)):
print
"convert op %s from CONV to FC"
%
op
.
name
op_def
.
type
=
'FC'
filter_shape
=
weight_data
.
shape
new_shape
=
[
filter_shape
[
0
],
filter_shape
[
1
]
*
filter_shape
[
2
]
*
filter_shape
[
3
],
1
,
1
]
weight_data
.
reshape
(
new_shape
)
op_def
.
output
.
extend
([
final_op
.
name
+
':0'
])
self
.
add_output_shape
(
op_def
,
output_shape
)
self
.
net_def
.
op
.
extend
([
op_def
])
...
...
mace/python/tools/tf_converter_lib.py
浏览文件 @
9bbac6ea
...
...
@@ -402,6 +402,19 @@ class TFConverter(object):
final_op
=
op
self
.
resolved_ops
[
op
.
name
]
=
1
# convert global conv to fc
filter_shape
=
get_input_tensor
(
op
,
1
).
shape
.
as_list
()
input_shape
=
get_input_tensor
(
op
,
0
).
shape
.
as_list
()
if
op_def
.
type
==
"Conv2D"
and
input_shape
[
1
]
==
filter_shape
[
0
]
and
\
input_shape
[
2
]
==
filter_shape
[
1
]
and
\
(
op
.
get_attr
(
'padding'
)
==
'VALID'
or
filter_shape
[
0
]
==
1
and
filter_shape
[
1
]
==
1
):
print
"convert op %s from CONV to FC"
%
op
.
name
op_def
.
type
=
'FC'
self
.
reshape_tensor
[
get_input_tensor
(
op
,
1
).
name
]
=
\
[
filter_shape
[
3
],
filter_shape
[
2
]
*
filter_shape
[
1
]
*
filter_shape
[
0
],
1
,
1
]
if
len
(
self
.
tf_graph
.
get
(
op
.
name
,
[]))
==
1
and
\
self
.
tf_graph
[
op
.
name
][
0
].
type
==
'BiasAdd'
:
bias_add_op
=
self
.
tf_graph
[
op
.
name
][
0
]
...
...
mace/tools/validation/mace_run.cc
浏览文件 @
9bbac6ea
...
...
@@ -190,7 +190,7 @@ DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32
(
malloc_check_cycle
,
-
1
,
"malloc debug check cycle, -1 to disable"
);
DEFINE_int32
(
gpu_perf_hint
,
2
,
"0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"
);
DEFINE_int32
(
gpu_priority_hint
,
1
,
"0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"
);
DEFINE_int32
(
omp_num_threads
,
8
,
"num of openmp threads"
);
DEFINE_int32
(
omp_num_threads
,
4
,
"num of openmp threads"
);
DEFINE_int32
(
cpu_power_option
,
0
,
"0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录