Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
73057ea3
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
73057ea3
编写于
4月 02, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement fc and eltwise for neon
上级
fdf938ce
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
170 addition
and
55 deletion
+170
-55
mace/kernels/arm/fully_connected.cc
mace/kernels/arm/fully_connected.cc
+39
-0
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+14
-0
mace/ops/eltwise.cc
mace/ops/eltwise.cc
+5
-0
mace/ops/fully_connected.cc
mace/ops/fully_connected.cc
+6
-0
mace/ops/fully_connected_test.cc
mace/ops/fully_connected_test.cc
+106
-55
未找到文件。
mace/kernels/arm/fully_connected.cc
0 → 100644
浏览文件 @
73057ea3
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/fully_connected.h"
#include "mace/kernels/gemm.h"
namespace
mace
{
namespace
kernels
{
// Fully-connected layer, NEON/float path.
//
// Resizes `output` to {input->dim(0), weight->dim(0), 1, 1}. For every batch
// item it calls Gemm() with the weight matrix (weight->dim(0) rows,
// weight->dim(1) columns) and that item's input vector, adds the bias when
// one is supplied, and finally runs DoActivation() in place over the whole
// output buffer.
//
// input:  source tensor; dim(0) is taken as the batch count.
//         NOTE(review): assumes each batch item occupies weight->dim(1)
//         contiguous floats in `input` -- confirm against callers.
// weight: weight tensor; dim(0) is the output size, dim(1) the input size.
// bias:   optional, may be nullptr; when present it is indexed by output
//         element and added to every batch item.
// output: destination tensor, resized by this call.
// future: not used by this implementation.
void FullyConnectedFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    Tensor *output,
    StatsFuture *future) {
  std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
  output->Resize(output_shape);

  const index_t N = output->dim(0);
  const index_t input_size = weight->dim(1);
  const index_t output_size = weight->dim(0);

  const float *input_ptr = input->data<float>();
  const float *weight_ptr = weight->data<float>();
  const float *bias_ptr = bias == nullptr ? nullptr : bias->data<float>();
  float *output_ptr = output->mutable_data<float>();

  for (index_t i = 0; i < N; ++i) {
    // Each batch item reads its own input slice and writes its own output
    // slice; without these offsets only the first item would be computed.
    const float *batch_input = input_ptr + i * input_size;
    float *batch_output = output_ptr + i * output_size;

    Gemm(weight_ptr, batch_input, 1, output_size, input_size, 1,
         batch_output);

    // The bias is optional: bias_ptr is nullptr when no bias tensor is given.
    if (bias_ptr != nullptr) {
      for (index_t j = 0; j < output_size; ++j) {
        batch_output[j] += bias_ptr[j];
      }
    }
  }

  // Activation is applied once, in place, across all batch items.
  DoActivation(output_ptr, output_ptr, output->size(), activation_,
               relux_max_limit_);
}
}
// namespace kernels
}
// namespace mace
mace/kernels/fully_connected.h
浏览文件 @
73057ea3
...
...
@@ -76,6 +76,20 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
};
// Explicit specialization of FullyConnectedFunctor for DeviceType::NEON with
// float data. The constructor only forwards its arguments to
// FullyConnectedBase; operator() is declared here and defined out of line.
template <>
struct FullyConnectedFunctor<DeviceType::NEON, float>
    : public FullyConnectedBase {
  // Passes weight_type, activation and relux_max_limit straight through to
  // the base class.
  FullyConnectedFunctor(const BufferType weight_type,
                        const ActivationType activation,
                        const float relux_max_limit)
      : FullyConnectedBase(weight_type, activation, relux_max_limit) {}

  // Runs the layer on `input` with `weight` and `bias`, writing to `output`.
  void operator()(const Tensor *input,
                  const Tensor *weight,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future);
};
template
<
typename
T
>
struct
FullyConnectedFunctor
<
DeviceType
::
OPENCL
,
T
>
:
FullyConnectedBase
{
FullyConnectedFunctor
(
const
BufferType
weight_type
,
...
...
mace/ops/eltwise.cc
浏览文件 @
73057ea3
...
...
@@ -25,6 +25,11 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
.
TypeConstraint
<
half
>
(
"T"
)
.
Build
(),
EltwiseOp
<
DeviceType
::
OPENCL
,
half
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"Eltwise"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
EltwiseOp
<
DeviceType
::
NEON
,
float
>
);
}
}
// namespace ops
...
...
mace/ops/fully_connected.cc
浏览文件 @
73057ea3
...
...
@@ -25,6 +25,12 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
.
TypeConstraint
<
half
>
(
"T"
)
.
Build
(),
FullyConnectedOp
<
DeviceType
::
OPENCL
,
half
>
);
REGISTER_OPERATOR
(
op_registry
,
OpKeyBuilder
(
"FC"
)
.
Device
(
DeviceType
::
NEON
)
.
TypeConstraint
<
float
>
(
"T"
)
.
Build
(),
FullyConnectedOp
<
DeviceType
::
NEON
,
float
>
);
}
}
// namespace ops
...
...
mace/ops/fully_connected_test.cc
浏览文件 @
73057ea3
...
...
@@ -13,7 +13,7 @@ namespace test {
class
FullyConnectedOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_value
,
const
std
::
vector
<
index_t
>
&
weight_shape
,
...
...
@@ -38,12 +38,12 @@ void Simple(const std::vector<index_t> &input_shape,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"weight_type"
,
kernels
::
BufferType
::
WEIGHT_HEIGHT
)
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"weight_type"
,
kernels
::
BufferType
::
WEIGHT_HEIGHT
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
...
...
@@ -52,11 +52,11 @@ void Simple(const std::vector<index_t> &input_shape,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
{
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
...
...
@@ -72,14 +72,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPU) {
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
CPU
>
(
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
}
TEST_F
(
FullyConnectedOpTest
,
SimpleCPUWithBatch
)
{
...
...
@@ -92,14 +92,14 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
},
{
1
},
{
2
},
{
1
,
1
,
1
,
1
},
{
206
});
Simple
<
DeviceType
::
OPENCL
>
(
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
{
1
,
1
,
2
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
2
,
10
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
10
,
20
,
30
,
40
,
50
,
60
,
70
,
80
,
90
,
100
},
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
OPENCL
>
(
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
6
},
{
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
,
10
,
20
,
30
,
40
,
50
,
60
,
1
,
2
,
3
,
4
,
5
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
}
TEST_F
(
FullyConnectedOpTest
,
SimpleGPUWithBatch
)
{
...
...
@@ -107,7 +107,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
{
1
,
2
,
3
,
4
},
{
1
},
{
2
},
{
2
,
1
,
1
,
1
},
{
32
,
72
});
}
template
<
typename
T
>
template
<
typename
T
>
void
Complex
(
const
index_t
batch
,
const
index_t
height
,
const
index_t
width
,
...
...
@@ -118,17 +118,17 @@ void Complex(const index_t batch,
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Weight"
,
{
out_channel
,
height
*
width
*
channels
});
"Weight"
,
{
out_channel
,
height
*
width
*
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Bias"
,
{
out_channel
});
// run cpu
...
...
@@ -147,13 +147,13 @@ void Complex(const index_t batch,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"weight_type"
,
kernels
::
BufferType
::
WEIGHT_HEIGHT
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"weight_type"
,
kernels
::
BufferType
::
WEIGHT_HEIGHT
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run on opencl
net
.
RunOp
(
DeviceType
::
OPENCL
);
...
...
@@ -189,7 +189,7 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex
<
half
>
(
31
,
21
,
11
,
23
,
103
);
}
template
<
typename
T
>
template
<
typename
T
>
void
TestWXFormat
(
const
index_t
batch
,
const
index_t
height
,
const
index_t
width
,
...
...
@@ -200,17 +200,17 @@ void TestWXFormat(const index_t batch,
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"Input"
)
.
Input
(
"Weight"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Weight"
,
{
out_channel
,
height
*
width
*
channels
});
"Weight"
,
{
out_channel
,
height
*
width
*
channels
});
net
.
AddRandomInput
<
DeviceType
::
OPENCL
,
float
>
(
"Bias"
,
{
out_channel
});
// run cpu
...
...
@@ -229,12 +229,12 @@ void TestWXFormat(const index_t batch,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Input
(
"InputImage"
)
.
Input
(
"WeightImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
DeviceType
::
OPENCL
);
...
...
@@ -266,6 +266,57 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
TestWXFormat
<
half
>
(
1
,
16
,
32
,
32
,
32
);
}
// Runs the "FC" op twice on the same random Input/Weight/Bias tensors -- once
// through RunOp() with no device argument and once with DeviceType::NEON --
// and checks that the two outputs agree within 0.001.
//
// batch, height, width, channels: dimensions of the random input tensor.
// out_channel: number of weight rows and bias entries.
void FullyConnectedTestNEON(const index_t batch,
                            const index_t height,
                            const index_t width,
                            const index_t channels,
                            const index_t out_channel) {
  // Seeded from the wall clock, so the random data differs between runs.
  srand(time(nullptr));

  // Number of weight columns: one per element of a single input item.
  const index_t flat_input_size = height * width * channels;

  OpsTestNet net;

  // Reference operator definition, writing to "Output".
  OpDefBuilder("FC", "FullyConnectedTest")
      .Input("Input")
      .Input("Weight")
      .Input("Bias")
      .Output("Output")
      .Finalize(net.NewOperatorDef());

  // Random tensors shared by both runs.
  net.AddRandomInput<DeviceType::CPU, float>(
      "Input", {batch, height, width, channels});
  net.AddRandomInput<DeviceType::CPU, float>(
      "Weight", {out_channel, flat_input_size});
  net.AddRandomInput<DeviceType::CPU, float>("Bias", {out_channel});

  // Reference run (no explicit device passed).
  net.RunOp();

  // Same inputs again, this time writing to "OutputNeon".
  OpDefBuilder("FC", "FullyConnectedTest")
      .Input("Input")
      .Input("Weight")
      .Input("Bias")
      .Output("OutputNeon")
      .Finalize(net.NewOperatorDef());

  net.RunOp(DeviceType::NEON);

  // NOTE(review): presumably converts the NHWC reference result into NCHW so
  // it can be compared element-wise with the NEON output -- confirm the
  // argument order of FillNHWCInputToNCHWInput.
  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
                                                       "Output");

  const auto &expected = *net.GetOutput("OutputExptected");
  const auto &actual = *net.GetOutput("OutputNeon");
  ExpectTensorNear<float>(expected, actual, 0.001);
}
// Runs the NEON fully-connected comparison for three layer shapes.
TEST_F(FullyConnectedOpTest, TestNEON) {
  // Each row: {batch, height, width, channels, out_channel}.
  const index_t kCases[][5] = {
      {1, 7, 7, 32, 16},
      {1, 7, 7, 512, 128},
      {1, 1, 1, 2048, 1024},
  };
  for (const auto &c : kCases) {
    FullyConnectedTestNEON(c[0], c[1], c[2], c[3], c[4]);
  }
}
}
// namespace test
}
// namespace ops
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录