Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f00c4112
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f00c4112
编写于
8月 25, 2017
作者:
H
hedaoyuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Neon depthwise conv with filterSize = 4 and stride = 2.
上级
6dcff9a4
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
121 addition
and
1 deletion
+121
-1
paddle/function/neon/NeonDepthwiseConv.cpp
paddle/function/neon/NeonDepthwiseConv.cpp
+121
-1
未找到文件。
paddle/function/neon/NeonDepthwiseConv.cpp
浏览文件 @
f00c4112
...
...
@@ -364,6 +364,116 @@ struct DepthwiseConvKernel<4, 1> {
}
};
/**
* Each step calculates four elements of the output.
*/
template
<
>
struct
DepthwiseConvKernel
<
4
,
2
>
{
static
void
run
(
const
float
*
inputData
,
const
float
*
filterData
,
int
inputHeight
,
int
inputWidth
,
int
outputChannels
,
int
outputHeight
,
int
outputWidth
,
int
filterMultiplier
,
float
*
outputData
)
{
const
int
steps
=
outputWidth
>>
2
;
const
int
remain
=
outputWidth
&
3
;
for
(
int
c
=
0
;
c
<
outputChannels
;
c
++
,
filterData
+=
16
)
{
// Load the filters
float32x4_t
k
[
4
];
k
[
0
]
=
vld1q_f32
(
filterData
);
k
[
1
]
=
vld1q_f32
(
filterData
+
4
);
k
[
2
]
=
vld1q_f32
(
filterData
+
8
);
k
[
3
]
=
vld1q_f32
(
filterData
+
12
);
const
float
*
start
=
inputData
+
(
c
/
filterMultiplier
)
*
(
inputHeight
*
inputWidth
);
float32x4_t
input
[
4
][
4
];
for
(
int
h
=
0
;
h
<
outputHeight
;
h
++
)
{
const
float
*
r0
=
start
+
2
*
h
*
inputWidth
;
const
float
*
r1
=
start
+
(
2
*
h
+
1
)
*
inputWidth
;
const
float
*
r2
=
start
+
(
2
*
h
+
2
)
*
inputWidth
;
const
float
*
r3
=
start
+
(
2
*
h
+
3
)
*
inputWidth
;
for
(
int
s
=
0
;
s
<
steps
;
s
++
)
{
// Load the inputs
float32x4x2_t
data1
;
float32x4x2_t
data2
;
data1
=
vld2q_f32
(
r0
);
data2
=
vld2q_f32
(
r0
+
8
);
input
[
0
][
0
]
=
data1
.
val
[
0
];
input
[
0
][
1
]
=
data1
.
val
[
1
];
input
[
0
][
2
]
=
vextq_f32
(
data1
.
val
[
0
],
data2
.
val
[
0
],
1
);
input
[
0
][
3
]
=
vextq_f32
(
data1
.
val
[
1
],
data2
.
val
[
1
],
1
);
data1
=
vld2q_f32
(
r1
);
data2
=
vld2q_f32
(
r1
+
8
);
input
[
1
][
0
]
=
data1
.
val
[
0
];
input
[
1
][
1
]
=
data1
.
val
[
1
];
input
[
1
][
2
]
=
vextq_f32
(
data1
.
val
[
0
],
data2
.
val
[
0
],
1
);
input
[
1
][
3
]
=
vextq_f32
(
data1
.
val
[
1
],
data2
.
val
[
1
],
1
);
data1
=
vld2q_f32
(
r2
);
data2
=
vld2q_f32
(
r2
+
8
);
input
[
2
][
0
]
=
data1
.
val
[
0
];
input
[
2
][
1
]
=
data1
.
val
[
1
];
input
[
2
][
2
]
=
vextq_f32
(
data1
.
val
[
0
],
data2
.
val
[
0
],
1
);
input
[
2
][
3
]
=
vextq_f32
(
data1
.
val
[
1
],
data2
.
val
[
1
],
1
);
data1
=
vld2q_f32
(
r3
);
data2
=
vld2q_f32
(
r3
+
8
);
input
[
3
][
0
]
=
data1
.
val
[
0
];
input
[
3
][
1
]
=
data1
.
val
[
1
];
input
[
3
][
2
]
=
vextq_f32
(
data1
.
val
[
0
],
data2
.
val
[
0
],
1
);
input
[
3
][
3
]
=
vextq_f32
(
data1
.
val
[
1
],
data2
.
val
[
1
],
1
);
float32x4_t
tmp1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
tmp2
=
vdupq_n_f32
(
0.
f
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
0
],
k
[
0
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
1
],
k
[
0
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
2
],
k
[
0
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
3
],
k
[
0
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
0
],
k
[
1
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
1
],
k
[
1
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
2
],
k
[
1
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
3
],
k
[
1
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
0
],
k
[
2
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
1
],
k
[
2
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
2
],
k
[
2
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
3
],
k
[
2
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
0
],
k
[
3
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
1
],
k
[
3
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
2
],
k
[
3
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
3
],
k
[
3
],
3
);
tmp1
=
vaddq_f32
(
tmp1
,
tmp2
);
vst1q_f32
(
outputData
,
tmp1
);
r0
+=
8
;
r1
+=
8
;
r2
+=
8
;
r3
+=
8
;
outputData
+=
4
;
}
for
(
int
r
=
0
;
r
<
remain
;
r
++
)
{
float32x4_t
i0
=
vld1q_f32
(
r0
);
float32x4_t
i1
=
vld1q_f32
(
r1
);
float32x4_t
i2
=
vld1q_f32
(
r2
);
float32x4_t
i3
=
vld1q_f32
(
r3
);
*
outputData
=
conv4x4
(
i0
,
i1
,
i2
,
i3
,
k
[
0
],
k
[
1
],
k
[
2
],
k
[
3
]);
r0
+=
2
;
r1
+=
2
;
r2
+=
2
;
r3
+=
2
;
outputData
++
;
}
}
}
}
};
template
<
DeviceType
Device
>
class
NeonDepthwiseConvFunction
:
public
ConvFunctionBase
{
public:
...
...
@@ -449,7 +559,7 @@ public:
outputWidth
,
filterMultiplier
,
outputData
);
}
else
if
(
filterWidth
==
4
)
{
}
else
if
(
filterWidth
==
4
&&
strideH
()
==
1
)
{
DepthwiseConvKernel
<
4
,
1
>::
run
(
inputPadding
,
filterData
,
inputHeight
,
...
...
@@ -459,6 +569,16 @@ public:
outputWidth
,
filterMultiplier
,
outputData
);
}
else
if
(
filterWidth
==
4
&&
strideH
()
==
2
)
{
DepthwiseConvKernel
<
4
,
2
>::
run
(
inputPadding
,
filterData
,
inputHeight
,
inputWidth
,
outputChannels
,
outputHeight
,
outputWidth
,
filterMultiplier
,
outputData
);
}
inputPadding
+=
inputChannels
*
inputHeight
*
inputWidth
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录