Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
5ba523e7
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5ba523e7
编写于
8月 13, 2018
作者:
S
smilejames
提交者:
GitHub
8月 13, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
63250a65
2808898a
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
662 addition
and
481 deletion
+662
-481
README.md
README.md
+3
-3
src/operators/math/depthwise_conv_3x3.cpp
src/operators/math/depthwise_conv_3x3.cpp
+659
-478
未找到文件。
README.md
浏览文件 @
5ba523e7
...
...
@@ -28,13 +28,13 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平
|mobilenet arm v7|1线程|2线程|4线程|
|------------|----|-----|-----|
|麒麟960(ms)|110.586|7
2.474|49.833
|
|麒麟960(ms)|110.586|7
0.897|47.474
|
|||||
|mobilenetssd arm v7|1线程|2线程|4线程|
|麒麟960(ms)|22
4.464|142.544|96.068
|
|麒麟960(ms)|22
2.124|138.952|90.856
|
|||||
|googlenet(v1) arm v7|1线程|2线程|4线程|
|麒麟960(ms)|348.018|24
2.689
|169.998|
|麒麟960(ms)|348.018|24
0.304
|169.998|
|||||
|squeezenet arm v7|1线程|2线程|4线程|
|麒麟960(ms)|84.685|56.544|38.833|
...
...
src/operators/math/depthwise_conv_3x3.cpp
浏览文件 @
5ba523e7
...
...
@@ -613,7 +613,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
}
int
m
;
for
(
m
=
1
;
m
<
output_width
-
4
;
m
+=
4
)
{
float
*
output_ptr
=
output_data
+
m
;
float32x4_t
in0
,
in1
,
in2
,
in3
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
out0
;
...
...
@@ -637,7 +636,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
}
vst1q_f32
(
output_ptr
,
out0
);
}
for
(
m
=
1
;
(
m
+
3
)
<
output_width
-
1
;
m
=
m
+
4
)
{
for
(
m
=
1
;
(
m
+
3
)
<
output_width
-
1
;
m
+=
4
)
{
}
for
(
int
j
=
m
;
j
<
output_width
-
1
;
j
++
)
{
output_data
[
j
]
=
input_data
[
j
-
1
]
*
w10
+
input_data
[
j
]
*
w11
+
...
...
@@ -652,7 +652,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
}
}
for
(
m
=
1
;
(
m
+
3
)
<
output_width
-
1
;
m
=
m
+
4
)
{
for
(
m
=
1
;
m
<
output_width
-
4
;
m
+=
4
)
{
float
*
output_ptr
=
output_data
+
(
output_height
-
1
)
*
output_width
+
m
;
...
...
@@ -807,71 +807,60 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[l] + w22 * input_data[l + 1];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
w20 * input_data[2 * l - 2] +
w21 * input_data[2 * l - 1];
output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l -
1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1];
output_data[(l - 1) * l] =
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1]
+
w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l
+
1] +
w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1];
output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
output_data[l - 1] =
output_data[l - 1] * newscale_data[j] + newbias_data[j];
output_data[(l - 1) * l] =
output_data[0] = output_data[0] * newscale_data[j] +
newbias_data[j]; output_data[l - 1] = output_data[l - 1] *
newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] =
output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
output_data[l * l - 1] =
output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
if (if_relu) {
output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
output_data[(l - 1) * l] =
output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l];
output_data[l * l - 1] =
output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1];
output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l -
1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 :
output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1]
< 0 ? 0 : output_data[l * l - 1];
}
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] =
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
w01 * input_data[i * l + l - 1 - l] +
w10 * input_data[i * l + l - 1 - 1] +
w11 * input_data[i * l + l - 1] +
w20 * input_data[i * l + l - 1 + l - 1] +
w21 * input_data[i * l + l - 1 + l];
output_data[i * l] =
output_data[i * l] * newscale_data[j] + newbias_data[j];
output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1]
+ w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 *
input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i *
l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i
* l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 *
input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21
* input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l]
* newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[j] +
newbias_data[j];
if (if_relu) {
output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i *
l]; output_data[i * l + l - 1] =
output_data[i * l + l - 1] < 0 ? 0 :
output_data[i * l + l - 1];
output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i
* l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 0 :
output_data[i * l + l - 1];
}
}
// top 1 row and bottom 1 row
const float *input_tmp = input_data;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, out0;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + l);
const float *input_tmp_end = input_tmp + (l - 2) * l;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + l);
int c_mid = l_mid;
auto output_ptr = output_data + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + l + 4);
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1,
tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 =
vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l -
2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end +
l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid >
3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 =
vld1q_f32(input_tmp + l + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
...
...
@@ -1068,6 +1057,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
}
}
*/
#endif
}
...
...
@@ -1482,230 +1472,421 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
const
float
*
newscale_data
=
new_scale
->
data
<
float
>
();
const
float
*
newbias_data
=
new_bias
->
data
<
float
>
();
float32x4_t
vnewbias
=
vdupq_n_f32
(
0.0
);
float32x4_t
vnewscale
=
vdupq_n_f32
(
1.0
);
const
int
in_h
=
static_cast
<
int
>
(
input
->
dims
()[
2
]);
const
int
in_w
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
out_h
=
static_cast
<
int
>
(
output
->
dims
()[
2
]);
const
int
out_w
=
static_cast
<
int
>
(
output
->
dims
()[
3
]);
const
int
out_l
=
out_h
;
const
int
in_l
=
in_h
;
const
int
inhxw
=
in_h
*
in_w
;
const
int
outhxw
=
out_h
*
out_w
;
const
int
if_pad
=
in_l
-
1
==
(
out_l
-
1
)
*
2
?
1
:
0
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
const
int
c
=
static_cast
<
int
>
(
input
->
dims
()[
1
]);
const
float
*
input_row_ptr
;
float
*
output_row_ptr
;
const
int
input_channel
=
static_cast
<
int
>
(
input
->
dims
()[
1
]);
const
int
w_times
=
(
out_w
-
2
)
/
3
;
const
int
input_height
=
static_cast
<
int
>
(
input
->
dims
()[
2
]);
const
int
input_width
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
output_height
=
static_cast
<
int
>
(
output
->
dims
()[
2
]);
const
int
output_width
=
static_cast
<
int
>
(
output
->
dims
()[
3
]);
const
int
inhxw
=
input_height
*
input_width
;
const
int
outhxw
=
output_height
*
output_width
;
float32x4x2_t
input_buff_mid
{},
input_buff_bottom
[
w_times
+
1
];
float32x4_t
elewise_res0
,
elewise_res1
,
elewise_res2
,
res3
;
int
out2in_mid
;
float32x4_t
vnewbias
=
vdupq_n_f32
(
0.0
);
float32x4_t
vnewscale
=
vdupq_n_f32
(
1.0
);
float32x4_t
zero
=
vdupq_n_f32
(
0.0
);
for
(
int
b
=
batch_size
;
b
>
0
;
--
b
)
{
const
float
*
filter_data_tmp
=
filter_data
;
for
(
int
j
=
0
;
j
<
c
;
++
j
)
{
auto
output_data_tmp
=
output_data
+
j
*
out_h
*
out_w
;
auto
input_data_tmp
=
input_data
+
j
*
in_h
*
in_w
;
auto
input_const
=
input_data_tmp
;
vnewbias
=
vdupq_n_f32
(
newbias_data
[
j
]);
vnewscale
=
vdupq_n_f32
(
newscale_data
[
j
]);
float
w00
=
filter_data_tmp
[
0
];
float
w01
=
filter_data_tmp
[
1
];
float
w02
=
filter_data_tmp
[
2
];
float
w10
=
filter_data_tmp
[
3
];
float
w11
=
filter_data_tmp
[
4
];
float
w12
=
filter_data_tmp
[
5
];
float
w20
=
filter_data_tmp
[
6
];
float
w21
=
filter_data_tmp
[
7
];
float
w22
=
filter_data_tmp
[
8
];
int
h_mid
=
0
;
for
(;
h_mid
<
out_h
-
1
;
h_mid
++
)
{
input_row_ptr
=
input_data_tmp
+
1
+
h_mid
*
2
*
in_w
;
output_row_ptr
=
output_data_tmp
+
1
+
h_mid
*
out_w
;
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
filter_data
=
filter
->
data
<
float
>
();
for
(
int
c
=
0
;
c
<
input_channel
;
c
++
)
{
vnewbias
=
vdupq_n_f32
(
newbias_data
[
c
]);
vnewscale
=
vdupq_n_f32
(
newscale_data
[
c
]);
for
(
int
w4
=
0
;
w4
<
w_times
+
1
;
w4
++
)
{
if
(
h_mid
==
0
)
{
elewise_res1
=
zero
;
elewise_res0
=
zero
;
elewise_res2
=
zero
;
}
else
{
elewise_res1
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
1
],
w01
);
elewise_res0
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
0
],
w00
);
elewise_res2
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
0
],
w02
);
}
input_buff_mid
=
vld2q_f32
(
input_row_ptr
);
input_buff_bottom
[
w4
]
=
vld2q_f32
(
input_row_ptr
+
in_w
);
float
w00
=
filter_data
[
0
];
float
w01
=
filter_data
[
1
];
float
w02
=
filter_data
[
2
];
float
w10
=
filter_data
[
3
];
float
w11
=
filter_data
[
4
];
float
w12
=
filter_data
[
5
];
float
w20
=
filter_data
[
6
];
float
w21
=
filter_data
[
7
];
float
w22
=
filter_data
[
8
];
elewise_res1
=
vmlaq_n_f32
(
elewise_res1
,
input_buff_mid
.
val
[
1
],
w11
);
elewise_res0
=
vmlaq_n_f32
(
elewise_res0
,
input_buff_mid
.
val
[
0
],
w10
);
elewise_res2
=
vmlaq_n_f32
(
elewise_res2
,
input_buff_mid
.
val
[
0
],
w12
);
int
m
;
for
(
m
=
1
;
m
<
output_width
-
2
;
m
=
m
+
3
)
{
float
*
output_ptr
=
output_data
+
m
;
float32x4x2_t
input_buff_mid
{},
input_buff_bottom
{};
float32x4_t
in0
,
in1
,
in2
,
in3
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
out0
;
input_buff_mid
=
vld2q_f32
(
input_data
+
(
2
*
m
-
1
));
input_buff_bottom
=
vld2q_f32
(
input_data
+
input_width
+
(
2
*
m
-
1
));
elewise_res1
=
vmlaq_n_f32
(
elewise_res1
,
input_buff_bottom
[
w4
].
val
[
1
],
w21
);
elewise_res0
=
vmlaq_n_f32
(
elewise_res0
,
input_buff_bottom
[
w4
].
val
[
0
],
w20
);
elewise_res2
=
vmlaq_n_f32
(
elewise_res2
,
input_buff_bottom
[
w4
].
val
[
0
],
w22
);
in0
=
input_buff_mid
.
val
[
0
];
tmp0
=
input_buff_mid
.
val
[
1
];
tmp1
=
vextq_f32
(
in0
,
zero
,
1
);
res3
=
vaddq_f32
(
vextq_f32
(
elewise_res2
,
zero
,
1
),
vaddq_f32
(
elewise_res0
,
elewise_res1
))
;
res3
=
vmlaq_f32
(
vnewbias
,
vnewscale
,
res3
);
in2
=
input_buff_bottom
.
val
[
0
];
tmp2
=
input_buff_bottom
.
val
[
1
]
;
tmp3
=
vextq_f32
(
in2
,
zero
,
1
);
out0
=
vmulq_n_f32
(
in0
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w22
);
out0
=
vmlaq_f32
(
vnewbias
,
vnewscale
,
out0
);
if
(
if_relu
)
{
res3
=
vmaxq_f32
(
res3
,
zero
);
out0
=
vmaxq_f32
(
out0
,
zero
);
}
vst1q_f32
(
output_row_ptr
,
res3
);
input_row_ptr
+=
6
;
output_row_ptr
+=
3
;
vst1q_f32
(
output_ptr
,
out0
);
}
for
(
m
=
1
;
m
<
output_width
-
2
;
m
+=
3
)
{
}
for
(
int
j
=
m
;
j
<
output_width
;
j
++
)
{
output_data
[
j
]
=
input_data
[
2
*
j
-
1
]
*
w10
+
input_data
[
2
*
j
]
*
w11
+
input_data
[
2
*
j
+
1
]
*
w12
+
input_data
[
2
*
j
-
1
+
input_width
]
*
w20
+
input_data
[
2
*
j
+
input_width
]
*
w21
+
input_data
[
2
*
j
+
1
+
input_width
]
*
w22
;
output_data
[
j
]
=
newscale_data
[
c
]
*
output_data
[
j
]
+
newbias_data
[
c
];
if
(
if_relu
)
{
output_data
[
j
]
=
output_data
[
j
]
<
0
?
0
:
output_data
[
j
];
}
}
clock
();
input_row_ptr
=
input_data_tmp
+
1
+
h_mid
*
2
*
in_w
;
output_row_ptr
=
output_data_tmp
+
1
+
h_mid
*
out_w
;
#pragma omp parallel for
for
(
int
w4
=
0
;
w4
<
w_times
+
1
;
w4
++
)
{
elewise_res1
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
1
],
w01
);
elewise_res0
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
0
],
w00
);
elewise_res2
=
vmulq_n_f32
(
input_buff_bottom
[
w4
].
val
[
0
],
w02
);
for
(
int
i
=
1
;
i
<
output_height
;
i
+=
1
)
{
for
(
int
m
=
1
;
m
<
output_width
-
2
;
m
+=
3
)
{
float
*
output_ptr
=
output_data
+
i
*
output_width
+
m
;
float32x4x2_t
input_buff_top
{},
input_buff_mid
{},
input_buff_bottom
{};
float32x4_t
in0
,
in1
,
in2
,
in3
,
in4
,
in5
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
out0
;
input_buff_top
=
vld2q_f32
(
input_data
+
(
2
*
i
-
1
)
*
input_width
+
(
2
*
m
-
1
));
input_buff_mid
=
vld2q_f32
(
input_data
+
(
2
*
i
)
*
input_width
+
(
2
*
m
-
1
));
input_buff_bottom
=
vld2q_f32
(
input_data
+
(
2
*
i
+
1
)
*
input_width
+
(
2
*
m
-
1
));
input_buff_mid
=
vld2q_f32
(
input_row_ptr
);
input_buff_bottom
[
w4
]
=
vld2q_f32
(
input_row_ptr
+
in_w
);
in0
=
input_buff_top
.
val
[
0
];
tmp0
=
input_buff_top
.
val
[
1
];
tmp1
=
vextq_f32
(
in0
,
zero
,
1
);
elewise_res1
=
vmlaq_n_f32
(
elewise_res1
,
input_buff_mid
.
val
[
1
],
w11
)
;
elewise_res0
=
vmlaq_n_f32
(
elewise_res0
,
input_buff_mid
.
val
[
0
],
w10
)
;
elewise_res2
=
vmlaq_n_f32
(
elewise_res2
,
input_buff_mid
.
val
[
0
],
w12
);
in2
=
input_buff_mid
.
val
[
0
]
;
tmp2
=
input_buff_mid
.
val
[
1
]
;
tmp3
=
vextq_f32
(
in2
,
zero
,
1
);
if
(
!
if_pad
)
{
elewise_res1
=
vmlaq_n_f32
(
elewise_res1
,
input_buff_bottom
[
w4
].
val
[
1
],
w21
);
elewise_res0
=
vmlaq_n_f32
(
elewise_res0
,
input_buff_bottom
[
w4
].
val
[
0
],
w20
);
elewise_res2
=
vmlaq_n_f32
(
elewise_res2
,
input_buff_bottom
[
w4
].
val
[
0
],
w22
);
}
res3
=
vaddq_f32
(
vextq_f32
(
elewise_res2
,
zero
,
1
),
vaddq_f32
(
elewise_res0
,
elewise_res1
));
res3
=
vmlaq_f32
(
vnewbias
,
vnewscale
,
res3
);
in4
=
input_buff_bottom
.
val
[
0
];
tmp4
=
input_buff_bottom
.
val
[
1
];
tmp5
=
vextq_f32
(
in4
,
zero
,
1
);
out0
=
vmulq_n_f32
(
in0
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vmlaq_f32
(
vnewbias
,
vnewscale
,
out0
);
if
(
if_relu
)
{
res3
=
vmaxq_f32
(
res3
,
zero
);
}
if
((
w4
!=
w_times
))
{
vst1q_f32
(
output_row_ptr
,
res3
);
}
else
{
if
(
out_l
-
2
-
w_times
*
3
==
1
)
{
vst1q_lane_f32
(
output_row_ptr
,
res3
,
0
);
}
else
if
(
out_l
-
2
-
w_times
*
3
==
2
)
{
vst1q_lane_f32
(
output_row_ptr
,
res3
,
0
);
vst1q_lane_f32
(
output_row_ptr
+
1
,
res3
,
1
);
out0
=
vmaxq_f32
(
out0
,
zero
);
}
vst1q_f32
(
output_ptr
,
out0
);
}
in
put_row_ptr
+=
6
;
output_row_ptr
+=
3
;
in
t
m
;
for
(
m
=
1
;
m
<
output_width
-
2
;
m
+=
3
)
{
}
output_data_tmp
[
0
]
=
input_const
[
0
]
*
w11
+
input_const
[
1
]
*
w12
+
input_const
[
in_l
]
*
w21
+
input_const
[
in_l
+
1
]
*
w22
;
out2in_mid
=
(
out_l
-
1
)
*
2
;
output_data_tmp
[
out_l
-
1
]
=
w10
*
input_const
[
out2in_mid
-
1
]
+
w11
*
input_const
[
out2in_mid
]
+
w20
*
input_const
[
out2in_mid
+
in_w
-
1
]
+
w21
*
input_const
[
out2in_mid
+
in_w
]
+
(
1
-
if_pad
)
*
(
w12
*
input_const
[
out2in_mid
+
1
]
+
w22
*
input_const
[
out2in_mid
+
in_w
+
1
]);
out2in_mid
=
(
out_l
-
1
)
*
2
*
in_w
;
output_data_tmp
[
out_l
*
(
out_l
-
1
)]
=
w01
*
input_const
[
out2in_mid
-
in_w
]
+
w02
*
input_const
[
out2in_mid
-
in_w
+
1
]
+
w11
*
input_const
[
out2in_mid
]
+
w12
*
input_const
[
out2in_mid
+
1
]
+
(
1
-
if_pad
)
*
(
w21
*
input_const
[
out2in_mid
+
in_w
]
+
w22
*
input_const
[
out2in_mid
+
in_w
+
1
]);
out2in_mid
=
(
out_l
-
1
)
*
2
*
in_w
+
(
out_l
-
1
)
*
2
;
output_data_tmp
[
out_l
*
out_l
-
1
]
=
w00
*
input_const
[
out2in_mid
-
in_w
-
1
]
+
w01
*
input_const
[
out2in_mid
-
in_w
]
+
w10
*
input_const
[
out2in_mid
-
1
]
+
w11
*
input_const
[
out2in_mid
]
+
(
1
-
if_pad
)
*
(
w20
*
input_const
[
out2in_mid
+
in_w
-
1
]
+
w21
*
input_const
[
out2in_mid
+
in_w
]
+
w02
*
input_const
[
out2in_mid
-
in_w
+
1
]
+
w12
*
input_const
[
out2in_mid
+
1
]
+
w22
*
input_const
[
out2in_mid
+
in_w
+
1
]);
output_data_tmp
[
0
]
=
output_data_tmp
[
0
]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
output_data_tmp
[
out_l
-
1
]
=
output_data_tmp
[
out_l
-
1
]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
output_data_tmp
[
out_l
*
(
out_l
-
1
)]
=
output_data_tmp
[
out_l
*
(
out_l
-
1
)]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
output_data_tmp
[
out_l
*
out_l
-
1
]
=
output_data_tmp
[
out_l
*
out_l
-
1
]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
for
(
int
j
=
m
;
j
<
output_width
;
j
++
)
{
output_data
[
i
*
output_width
+
j
]
=
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
-
1
]
*
w00
+
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
]
*
w01
+
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
+
1
]
*
w02
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
-
1
]
*
w10
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
]
*
w11
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
+
1
]
*
w12
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
-
1
]
*
w20
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
]
*
w21
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
+
1
]
*
w22
;
output_data
[
i
*
output_width
+
j
]
=
newscale_data
[
c
]
*
output_data
[
i
*
output_width
+
j
]
+
newbias_data
[
c
];
if
(
if_relu
)
{
output_data_tmp
[
0
]
=
output_data_tmp
[
0
]
<
0
?
0
:
output_data_tmp
[
0
];
output_data_tmp
[
out_l
-
1
]
=
output_data_tmp
[
out_l
-
1
]
<
0
?
0
:
output_data_tmp
[
out_l
-
1
];
output_data_tmp
[
out_l
*
(
out_l
-
1
)]
=
output_data_tmp
[
out_l
*
(
out_l
-
1
)]
<
0
?
0
:
output_data_tmp
[
out_l
*
(
out_l
-
1
)];
output_data_tmp
[
out_l
*
out_l
-
1
]
=
output_data_tmp
[
out_l
*
out_l
-
1
]
<
0
output_data
[
i
*
output_width
+
j
]
=
output_data
[
i
*
output_width
+
j
]
<
0
?
0
:
output_data_tmp
[
out_l
*
out_l
-
1
];
:
output_data
[
i
*
output_width
+
j
];
}
for
(
int
i
=
1
;
i
<
out_h
-
1
;
i
++
)
{
out2in_mid
=
i
*
2
*
in_w
;
output_data_tmp
[
i
*
out_l
]
=
w01
*
input_const
[
out2in_mid
-
in_w
]
+
w02
*
input_const
[
out2in_mid
-
in_w
+
1
]
+
w11
*
input_const
[
out2in_mid
]
+
w12
*
input_const
[
out2in_mid
+
1
]
+
w21
*
input_const
[
out2in_mid
+
in_w
]
+
w22
*
input_const
[
out2in_mid
+
in_w
+
1
];
}
}
output_data
[
0
]
=
input_data
[
0
]
*
w11
+
input_data
[
1
]
*
w12
+
input_data
[
input_height
]
*
w21
+
input_data
[
input_height
+
1
]
*
w22
;
out2in_mid
=
i
*
2
*
in_w
+
(
out_l
-
1
)
*
2
;
output_data_tmp
[
i
*
out_l
+
out_l
-
1
]
=
w00
*
input_const
[
out2in_mid
-
in_w
-
1
]
+
w01
*
input_const
[
out2in_mid
-
in_w
]
+
w10
*
input_const
[
out2in_mid
-
1
]
+
w11
*
input_const
[
out2in_mid
]
+
w20
*
input_const
[
out2in_mid
+
in_w
-
1
]
+
w21
*
input_const
[
out2in_mid
+
in_w
]
+
(
1
-
if_pad
)
*
(
w02
*
input_const
[
out2in_mid
-
in_w
+
1
]
+
w12
*
input_const
[
out2in_mid
+
1
]
+
w22
*
input_const
[
out2in_mid
+
in_w
+
1
]);
output_data_tmp
[
i
*
out_l
]
=
output_data_tmp
[
i
*
out_l
]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
output_data_tmp
[
i
*
out_l
+
out_l
-
1
]
=
output_data_tmp
[
i
*
out_l
+
out_l
-
1
]
*
newscale_data
[
j
]
+
newbias_data
[
j
];
output_data
[
0
]
=
newscale_data
[
c
]
*
output_data
[
0
]
+
newbias_data
[
c
];
if
(
if_relu
)
{
output_data_tmp
[
i
*
out_l
]
=
output_data_tmp
[
i
*
out_l
]
<
0
?
0
:
output_data_tmp
[
i
*
out_l
];
output_data_tmp
[
i
*
out_l
+
out_l
-
1
]
=
output_data_tmp
[
i
*
out_l
+
out_l
-
1
]
<
0
output_data
[
0
]
=
output_data
[
0
]
<
0
?
0
:
output_data
[
0
];
}
for
(
int
i
=
1
;
i
<
output_height
;
i
++
)
{
output_data
[
i
*
output_width
]
=
input_data
[(
2
*
i
-
1
)
*
input_width
]
*
w01
+
input_data
[(
2
*
i
-
1
)
*
input_width
+
1
]
*
w02
+
input_data
[(
2
*
i
)
*
input_width
]
*
w11
+
input_data
[(
2
*
i
)
*
input_width
+
1
]
*
w12
+
input_data
[(
2
*
i
+
1
)
*
input_width
]
*
w21
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
1
]
*
w22
;
output_data
[
i
*
output_width
]
=
newscale_data
[
c
]
*
output_data
[
i
*
output_width
]
+
newbias_data
[
c
];
if
(
if_relu
)
{
output_data
[
i
*
output_width
]
=
output_data
[
i
*
output_width
]
<
0
?
0
:
output_data_tmp
[
i
*
out_l
+
out_l
-
1
];
:
output_data
[
i
*
output_width
];
}
}
filter_data_tmp
+=
9
;
input_data
=
input_data
+
inhxw
;
output_data
=
output_data
+
outhxw
;
filter_data
=
filter_data
+
9
;
}
input_data
+=
inhxw
*
c
;
output_data
+=
outhxw
*
c
;
}
// const float *input_data = input->data<float>();
// const float *filter_data = filter->data<float>();
// float *output_data = output->data<float>();
// const float *newscale_data = new_scale->data<float>();
// const float *newbias_data = new_bias->data<float>();
//
// float32x4_t vnewbias = vdupq_n_f32(0.0);
// float32x4_t vnewscale = vdupq_n_f32(1.0);
//
// const int in_h = static_cast<int>(input->dims()[2]);
// const int in_w = static_cast<int>(input->dims()[3]);
// const int out_h = static_cast<int>(output->dims()[2]);
// const int out_w = static_cast<int>(output->dims()[3]);
// const int out_l = out_h;
// const int in_l = in_h;
// const int inhxw = in_h * in_w;
// const int outhxw = out_h * out_w;
// const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
// const int batch_size = static_cast<int>(input->dims()[0]);
// const int c = static_cast<int>(input->dims()[1]);
// const float *input_row_ptr;
// float *output_row_ptr;
//
// const int w_times = (out_w - 2) / 3;
//
// float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1];
// float32x4_t elewise_res0, elewise_res1, elewise_res2, res3;
// int out2in_mid;
// float32x4_t zero = vdupq_n_f32(0.0);
// for (int b = batch_size; b > 0; --b) {
// const float *filter_data_tmp = filter_data;
// for (int j = 0; j < c; ++j) {
// auto output_data_tmp = output_data + j * out_h * out_w;
// auto input_data_tmp = input_data + j * in_h * in_w;
// auto input_const = input_data_tmp;
//
// vnewbias = vdupq_n_f32(newbias_data[j]);
// vnewscale = vdupq_n_f32(newscale_data[j]);
//
// float w00 = filter_data_tmp[0];
// float w01 = filter_data_tmp[1];
// float w02 = filter_data_tmp[2];
// float w10 = filter_data_tmp[3];
// float w11 = filter_data_tmp[4];
// float w12 = filter_data_tmp[5];
// float w20 = filter_data_tmp[6];
// float w21 = filter_data_tmp[7];
// float w22 = filter_data_tmp[8];
//
// int h_mid = 0;
//
// for (; h_mid < out_h - 1; h_mid++) {
// input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
// output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
//
// for (int w4 = 0; w4 < w_times + 1; w4++) {
// if (h_mid == 0) {
// elewise_res1 = zero;
// elewise_res0 = zero;
// elewise_res2 = zero;
// } else {
// elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
// elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
// elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
// }
// input_buff_mid = vld2q_f32(input_row_ptr);
// input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
//
// elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1],
// w11); elewise_res0 = vmlaq_n_f32(elewise_res0,
// input_buff_mid.val[0], w10); elewise_res2 =
// vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
//
// elewise_res1 =
// vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1],
// w21);
// elewise_res0 =
// vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0],
// w20);
// elewise_res2 =
// vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0],
// w22);
//
// res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
// vaddq_f32(elewise_res0, elewise_res1));
// res3 = vmlaq_f32(vnewbias, vnewscale, res3);
//
// if (if_relu) {
// res3 = vmaxq_f32(res3, zero);
// }
// vst1q_f32(output_row_ptr, res3);
//
// input_row_ptr += 6;
// output_row_ptr += 3;
// }
// }
// clock();
//
// input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
// output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
//
// for (int w4 = 0; w4 < w_times + 1; w4++) {
// elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
// elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
// elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
//
// input_buff_mid = vld2q_f32(input_row_ptr);
// input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
//
// elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1],
// w11); elewise_res0 = vmlaq_n_f32(elewise_res0,
// input_buff_mid.val[0], w10); elewise_res2 =
// vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
//
// if (!if_pad) {
// elewise_res1 =
// vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1],
// w21);
// elewise_res0 =
// vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0],
// w20);
// elewise_res2 =
// vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0],
// w22);
// }
// res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
// vaddq_f32(elewise_res0, elewise_res1));
// res3 = vmlaq_f32(vnewbias, vnewscale, res3);
//
// if (if_relu) {
// res3 = vmaxq_f32(res3, zero);
// }
// if ((w4 != w_times)) {
// vst1q_f32(output_row_ptr, res3);
// } else {
// if (out_l - 2 - w_times * 3 == 1) {
// vst1q_lane_f32(output_row_ptr, res3, 0);
// } else if (out_l - 2 - w_times * 3 == 2) {
// vst1q_lane_f32(output_row_ptr, res3, 0);
// vst1q_lane_f32(output_row_ptr + 1, res3, 1);
// }
// }
// input_row_ptr += 6;
// output_row_ptr += 3;
// }
//
// output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
// input_const[in_l] * w21 +
// input_const[in_l + 1] * w22;
//
// out2in_mid = (out_l - 1) * 2;
// output_data_tmp[out_l - 1] =
// w10 * input_const[out2in_mid - 1] + w11 *
// input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w -
// 1] + w21 * input_const[out2in_mid + in_w] + (1 - if_pad) * (w12
// * input_const[out2in_mid + 1] +
// w22 * input_const[out2in_mid + in_w + 1]);
//
// out2in_mid = (out_l - 1) * 2 * in_w;
//
// output_data_tmp[out_l * (out_l - 1)] =
// w01 * input_const[out2in_mid - in_w] +
// w02 * input_const[out2in_mid - in_w + 1] +
// w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid +
// 1] + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
// w22 * input_const[out2in_mid + in_w + 1]);
// out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
//
// output_data_tmp[out_l * out_l - 1] =
// w00 * input_const[out2in_mid - in_w - 1] +
// w01 * input_const[out2in_mid - in_w] +
// w10 * input_const[out2in_mid - 1] + w11 *
// input_const[out2in_mid] + (1 - if_pad) * (w20 *
// input_const[out2in_mid + in_w - 1] +
// w21 * input_const[out2in_mid + in_w] +
// w02 * input_const[out2in_mid - in_w + 1] +
// w12 * input_const[out2in_mid + 1] +
// w22 * input_const[out2in_mid + in_w + 1]);
// output_data_tmp[0] =
// output_data_tmp[0] * newscale_data[j] + newbias_data[j];
// output_data_tmp[out_l - 1] =
// output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j];
// output_data_tmp[out_l * (out_l - 1)] =
// output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] +
// newbias_data[j];
// output_data_tmp[out_l * out_l - 1] =
// output_data_tmp[out_l * out_l - 1] * newscale_data[j] +
// newbias_data[j];
// if (if_relu) {
// output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 :
// output_data_tmp[0]; output_data_tmp[out_l - 1] =
// output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l -
// 1];
// output_data_tmp[out_l * (out_l - 1)] =
// output_data_tmp[out_l * (out_l - 1)] < 0
// ? 0
// : output_data_tmp[out_l * (out_l - 1)];
// output_data_tmp[out_l * out_l - 1] =
// output_data_tmp[out_l * out_l - 1] < 0
// ? 0
// : output_data_tmp[out_l * out_l - 1];
// }
// for (int i = 1; i < out_h - 1; i++) {
// out2in_mid = i * 2 * in_w;
// output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w]
// +
// w02 * input_const[out2in_mid - in_w +
// 1] + w11 * input_const[out2in_mid] +
// w12 * input_const[out2in_mid + 1] +
// w21 * input_const[out2in_mid + in_w]
// + w22 * input_const[out2in_mid + in_w
// + 1];
//
// out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
// output_data_tmp[i * out_l + out_l - 1] =
// w00 * input_const[out2in_mid - in_w - 1] +
// w01 * input_const[out2in_mid - in_w] +
// w10 * input_const[out2in_mid - 1] + w11 *
// input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w
// - 1] + w21 * input_const[out2in_mid + in_w] + (1 - if_pad) *
// (w02 * input_const[out2in_mid - in_w + 1] +
// w12 * input_const[out2in_mid + 1] +
// w22 * input_const[out2in_mid + in_w + 1]);
// output_data_tmp[i * out_l] =
// output_data_tmp[i * out_l] * newscale_data[j] +
// newbias_data[j];
// output_data_tmp[i * out_l + out_l - 1] =
// output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
// newbias_data[j];
// if (if_relu) {
// output_data_tmp[i * out_l] =
// output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i *
// out_l];
// output_data_tmp[i * out_l + out_l - 1] =
// output_data_tmp[i * out_l + out_l - 1] < 0
// ? 0
// : output_data_tmp[i * out_l + out_l - 1];
// }
// }
// filter_data_tmp += 9;
// }
// input_data += inhxw * c;
// output_data += outhxw * c;
// }
#endif
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录