Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
60695ebd
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
60695ebd
编写于
5月 21, 2018
作者:
B
Bin Li
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize conv1x15 conv15x1
上级
97f820c2
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
371 addition
and
15 deletion
+371
-15
mace/kernels/arm/conv_2d_neon.h
mace/kernels/arm/conv_2d_neon.h
+12
-0
mace/kernels/arm/conv_2d_neon_15x1.cc
mace/kernels/arm/conv_2d_neon_15x1.cc
+163
-0
mace/kernels/arm/conv_2d_neon_1x15.cc
mace/kernels/arm/conv_2d_neon_1x15.cc
+149
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+36
-14
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+5
-1
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+6
-0
未找到文件。
mace/kernels/arm/conv_2d_neon.h
浏览文件 @
60695ebd
...
...
@@ -65,6 +65,18 @@ extern void Conv2dNeonK7x7S3(const float *input,
const
index_t
*
out_shape
,
float
*
output
);
// Stride-1 convolution kernel for 1x15 filters (filter height 1, width 15),
// defined in mace/kernels/arm/conv_2d_neon_1x15.cc.
// The definition indexes in_shape/out_shape as {batch, channels, height,
// width}, reads 15 consecutive filter taps per (output channel, input channel)
// pair, and accumulates into `output` instead of overwriting it.
extern
void
Conv2dNeonK1x15S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
// Stride-1 convolution kernel for 15x1 filters (filter height 15, width 1),
// defined in mace/kernels/arm/conv_2d_neon_15x1.cc.
// The definition indexes in_shape/out_shape as {batch, channels, height,
// width}, reads 15 consecutive filter taps per (output channel, input channel)
// pair, and accumulates into `output` instead of overwriting it.
extern
void
Conv2dNeonK15x1S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
);
}
// namespace kernels
}
// namespace mace
...
...
mace/kernels/arm/conv_2d_neon_15x1.cc
0 → 100644
浏览文件 @
60695ebd
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
inline
void
Conv2dCPUK15x1Calc
(
const
float
*
in_ptr
,
const
float
*
filter_ptr
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
out_width
,
const
index_t
w
,
const
index_t
tile_width
,
const
index_t
out_image_size
,
float
*
out_ptr
,
const
index_t
io
,
const
int
stride
)
{
for
(
index_t
ih
=
0
;
ih
<
out_height
;
++
ih
)
{
for
(
index_t
iw
=
0
;
iw
<
tile_width
&&
w
+
iw
<
out_width
;
++
iw
)
{
for
(
int
i
=
0
;
i
<
15
;
++
i
)
{
for
(
int
j
=
0
;
j
<
1
;
++
j
)
{
out_ptr
[
io
*
out_image_size
+
ih
*
out_width
+
w
+
iw
]
+=
in_ptr
[(
ih
*
stride
+
i
)
*
in_width
+
((
w
+
iw
)
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
1
+
j
];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void
Conv2dNeonK15x1S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
float
*
output
)
{
const
index_t
in_image_size
=
in_shape
[
2
]
*
in_shape
[
3
];
const
index_t
out_image_size
=
out_shape
[
2
]
*
out_shape
[
3
];
const
index_t
in_batch_size
=
in_shape
[
1
]
*
in_image_size
;
const
index_t
out_batch_size
=
out_shape
[
1
]
*
out_image_size
;
const
index_t
tile_width
=
out_shape
[
1
]
<
4
?
RoundUpDiv4
(
out_shape
[
3
])
:
out_shape
[
3
];
#pragma omp parallel for collapse(3)
for
(
index_t
b
=
0
;
b
<
out_shape
[
0
];
++
b
)
{
for
(
index_t
m
=
0
;
m
<
out_shape
[
1
];
++
m
)
{
for
(
index_t
w
=
0
;
w
<
out_shape
[
3
];
w
+=
tile_width
)
{
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
float
*
out_ptr_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
const
float
*
in_ptr_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr
=
filter
+
m
*
in_channels
*
15
+
c
*
15
;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load filter (1 outch x 1 height x 4 width) */
float32x4_t
vf0
,
vf1
,
vf2
,
vf3
;
vf0
=
vld1q_f32
(
filter_ptr
);
vf1
=
vld1q_f32
(
filter_ptr
+
4
);
vf2
=
vld1q_f32
(
filter_ptr
+
8
);
vf3
=
vld1q_f32
(
filter_ptr
+
11
);
for
(
index_t
h
=
0
;
h
+
3
<
out_height
;
h
+=
4
)
{
for
(
index_t
wt
=
0
;
wt
<
tile_width
&&
w
+
wt
<
out_width
;
++
wt
)
{
// load output
index_t
out_offset
=
h
*
out_width
+
w
+
wt
;
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t
vo
=
{
out_ptr_base
[
out_offset
],
out_ptr_base
[
out_offset
+
out_width
],
out_ptr_base
[
out_offset
+
2
*
out_width
],
out_ptr_base
[
out_offset
+
3
*
out_width
]};
// input offset
index_t
in_offset
=
h
*
in_width
+
w
+
wt
;
// input (3 slide)
float32x4_t
vi0
=
{
in_ptr_base
[
in_offset
],
in_ptr_base
[
in_offset
+
in_width
],
in_ptr_base
[
in_offset
+
2
*
in_width
],
in_ptr_base
[
in_offset
+
3
*
in_width
]};
float32x4_t
vi4
=
{
in_ptr_base
[
in_offset
+
4
*
in_width
],
in_ptr_base
[
in_offset
+
5
*
in_width
],
in_ptr_base
[
in_offset
+
6
*
in_width
],
in_ptr_base
[
in_offset
+
7
*
in_width
]};
float32x4_t
vi8
=
{
in_ptr_base
[
in_offset
+
8
*
in_width
],
in_ptr_base
[
in_offset
+
9
*
in_width
],
in_ptr_base
[
in_offset
+
10
*
in_width
],
in_ptr_base
[
in_offset
+
11
*
in_width
]};
float32x4_t
vi12
=
{
in_ptr_base
[
in_offset
+
12
*
in_width
],
in_ptr_base
[
in_offset
+
13
*
in_width
],
in_ptr_base
[
in_offset
+
14
*
in_width
],
in_ptr_base
[
in_offset
+
15
*
in_width
]};
float32x4_t
vi16
=
{
in_ptr_base
[
in_offset
+
16
*
in_width
],
in_ptr_base
[
in_offset
+
17
*
in_width
]};
float32x4_t
vi1
=
vextq_f32
(
vi0
,
vi4
,
1
);
float32x4_t
vi2
=
vextq_f32
(
vi0
,
vi4
,
2
);
float32x4_t
vi3
=
vextq_f32
(
vi0
,
vi4
,
3
);
float32x4_t
vi5
=
vextq_f32
(
vi4
,
vi8
,
1
);
float32x4_t
vi6
=
vextq_f32
(
vi4
,
vi8
,
2
);
float32x4_t
vi7
=
vextq_f32
(
vi4
,
vi8
,
3
);
float32x4_t
vi9
=
vextq_f32
(
vi8
,
vi12
,
1
);
float32x4_t
vi10
=
vextq_f32
(
vi8
,
vi12
,
2
);
float32x4_t
vi11
=
vextq_f32
(
vi8
,
vi12
,
3
);
float32x4_t
vi13
=
vextq_f32
(
vi12
,
vi16
,
1
);
float32x4_t
vi14
=
vextq_f32
(
vi12
,
vi16
,
2
);
vo
=
vmlaq_lane_f32
(
vo
,
vi0
,
vget_low_f32
(
vf0
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi1
,
vget_low_f32
(
vf0
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi2
,
vget_high_f32
(
vf0
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi3
,
vget_high_f32
(
vf0
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi4
,
vget_low_f32
(
vf1
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi5
,
vget_low_f32
(
vf1
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi6
,
vget_high_f32
(
vf1
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi7
,
vget_high_f32
(
vf1
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi8
,
vget_low_f32
(
vf2
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi9
,
vget_low_f32
(
vf2
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi10
,
vget_high_f32
(
vf2
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi11
,
vget_high_f32
(
vf2
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi12
,
vget_low_f32
(
vf3
),
1
);
vo
=
vmlaq_lane_f32
(
vo
,
vi13
,
vget_high_f32
(
vf3
),
0
);
vo
=
vmlaq_lane_f32
(
vo
,
vi14
,
vget_high_f32
(
vf3
),
1
);
out_ptr_base
[
out_offset
]
=
vo
[
0
];
out_ptr_base
[
out_offset
+
out_width
]
=
vo
[
1
];
out_ptr_base
[
out_offset
+
2
*
out_width
]
=
vo
[
2
];
out_ptr_base
[
out_offset
+
3
*
out_width
]
=
vo
[
3
];
}
// wt
}
// h
#else
Conv2dCPUK15x1Calc
(
in_ptr_base
,
filter_ptr
,
in_width
,
in_channels
,
out_height
,
out_width
,
w
,
tile_width
,
out_image_size
,
out_ptr_base
,
0
,
1
);
#endif
}
// c
}
// w
}
// m
}
// b
}
}
// namespace kernels
}
// namespace mace
mace/kernels/arm/conv_2d_neon_1x15.cc
0 → 100644
浏览文件 @
60695ebd
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
namespace
mace
{
namespace
kernels
{
inline
void
Conv2dCPUK1x15Calc
(
const
float
*
in_ptr
,
const
float
*
filter_ptr
,
const
index_t
in_width
,
const
index_t
in_channels
,
const
index_t
out_height
,
const
index_t
h
,
const
index_t
tile_height
,
const
index_t
out_width
,
const
index_t
out_image_size
,
float
*
out_ptr
,
const
index_t
io
,
const
int
stride
)
{
for
(
index_t
ih
=
0
;
ih
<
tile_height
&&
h
+
ih
<
out_height
;
++
ih
)
{
for
(
index_t
iw
=
0
;
iw
<
out_width
;
++
iw
)
{
for
(
int
i
=
0
;
i
<
1
;
++
i
)
{
for
(
int
j
=
0
;
j
<
15
;
++
j
)
{
out_ptr
[
io
*
out_image_size
+
(
h
+
ih
)
*
out_width
+
iw
]
+=
in_ptr
[((
h
+
ih
)
*
stride
+
i
)
*
in_width
+
(
iw
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
15
+
j
];
}
}
}
}
}
// Stride-1 convolution with a 1x15 filter (1 row, 15 columns) on float data.
//
// in_shape / out_shape are indexed as [0] batch, [1] channels, [2] height,
// [3] width and both tensors are addressed as dense arrays in that order.
// The filter holds 15 consecutive taps per (output channel, input channel)
// pair. Results are accumulated into `output`, so its previous contents act
// as the starting value.
//
// Register blocking of the NEON path: Ho = 1, Wo = 4, Co = 1, i.e. one step
// produces four horizontally adjacent outputs of one row and one channel.
// The NEON path only handles complete groups of four output columns.
void Conv2dNeonK1x15S1(const float *input,
                       const float *filter,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output) {
  const index_t in_image_size = in_shape[2] * in_shape[3];
  const index_t out_image_size = out_shape[2] * out_shape[3];
  const index_t in_batch_size = in_shape[1] * in_image_size;
  const index_t out_batch_size = out_shape[1] * out_image_size;
  // With fewer than four output channels the height is cut into tiles of
  // RoundUpDiv4(out_height) rows (presumably to give the collapsed parallel
  // loop more iterations); otherwise one tile spans the whole height.
  const index_t tile_height =
      out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];

#pragma omp parallel for collapse(3)
  for (index_t b = 0; b < out_shape[0]; ++b) {
    for (index_t m = 0; m < out_shape[1]; ++m) {
      for (index_t h = 0; h < out_shape[2]; h += tile_height) {
        const index_t out_height = out_shape[2];
        const index_t out_width = out_shape[3];
        const index_t in_channels = in_shape[1];
        const index_t in_width = in_shape[3];
        float *out_ptr_base =
            output + b * out_batch_size + m * out_image_size;
        for (index_t c = 0; c < in_channels; ++c) {
          const float *in_ptr_base =
              input + b * in_batch_size + c * in_image_size;
          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
          // Load the 15 taps of this (output channel, input channel) pair.
          // The last load starts at tap 11, so vf3 = {f11, f12, f13, f14};
          // only f12..f14 are taken from it below.
          float32x4_t vf0, vf1, vf2, vf3;
          vf0 = vld1q_f32(filter_ptr);
          vf1 = vld1q_f32(filter_ptr + 4);
          vf2 = vld1q_f32(filter_ptr + 8);
          vf3 = vld1q_f32(filter_ptr + 11);

          for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
            for (index_t w = 0; w + 3 < out_width; w += 4) {
              // output (1 outch x 1 height x 4 width)
              float32x4_t vo;
              // load output
              index_t out_offset = (h + ht) * out_width + w;
              vo = vld1q_f32(out_ptr_base + out_offset);

              // viK holds input columns w + K .. w + K + 3 of this row, i.e.
              // the samples that tap K multiplies for the four outputs.
              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
                  vi10, vi11, vi12, vi13, vi14, vi16;
              // input offset
              index_t in_offset = (h + ht) * in_width + w;
              // load input
              vi0 = vld1q_f32(in_ptr_base + in_offset);
              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
              vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
              // Only lanes 0-1 of vi16 are consumed (by vi13 and vi14), so
              // load just columns w + 16 and w + 17. A full 4-lane load would
              // also touch w + 18 and w + 19, which lie outside the 15-tap
              // window of the last output column and, when in_width equals
              // out_width + 14, beyond the end of the input row.
              vi16 = vcombine_f32(vld1_f32(in_ptr_base + in_offset + 16),
                                  vdup_n_f32(0.f));

              vi1 = vextq_f32(vi0, vi4, 1);
              vi2 = vextq_f32(vi0, vi4, 2);
              vi3 = vextq_f32(vi0, vi4, 3);
              vi5 = vextq_f32(vi4, vi8, 1);
              vi6 = vextq_f32(vi4, vi8, 2);
              vi7 = vextq_f32(vi4, vi8, 3);
              vi9 = vextq_f32(vi8, vi12, 1);
              vi10 = vextq_f32(vi8, vi12, 2);
              vi11 = vextq_f32(vi8, vi12, 3);
              vi13 = vextq_f32(vi12, vi16, 1);
              vi14 = vextq_f32(vi12, vi16, 2);

              // Multiply-accumulate in tap order 0 .. 14.
              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
              // vf3 starts at tap 11: lane 1 is tap 12, lanes 2-3 are 13-14.
              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);

              vst1q_f32(out_ptr_base + out_offset, vo);
            }  // w
          }  // ht
#else
          // Scalar fallback; the plane and filter pointers are already offset,
          // so the channel index is 0 and the stride is 1.
          Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
                             out_height, h, tile_height, out_width,
                             out_image_size, out_ptr_base, 0, 1);
#endif
        }  // c
      }  // h
    }  // m
  }  // b
}
}
// namespace kernels
}
// namespace mace
mace/kernels/conv_2d.h
浏览文件 @
60695ebd
...
...
@@ -363,6 +363,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
&&
stride_h
==
2
&&
stride_w
==
2
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_7x7_s3
=
filter_h
==
7
&&
filter_w
==
7
&&
stride_h
==
3
&&
stride_w
==
3
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_1x15_s1
=
filter_h
==
1
&&
filter_w
==
15
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
;
bool
use_neon_15x1_s1
=
filter_h
==
15
&&
filter_w
==
1
&&
stride_h
==
1
&&
stride_w
==
1
&&
dilation_h
==
1
&&
dilation_w
==
1
;
std
::
vector
<
index_t
>
transformed_input_shape
;
std
::
vector
<
index_t
>
transformed_output_shape
;
...
...
@@ -402,24 +406,26 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
tile_count
});
transformed_filter_shape
.
insert
(
transformed_filter_shape
.
end
(),
{
in_tile_area
,
channels
,
input_channels
});
}
else
if
(
use_neon_3x3_s1
)
{
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
2
);
extra_input_height
=
std
::
max
(
padded_input_height
,
extra_output_height
+
2
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_input_width
=
std
::
max
(
padded_input_width
,
extra_output_width
+
2
);
if
(
extra_input_height
!=
padded_input_height
)
{
pad_bottom
+=
(
extra_input_height
-
padded_input_height
);
}
if
(
extra_input_width
!=
padded_input_width
)
{
pad_right
+=
(
extra_input_width
-
padded_input_width
);
}
else
{
index_t
tile_h
,
tile_w
;
if
(
use_neon_1x1_s1
)
{
tile_h
=
1
;
tile_w
=
1
;
}
else
if
(
use_neon_3x3_s1
)
{
tile_h
=
2
;
tile_w
=
4
;
}
else
if
(
use_neon_15x1_s1
)
{
tile_h
=
4
;
tile_w
=
1
;
}
else
{
tile_h
=
1
;
tile_w
=
4
;
}
}
else
if
(
!
use_neon_1x1_s1
)
{
extra_output_height
=
height
;
extra_output_height
=
RoundUp
<
index_t
>
(
height
,
tile_h
);
extra_input_height
=
std
::
max
(
padded_input_height
,
(
extra_output_height
-
1
)
*
stride_h
+
(
filter_h
-
1
)
*
dilation_h
+
1
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
4
);
extra_output_width
=
RoundUp
<
index_t
>
(
width
,
tile_w
);
extra_input_width
=
std
::
max
(
padded_input_width
,
(
extra_output_width
-
1
)
*
stride_w
+
(
filter_w
-
1
)
*
dilation_w
+
1
);
...
...
@@ -584,6 +590,22 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_1x15_s1
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK1x15S1
(
pad_input
,
filter_data
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
if
(
use_neon_15x1_s1
)
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dNeonK15x1S1
(
pad_input
,
filter_data
,
extra_input_shape
,
extra_output_shape
,
pad_output
);
};
}
else
{
conv_func
=
[
=
](
const
float
*
pad_input
,
float
*
pad_output
)
{
Conv2dGeneral
(
pad_input
,
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
60695ebd
...
...
@@ -165,10 +165,14 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
BM_CONV_2D
(
1
,
128
,
56
,
56
,
1
,
1
,
1
,
1
,
SAME
,
128
);
BM_CONV_2D
(
1
,
1024
,
7
,
7
,
1
,
1
,
1
,
1
,
SAME
,
1024
);
BM_CONV_2D
(
64
,
32
,
34
,
34
,
3
,
3
,
1
,
1
,
VALID
,
32
);
BM_CONV_2D
(
1
,
32
,
34
,
34
,
3
,
3
,
1
,
1
,
VALID
,
32
);
// bokeh
BM_CONV_2D
(
1
,
32
,
256
,
256
,
1
,
15
,
1
,
1
,
SAME
,
2
);
BM_CONV_2D
(
1
,
32
,
256
,
256
,
15
,
1
,
1
,
1
,
SAME
,
2
);
BM_CONV_2D
(
1
,
64
,
64
,
64
,
15
,
1
,
1
,
1
,
SAME
,
2
);
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/conv_2d_test.cc
浏览文件 @
60695ebd
...
...
@@ -779,11 +779,17 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
// Invokes TestHalfComplexConvNxNS12 for DeviceType::GPU three times, each
// with a second argument beginning {15, 1, ...} and a third argument {1, 1}.
// NOTE(review): the helper is defined outside this view. The arguments look
// like {input height, width}, {filter height, width, channels...} and
// {dilations} - confirm against its signature.
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
15
,
1
,
64
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
256
,
256
},
{
15
,
1
,
32
,
2
},
{
1
,
1
});
}
// Invokes TestHalfComplexConvNxNS12 for DeviceType::GPU twice, each with a
// second argument beginning {1, 15, ...} and a third argument {1, 1}.
// NOTE(review): the helper is defined outside this view. The arguments look
// like {input height, width}, {filter height, width, channels...} and
// {dilations} - confirm against its signature.
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
256
,
256
},
{
1
,
15
,
32
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x75S12
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录