MegEngine 天元 / MegEngine · Commit 66950a4f
Commit 66950a4f, authored on May 15, 2020 by Megvii Engine Team.
feat(dnn/arm): add nchw44 fp32 direct conv stride2
GitOrigin-RevId: 4106b46b6cea3b496a8dfcfae80f6383a96d7739
Parent: bfe945fb
Showing 10 changed files with 1290 additions and 41 deletions (+1290, -41).
dnn/src/arm_common/conv_bias/fp32/algos.h                                  +17    -0
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp      +281    -0
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp      +748    -0
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h         +40    -0
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_kern.cpp   +4    -4
dnn/src/arm_common/conv_bias/intrinsic_helper.h                           +150   -21
dnn/src/arm_common/conv_bias/opr_impl.cpp                                   +3    -0
dnn/src/arm_common/conv_bias/opr_impl.h                                     +2    -0
dnn/test/arm_common/conv_bias.cpp                                          +10    -0
dnn/test/arm_common/conv_bias_multi_thread.cpp                             +35   -16
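The commit adds a direct stride-2 float32 convolution for the NCHW44 layout. In NCHW44, channels are stored in blocks of 4, with each block interleaved at the innermost dimension, so one NEON float32x4 load fetches 4 channels of a single spatial position. A minimal sketch of the repacking, for orientation only (`nchw_to_nchw44` is a hypothetical helper, not a function from this commit; the real repacking lives in MegDNN's relayout/weight-preprocess paths):

```cpp
#include <cstddef>
#include <vector>

// Illustrative only: repack a dense NCHW float tensor into NCHW44.
// Assumes C is divisible by pack_c = 4.
std::vector<float> nchw_to_nchw44(const std::vector<float>& src, size_t N,
                                  size_t C, size_t H, size_t W) {
    constexpr size_t pack_c = 4;
    std::vector<float> dst(N * C * H * W);
    for (size_t n = 0; n < N; ++n)
        for (size_t cb = 0; cb < C / pack_c; ++cb)
            for (size_t h = 0; h < H; ++h)
                for (size_t w = 0; w < W; ++w)
                    for (size_t c4 = 0; c4 < pack_c; ++c4) {
                        size_t c = cb * pack_c + c4;
                        // dst layout: (N, C/4, H, W, 4)
                        dst[(((n * (C / pack_c) + cb) * H + h) * W + w) *
                                    pack_c +
                            c4] = src[((n * C + c) * H + h) * W + w];
                    }
    return dst;
}
```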
dnn/src/arm_common/conv_bias/fp32/algos.h

```diff
@@ -178,6 +178,23 @@ public:
             const NCBKernSizeParam& param) const override;
 };
 
+class ConvBiasImpl::AlgoF32DirectStride2NCHW44 final : public AlgoBase {
+    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
+
+public:
+    AlgoF32DirectStride2NCHW44() {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override { return "F32_CONV_NCHW44_DIRECT_S2"; }
+    bool usable(fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(fallback::ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl* opr,
+            const NCBKernSizeParam& param) const override;
+};
+
 class ConvBiasImpl::AlgoF32DirectStride1 final : public AlgoBase {
     SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
     bool m_large_group;
```
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp (new file, mode 100644)
```cpp
/**
 * \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_algo.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/oprs.h"
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/elemwise_op.h"
#include "src/common/opr_delegate.h"

#include "midout.h"

using namespace megdnn;
using namespace arm_common;
using conv_fun = std::function<void(
        WorkspaceBundle bundle, const ConvBiasImpl::NCBKernParam& kern_param,
        const ConvBiasImpl::NCBKernIndex& ncb_index,
        const CpuNDRange& workspace_ids, const CpuNDRange& ncb_range)>;

MIDOUT_DECL(megdnn_arm_common_conv_bias_fp32_nchw44_stride2)
```
```cpp
namespace {
// block_helper is used to calculate oh block size
static inline int block_helper(const int nthread, const int amount,
                               const int size_per_unit) {
    constexpr int l2_cache_size = 256 * 1024;
    const int block_per_thread = div_ceil(amount, nthread);
    const int best_block = std::min(
            amount, (l2_cache_size + size_per_unit / 2) / size_per_unit);
    const int max_block_num = div_ceil(block_per_thread, best_block);
    const int min_block_num = std::max(max_block_num - 1, 1);
    const int max_block = div_ceil(block_per_thread, max_block_num);
    const int min_block = div_ceil(block_per_thread, min_block_num);
    const int max_loss = std::abs(max_block_num * max_block - block_per_thread);
    const int min_loss = std::abs(min_block_num * min_block - block_per_thread);
    int block = max_loss > min_loss ? min_block : max_block;
    return block;
}
```
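block_helper picks the output-row block size so that one block's packed input (size_per_unit bytes per output row, as passed by the callers below) roughly fits an assumed 256 KiB L2 cache while still dividing the rows evenly across threads. A standalone sketch with illustrative numbers, assuming div_ceil is MegDNN's round-up integer division:

```cpp
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Standalone re-implementation for a worked example (not the MegDNN source).
static int div_ceil(int a, int b) { return (a + b - 1) / b; }

static int block_helper(int nthread, int amount, int size_per_unit) {
    constexpr int l2_cache_size = 256 * 1024;
    const int block_per_thread = div_ceil(amount, nthread);
    const int best_block = std::min(
            amount, (l2_cache_size + size_per_unit / 2) / size_per_unit);
    const int max_block_num = div_ceil(block_per_thread, best_block);
    const int min_block_num = std::max(max_block_num - 1, 1);
    const int max_block = div_ceil(block_per_thread, max_block_num);
    const int min_block = div_ceil(block_per_thread, min_block_num);
    const int max_loss = std::abs(max_block_num * max_block - block_per_thread);
    const int min_loss = std::abs(min_block_num * min_block - block_per_thread);
    return max_loss > min_loss ? min_block : max_block;
}

int main() {
    // ic = 32, iw = 112: one output row costs ic * iw * sizeof(float) * 2
    // = 28672 bytes of packed input; 56 output rows on 4 threads.
    printf("%d\n", block_helper(4, 56, 32 * 112 * 4 * 2));  // prints 7
    return 0;
}
```

Each thread gets 14 rows and processes them as two blocks of 7; a 7-row block touches about 7 x 28 KiB of packed input, comfortably inside the 256 KiB budget.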
```cpp
static inline size_t get_perthread_cache_bytes(const int ic, const int ih2,
                                               const int iw2) {
    // border_size is used to avoid read illegal memory
    int border_size = 64 * 2;
    return ic * ih2 * iw2 * sizeof(float) + border_size;
}
```
```cpp
static void get_rectified_size(
        const megdnn::fallback::ConvBiasImpl::NCBKernSizeParam& param,
        int& ih2, int& iw2, int& oh2, int& ow2) {
    int ic = param.filter_meta.icpg;
    int iw = param.isz[1];
    int oh = param.osz[0];
    int ow = param.osz[1];

    oh2 = oh;
    ow2 = ow;
    constexpr int cacheline = 64 / sizeof(float);
    int block_oh =
            block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2);
    auto&& fm = param.filter_meta;
    const int stride_h = static_cast<int>(fm.stride[0]);
    const int filter_h = static_cast<int>(fm.spatial[0]);
    ih2 = block_oh * stride_h + filter_h - stride_h;
    iw2 = round_up(iw + 2 * static_cast<int>(fm.padding[1]), cacheline);
}
```
```cpp
static WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param) {
    auto&& fm = param.filter_meta;
    int ic = fm.icpg;
    int ih2, iw2, oh2, ow2;
    get_rectified_size(param, ih2, iw2, oh2, ow2);

    size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    return {nullptr, {src_size * param.nr_threads}};
};
```
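Continuing the numbers from the block_helper sketch above (ic = 32, iw = 112, pad = 1, stride 2, 3x3 filter, oh block = 7): get_rectified_size yields ih2 = 7 * 2 + 3 - 2 = 15 input rows per block and iw2 = round_up(112 + 2, 16) = 128 floats per padded row. A quick re-computation of the per-thread staging-buffer size (a check, not part of the commit):

```cpp
#include <cstdio>

static size_t get_perthread_cache_bytes(int ic, int ih2, int iw2) {
    int border_size = 64 * 2;  // guard bytes so trailing vector loads stay legal
    return ic * ih2 * iw2 * sizeof(float) + border_size;
}

int main() {
    // ic = 32, ih2 = 15, iw2 = 128 -> 245888 bytes (~240 KiB) per thread
    printf("%zu\n", get_perthread_cache_bytes(32, 15, 128));
    return 0;
}
```

get_bundle reserves this amount once per thread, so threads never share the packed-source scratch area.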
```cpp
template <size_t filter, BiasMode bias_mode, typename Op>
static void do_conv_kern(WorkspaceBundle bundle,
                         const ConvBiasImpl::NCBKernParam& kern_param,
                         const ConvBiasImpl::NCBKernIndex& ncb_index,
                         const CpuNDRange&, const CpuNDRange&) {
    const int oh = kern_param.osz[0];
    const int ow = kern_param.osz[1];
    const int fh = kern_param.filter_meta.spatial[0];
    const int fw = kern_param.filter_meta.spatial[1];
    const int ic = kern_param.filter_meta.icpg;
    const int oc = kern_param.filter_meta.ocpg;
    const int ih = kern_param.isz[0];
    const int iw = kern_param.isz[1];
    const int stride_h = kern_param.filter_meta.stride[0];
    const int ph = kern_param.filter_meta.padding[0];
    const int pw = kern_param.filter_meta.padding[1];
    int ih2 = 0;
    int iw2 = 0;
    int oh2 = 0;
    int ow2 = 0;
    get_rectified_size(kern_param, ih2, iw2, oh2, ow2);
    bundle.set(kern_param.workspace_ptr);

    constexpr int pack_c = 4;
    const int batch_id = ncb_index.ndrange_id[0];
    const int group_id = ncb_index.ndrange_id[1];
    constexpr int oc_idx = 0;
    int oc_block = oc;
    int oh_block = block_helper(kern_param.nr_threads, oh2,
                                ic * iw * sizeof(float) * 2);
    const int oh_idx = ncb_index.ndrange_id[2];
    const int oh_block_real = std::min(oh - oh_idx * oh_block, oh_block);
    const int ih_real = oh_block_real * stride_h + fh - stride_h;
    const int src_top_pad = std::max(ph - oh_idx * oh_block * stride_h, 0);
    const int src_bottom_pad = std::max(
            (oh_idx * oh_block + oh_block_real - 1) * stride_h + fh - ih - ph,
            0);
    const int remain_right_pad = std::max(iw2 - iw - pw, 0);
    const int src_offset =
            std::max(oh_idx * oh_block * stride_h - ph, 0) * iw * pack_c;
    const float* origin_sptr =
            static_cast<const float*>(
                    kern_param.src<float>(batch_id, group_id, 0, 1, 1)) +
            src_offset;
    const size_t src_size = get_perthread_cache_bytes(ic, ih2, iw2);
    float* sptr = reinterpret_cast<float*>((int8_t*)bundle.get(0) +
                                           ncb_index.thread_id * src_size);

    conv_bias::pack_src_fp32_nchw44_stride2(
            sptr, origin_sptr, ph, pw, remain_right_pad,
            ih_real - src_top_pad - src_bottom_pad, iw, iw2, src_top_pad,
            src_bottom_pad, ic, ih * iw);

    const float* fptr =
            kern_param.filter<dt_float32>(group_id) + oc_idx * fh * fw * ic;
    float_t* dst = kern_param.dst<float_t>(batch_id, group_id) +
                   oh_idx * oh_block * ow * pack_c;
    const int bias_offset = bias_mode == BiasMode::BIAS
                                    ? oh_idx * oh_block * ow * pack_c
                                    : oc_idx;
    const float* bptr =
            kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
    Op op;

#define KERN1_NCHW44_CONV(filter)                                         \
    conv_bias::conv_direct_stride2_##filter##x##filter##_fp32_nchw44<     \
            bias_mode, Op>(sptr, fptr, bptr, nullptr, dst, oc_block, ic,  \
                           ih_real, iw2, oh, oh_block_real, ow, op, ph, pw)
    DISPATCH_FILTER(filter, KERN1_NCHW44_CONV);
#undef KERN1_NCHW44_CONV
}
}  // namespace
```
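DISPATCH_FILTER is an existing MegDNN helper macro that stamps KERN1_NCHW44_CONV for each supported filter size. Roughly, and assuming the macro's usual switch form (a sketch, not the macro's literal definition):

```cpp
// Approximate expansion of DISPATCH_FILTER(filter, KERN1_NCHW44_CONV):
switch (filter) {
    case 2: KERN1_NCHW44_CONV(2); break;
    case 3: KERN1_NCHW44_CONV(3); break;
    case 5: KERN1_NCHW44_CONV(5); break;
    case 7: KERN1_NCHW44_CONV(7); break;
    default: megdnn_assert(0);
}
```

Since `filter` is a compile-time template parameter here, each instantiation of do_conv_kern keeps only the matching case.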
```cpp
/* ===================== stride2 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectStride2NCHW44::usable(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param,
        AlgoSelectionStrategy) const {
    auto&& fm = param.filter_meta;
    auto fh = fm.spatial[0];
    int oc = fm.ocpg;
    bool ok_type = ((param.src_type.enumv() == DTypeEnum::Float32 &&
                     param.filter_type.enumv() == DTypeEnum::Float32 &&
                     (param.dst_type.enumv() == DTypeEnum::Float32))) &&
                   (fm.format == param::Convolution::Format::NCHW44);
    bool ok_src_dst = (oc % 4 == 0 && oc >= 4);
    bool ok_filter = fm.spatial_ndim == 2 && fh == fm.spatial[1] &&
                     (fh == 2 || fh == 3 || fh == 5 || fh == 7);
    bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                    fm.stride[0] == 2 && fm.stride[1] == 2;
    bool ok_conv = !fm.should_flip;
    bool avaible = ok_type && ok_src_dst && ok_filter && ok_slide && ok_conv;
    return avaible;
}
```
```cpp
size_t ConvBiasImpl::AlgoF32DirectStride2NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    return get_bundle(param).total_size_in_bytes();
}
```
```cpp
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    auto fm = param.filter_meta;
    const int batch = param.n;
    const int group = fm.group;
    WorkspaceBundle wbundle = get_bundle(param);
    conv_fun do_conv_fun = nullptr;
    // NOTE: remain_w is not used to gen hash of midout for compatible with
    // shape runtime
#define DO_CONV_KERN_FUN(filter, bias_mode, op)                   \
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_fp32_nchw44_stride2, \
                 midout_iv(#filter #bias_mode #op##_hash)) {      \
        do_conv_fun = do_conv_kern<filter, bias_mode, op>;        \
    }                                                             \
    MIDOUT_END();

#define GET_OP_PARAM(filter, bias_mode)                               \
    switch (param.nonlineMode) {                                      \
        case param::ConvBias::NonlineMode::IDENTITY:                  \
            DO_CONV_KERN_FUN(filter, bias_mode, NoneOp<dt_float32>)   \
            break;                                                    \
        case param::ConvBias::NonlineMode::RELU:                      \
            DO_CONV_KERN_FUN(filter, bias_mode, ReluOp<dt_float32>)   \
            break;                                                    \
        case param::ConvBias::NonlineMode::H_SWISH:                   \
            DO_CONV_KERN_FUN(filter, bias_mode, HSwishOp<dt_float32>) \
            break;                                                    \
        default:                                                      \
            megdnn_assert(0);                                         \
            break;                                                    \
    }

#define GET_BIAS_MODE_PARAM(filter)                                \
    switch (param.bias_mode) {                                     \
        case BiasMode::NO_BIAS:                                    \
            GET_OP_PARAM(filter, BiasMode::NO_BIAS)                \
            break;                                                 \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                     \
            GET_OP_PARAM(filter, BiasMode::BROADCAST_CHANNEL_BIAS) \
            break;                                                 \
        case BiasMode::BIAS:                                       \
            GET_OP_PARAM(filter, BiasMode::BIAS)                   \
            break;                                                 \
        default:                                                   \
            megdnn_assert(0);                                      \
            break;                                                 \
    }

#define DISPATCH_CONV_KERN()                \
    switch (param.filter_meta.spatial[0]) { \
        case 2:                             \
            GET_BIAS_MODE_PARAM(2)          \
            break;                          \
        case 3:                             \
            GET_BIAS_MODE_PARAM(3)          \
            break;                          \
        case 5:                             \
            GET_BIAS_MODE_PARAM(5)          \
            break;                          \
        case 7:                             \
            GET_BIAS_MODE_PARAM(7)          \
            break;                          \
        default:                            \
            megdnn_assert(0);               \
            break;                          \
    }

    DISPATCH_CONV_KERN();
#undef DO_CONV_KERN_FUN
#undef GET_REMAIN_W_PARAM
#undef GET_OP_PARAM
#undef GET_BIAS_MODE_PARAM
#undef DISPATCH_CONV_KERN

    megdnn_assert(do_conv_fun);

    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    WorkspaceBundle bundle = wbundle;
    int oh = param.osz[0];
    int ic = param.filter_meta.icpg;
    int iw = param.isz[1];
    int oh_block =
            block_helper(param.nr_threads, oh, ic * iw * sizeof(float) * 2);
    CpuNDRange ncb_range = {static_cast<size_t>(batch),
                            static_cast<size_t>(group),
                            static_cast<size_t>(div_ceil(oh, oh_block))};
    auto do_conv = [bundle, do_conv_fun, ncb_range](
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) {
        do_conv_fun(bundle, kern_param, ncb_index, ncb_index.ndrange_id,
                    ncb_range);
    };
    ret_kerns.push_back({do_conv, ncb_range});
    return ret_kerns;
}

// vim: syntax=cpp.doxygen
```
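dispatch_kerns exposes the work as a 3-D range {batch, group, oh blocks}; the thread pool then invokes do_conv once per index, so every task packs and convolves one output-row block independently. A tiny sketch of the resulting task count (illustrative numbers, not from the commit):

```cpp
#include <cstdio>

static int div_ceil(int a, int b) { return (a + b - 1) / b; }

int main() {
    // ncb_range = {batch, group, div_ceil(oh, oh_block)}
    int batch = 1, group = 1, oh = 56, oh_block = 7;
    printf("%d tasks\n", batch * group * div_ceil(oh, oh_block));  // 8 tasks
    return 0;
}
```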
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.cpp (new file, mode 100644)

This diff is collapsed in the page view; it contains the 748 added lines of the stride-2 NCHW44 kernel implementations (not shown here).
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h (new file, mode 100644)

```cpp
/**
 * \file dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw44_kern.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/arm_common/conv_bias/opr_impl.h"
#include "src/fallback/conv_bias/common.h"

namespace megdnn {
namespace arm_common {
namespace conv_bias {

#define KERN(stride, i, layout)                                            \
    template <BiasMode bias_mode, typename Op>                             \
    void conv_direct_##stride##_##i##x##i##_fp32_##layout(                 \
            const float* src, const float* filter, const float* bias,      \
            float* temp, float* dst, const int oc, const int ic,           \
            const int ih, const int iw, const int oh, const int oh_block,  \
            const int ow, const Op& op, const int ph, const int pw);

KERN(stride2, 2, nchw44)
KERN(stride2, 3, nchw44)
KERN(stride2, 5, nchw44)
KERN(stride2, 7, nchw44)
#undef KERN

void pack_src_fp32_nchw44_stride2(float* sptr_base, const float* sptr_origin,
                                  const int ph, const int pw,
                                  const int pad_right, const int ih,
                                  const int iw, const int iw2,
                                  const int pad_top, const int pad_bottom,
                                  const int ic, const int ic_stride);

}  // namespace conv_bias
}  // namespace arm_common
}  // namespace megdnn
```
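Each KERN(...) line stamps out one kernel declaration per filter size. Mechanically, KERN(stride2, 2, nchw44) expands to:

```cpp
template <BiasMode bias_mode, typename Op>
void conv_direct_stride2_2x2_fp32_nchw44(
        const float* src, const float* filter, const float* bias, float* temp,
        float* dst, const int oc, const int ic, const int ih, const int iw,
        const int oh, const int oh_block, const int ow, const Op& op,
        const int ph, const int pw);
```

The matching definitions live in the collapsed f32_direct_stride2_nchw44_kern.cpp above.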
dnn/src/arm_common/conv_bias/fp32/f32_direct_stride2_nchw_nchw44_kern.cpp

```diff
@@ -111,7 +111,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block> {
         const int ld_src_ic = ih * iw;
         constexpr int c_dim = OCHelper<oc_block>::val;
         float32x4_t c[c_dim][8];
-        init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
+        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
         for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
             float32x4_t src[src_reg_size];
@@ -157,7 +157,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block> {
         const int ld_src_ic = ih * iw;
         constexpr int c_dim = OCHelper<oc_block>::val;
         float32x4_t c[c_dim][8];
-        init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
+        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
         for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
             float32x4_t src[src_reg_size];
@@ -201,7 +201,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block> {
         const int ld_src_ic = ih * iw;
         constexpr int c_dim = OCHelper<oc_block>::val;
         float32x4_t c[c_dim][8];
-        init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
+        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
         for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
             float32x4_t src[src_reg_size];
@@ -258,7 +258,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block> {
         const int ld_src_ic = ih * iw;
         constexpr int c_dim = OCHelper<oc_block>::val;
         float32x4_t c[c_dim][8];
-        init_ocx_ow8<c_dim, bias_mode>(c, bias_ptr, oc_step);
+        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
         for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
             float32x4_t src[src_reg_size];
```
dnn/src/arm_common/conv_bias/intrinsic_helper.h

```diff
@@ -194,7 +194,20 @@ struct StoreOcxOw8Remain<2, 0, Op, T> {
         op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24);
     }
 };
+
+template <typename Op, typename T>
+struct StoreOcxOw8Remain<2, 8, Op, T> {
+    static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) {
+        op({{c[0][0], c[0][1]}}, dst_ptr);
+        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
+        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
+        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
+        op({{c[1][0], c[1][1]}}, dst_ptr + ld_dst_oc);
+        op({{c[1][2], c[1][3]}}, dst_ptr + ld_dst_oc + 8);
+        op({{c[1][4], c[1][5]}}, dst_ptr + ld_dst_oc + 16);
+        op({{c[1][6], c[1][7]}}, dst_ptr + ld_dst_oc + 24);
+    }
+};
 template <typename Op, typename T>
 struct StoreOcxOw8Remain<2, 7, Op, T> {
     static void impl(T& c, const Op& op, float32_t* dst_ptr, int ld_dst_oc) {
@@ -277,6 +290,15 @@ struct StoreOcxOw8Remain<1, 0, Op, T> {
         op({{c[0][6], c[0][7]}}, dst_ptr + 24);
     }
 };
+
+template <typename Op, typename T>
+struct StoreOcxOw8Remain<1, 8, Op, T> {
+    static void impl(T& c, const Op& op, float32_t* dst_ptr, int) {
+        op({{c[0][0], c[0][1]}}, dst_ptr);
+        op({{c[0][2], c[0][3]}}, dst_ptr + 8);
+        op({{c[0][4], c[0][5]}}, dst_ptr + 16);
+        op({{c[0][6], c[0][7]}}, dst_ptr + 24);
+    }
+};
 template <typename Op, typename T>
 struct StoreOcxOw8Remain<1, 7, Op, T> {
```
```diff
@@ -499,46 +521,127 @@ inline void init_oc8_ow8(int32x4_t c[2][8], const int32_t* bias_ptr,
     }
 }
 /////////////////////////init_ocx_ow8////////////////////
-template <int c_dim, BiasMode bias_mode, typename T, typename T2>
+template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
 struct InitOcxOw8 {
     static void impl(T& c, T2 bias_ptr, int oc_step);
 };
-template <BiasMode bias_mode, typename T, typename T2>
-struct InitOcxOw8<2, bias_mode, T, T2> {
-    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
-        if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
-#define BAIS_INIT(step)               \
-    c[0][step] = vld1q_f32(bias_ptr); \
-    c[1][step] = vld1q_f32(bias_ptr + oc_step);
-            UNROLL_CALL_RAW(8, BAIS_INIT);
-#undef BAIS_INIT
-        } else {
-#define BAIS_INIT(step)          \
-    c[0][step] = vdupq_n_f32(0); \
-    c[1][step] = vdupq_n_f32(0);
-            UNROLL_CALL_RAW(8, BAIS_INIT);
-#undef BAIS_INIT
-        }
-    }
-};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t*, int) {
+#define BAIS_INIT(step)          \
+    c[0][step] = vdupq_n_f32(0); \
+    c[1][step] = vdupq_n_f32(0);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t*, int) {
+#define BAIS_INIT(step)          \
+    c[0][step] = vdupq_n_f32(0); \
+    c[1][step] = vdupq_n_f32(0);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
+#define BAIS_INIT(step)               \
+    c[0][step] = vld1q_f32(bias_ptr); \
+    c[1][step] = vld1q_f32(bias_ptr + oc_step);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
+#define BAIS_INIT(step)               \
+    c[0][step] = vld1q_f32(bias_ptr); \
+    c[1][step] = vld1q_f32(bias_ptr + oc_step);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
+        constexpr int simd_len = 4;
+#define BAIS_INIT(step)                                 \
+    c[0][step] = vld1q_f32(bias_ptr + step * simd_len); \
+    c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int oc_step) {
+        constexpr int simd_len = 4;
+#define BAIS_INIT(step)                                 \
+    c[0][step] = vld1q_f32(bias_ptr + step * simd_len); \
+    c[1][step] = vld1q_f32(bias_ptr + oc_step + step * simd_len);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
-template <BiasMode bias_mode, typename T, typename T2>
-struct InitOcxOw8<1, bias_mode, T, T2> {
-    static void impl(T& c, const float32_t* bias_ptr, int) {
-        if (bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
-#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr);
-            UNROLL_CALL_RAW(8, BAIS_INIT);
-#undef BAIS_INIT
-        } else {
-#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
-            UNROLL_CALL_RAW(8, BAIS_INIT);
-#undef BAIS_INIT
-        }
-    }
-};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t*, int) {
+#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t*, int) {
+#define BAIS_INIT(step) c[0][step] = vdupq_n_f32(0);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int) {
+#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int) {
+#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int) {
+        constexpr int simd_len = 4;
+#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len);
+        UNROLL_CALL_RAW(8, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
+template <typename T, typename T2>
+struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
+    static void impl(T& c, const float32_t* bias_ptr, int) {
+        constexpr int simd_len = 4;
+#define BAIS_INIT(step) c[0][step] = vld1q_f32(bias_ptr + step * simd_len);
+        UNROLL_CALL_RAW(4, BAIS_INIT);
+#undef BAIS_INIT
+    }
+};
-template <int c_dim, BiasMode bias_mode, typename T, typename T2>
+template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
 inline void init_ocx_ow8(T& c, T2 bias_ptr, int oc_step) {
-    InitOcxOw8<c_dim, bias_mode, T, T2>::impl(c, bias_ptr, oc_step);
+    InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step);
 }
 /////////////////////init_ocx_ow4/////////////////////
 template <int c_dim, BiasMode bias_mode, typename T>
```
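The rework above replaces the runtime `if (bias_mode == ...)` branches inside the old InitOcxOw8 with full specializations on (c_dim, bias_mode, ow_block), so each kernel instantiation contains only the accumulator initialization it actually needs and the bias check is resolved at compile time. A minimal analogue of the pattern (illustrative, not MegDNN code):

```cpp
// Moving a runtime mode check into the template arguments lets the compiler
// pick the right initializer per instantiation, with no branch at run time.
enum class Mode { ZERO, LOAD };

template <Mode mode>
struct Init;  // primary template: declared, never defined

template <>
struct Init<Mode::ZERO> {
    static void impl(float& c, const float*) { c = 0.f; }
};

template <>
struct Init<Mode::LOAD> {
    static void impl(float& c, const float* p) { c = *p; }
};

template <Mode mode>
inline void init(float& c, const float* p) {
    Init<mode>::impl(c, p);  // resolved at compile time
}
```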
```diff
@@ -638,6 +741,20 @@ struct LoadHelper<6, base_offset, ptr_step, 0, Func, T, T2, XT...> {
         UNROLL_CALL_RAW(6, WEIGHT_CB);
     }
 };
+template <int base_offset, int ptr_step, typename Func, typename T,
+          typename T2, typename... XT>
+struct LoadHelper<7, base_offset, ptr_step, 0, Func, T, T2, XT...> {
+    static void impl(T& src, T2 ptr, int, XT... args) {
+        UNROLL_CALL_RAW(7, WEIGHT_CB);
+    }
+};
+template <int base_offset, int ptr_step, typename Func, typename T,
+          typename T2, typename... XT>
+struct LoadHelper<8, base_offset, ptr_step, 0, Func, T, T2, XT...> {
+    static void impl(T& src, T2 ptr, int, XT... args) {
+        UNROLL_CALL_RAW(8, WEIGHT_CB);
+    }
+};
 #undef WEIGHT_CB
 #define WEIGHT_CB(step) \
@@ -674,6 +791,11 @@ struct LoadHelper<7, base_offset, ptr_step, 1, Func, T, T2> {
     static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(7, WEIGHT_CB); }
 };
+template <int base_offset, int ptr_step, typename Func, typename T,
+          typename T2>
+struct LoadHelper<8, base_offset, ptr_step, 1, Func, T, T2> {
+    static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(8, WEIGHT_CB); }
+};
 #undef WEIGHT_CB
 #define WEIGHT_CB(step) \
@@ -724,6 +846,13 @@ struct LoadHelper<7, base_offset, ptr_step, 2, Func, T, T2> {
     }
 };
+template <int base_offset, int ptr_step, typename Func, typename T,
+          typename T2>
+struct LoadHelper<8, base_offset, ptr_step, 2, Func, T, T2> {
+    static void impl(T& src, T2 ptr, int oc_offset) {
+        UNROLL_CALL_RAW(8, WEIGHT_CB);
+    }
+};
 #undef WEIGHT_CB
 template <int weight_number, int base_offset, int ptr_step, int c_dim,
```
dnn/src/arm_common/conv_bias/opr_impl.cpp

```diff
@@ -67,6 +67,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
     AlgoF32Direct f32_direct_large_group{true};
     AlgoF32Direct f32_direct_small_group{false};
+    AlgoF32DirectStride2NCHW44 f32_direct_stride2_nchw44;
     AlgoF32DirectStride2 f32_direct_stride2_large_group{true};
     AlgoF32DirectStride2 f32_direct_stride2_small_group{false};
     AlgoF32DirectStride1 f32_direct_stride1_large_group{true};
@@ -125,6 +126,8 @@ public:
         direct_algos.emplace_back(&i8x8x16_stride2_large_group);
         direct_algos.emplace_back(&i8x8x16_stride2_small_group);
+        direct_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
+        direct_algos.emplace_back(&f32_direct_stride2_nchw44);
         direct_algos.emplace_back(&f32_direct_stride1_large_group);
         direct_algos.emplace_back(&f32_direct_stride1_small_group);
         direct_algos.emplace_back(&f32_direct_stride2_large_group);
```
dnn/src/arm_common/conv_bias/opr_impl.h

```diff
@@ -68,6 +68,8 @@ private:
     class AlgoF32DirectStride1;
     class AlgoF32DirectStride2;
+    class AlgoF32DirectStride2NCHWNCHW44;
+    class AlgoF32DirectStride2NCHW44;
     class AlgoI8x8x16Direct;
     class AlgoI8x8x16Stride2;
     class AlgoI8x8x16Stride2Filter2;
```
dnn/test/arm_common/conv_bias.cpp

```diff
@@ -198,6 +198,16 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) {
+        run(1, 1, 4, 112, 112, 2, 2, true);
+        run(1, 3, 32, 224, 224, 3, 2, true);
+        run(1, 3, 64, 224, 224, 7, 2, true);
+        run(1, 64, 128, 56, 56, 3, 2, false);
+        run(1, 128, 256, 28, 28, 3, 2, false);
+        run(1, 256, 512, 14, 14, 3, 2, false);
+        run(1, 64, 128, 56, 56, 7, 2, false);
+        run(1, 128, 256, 28, 28, 7, 2, false);
+        run(1, 256, 512, 14, 14, 7, 2, false);
+        run(1, 64, 64, 48, 48, 3, 2, false);
     } else {
         for (size_t stride : {1, 2}) {
             printf("stride %zu\n", stride);
```
dnn/test/arm_common/conv_bias_multi_thread.cpp

```diff
@@ -72,18 +72,16 @@ std::vector<conv_bias::TestArg> get_int8_quint8_conv_bias_args(
 std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
         std::vector<size_t> kernel_vec, size_t stride, bool no_pad = false,
         bool no_bias = false, bool no_nonlinemode = false,
-        bool is_input_nchw = false) {
+        bool is_input_nchw = false, bool support_full_bias = false) {
     using namespace conv_bias;
     using NLMode = param::ConvBias::NonlineMode;
     std::vector<TestArg> args;
 
     auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w,
                     size_t kernel, size_t stride, size_t group, NLMode nlmode,
-                    int any_pad = -1) {
+                    megdnn::BiasMode bias_mode, int any_pad = -1) {
         constexpr int pack_c = 4;
         const size_t pad = any_pad >= 0 ? any_pad : kernel / 2;
-        auto bias_mode = no_bias ? megdnn::BiasMode::NO_BIAS
-                                 : megdnn::BiasMode::BROADCAST_CHANNEL_BIAS;
         auto oc_per_group = oc / group;
         auto ic_per_group = ic / group;
         bool ok_group = (oc % group == 0 && ic % group == 0) &&
@@ -116,6 +114,10 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
         auto bias_tensor_shape = TensorShape{};
         if (bias_mode == megdnn::BiasMode::BROADCAST_CHANNEL_BIAS) {
             bias_tensor_shape = {1, oc / pack_c, 1, 1, pack_c};
+        } else if (bias_mode == megdnn::BiasMode::BIAS) {
+            bias_tensor_shape = {n, oc / pack_c,
+                                 (h + 2 * pad - kernel) / stride + 1,
+                                 (w + 2 * pad - kernel) / stride + 1, pack_c};
         }
         if (group == 1) {
             param.sparse = param::ConvBias::Sparse::DENSE;
@@ -149,19 +151,29 @@ std::vector<conv_bias::TestArg> get_nchw44_conv_bias_args(
         nonlinemode.emplace_back(NLMode::RELU);
         nonlinemode.emplace_back(NLMode::H_SWISH);
     }
-    for (auto nlmode : nonlinemode)
-        for (size_t n : {1, 2})
-            for (size_t kernel : kernel_vec)
-                for (size_t oc : {4, 12, 32})
-                    for (size_t ic : {1, 3, 4, 12, 32})
-                        for (size_t h : {3, 5, 12})
-                            for (size_t w : {7, 16, 23}) {
-                                for (size_t group = 1;
-                                     group <= std::min(oc, ic); ++group) {
-                                    pack(n, oc, ic, h, w, kernel, stride,
-                                         group, nlmode);
-                                }
-                            }
+    std::vector<megdnn::BiasMode> bias_mode = {
+            megdnn::BiasMode::BROADCAST_CHANNEL_BIAS};
+    if (no_bias) {
+        bias_mode.emplace_back(megdnn::BiasMode::NO_BIAS);
+    }
+    if (support_full_bias) {
+        bias_mode.emplace_back(megdnn::BiasMode::BIAS);
+    }
+    for (auto bias : bias_mode)
+        for (auto nlmode : nonlinemode)
+            for (size_t n : {1, 2})
+                for (size_t kernel : kernel_vec)
+                    for (size_t oc : {4, 12, 32})
+                        for (size_t ic : {1, 3, 4, 12, 32})
+                            for (size_t h : {3, 5, 12})
+                                for (size_t w : {7, 16, 23}) {
+                                    for (size_t group = 1;
+                                         group <= std::min(oc, ic); ++group) {
+                                        pack(n, oc, ic, h, w, kernel, stride,
+                                             group, nlmode, bias);
+                                    }
+                                }
     return args;
 }
@@ -325,6 +337,13 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_SMALL_GROUP) {
             get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 1, false, false, false),
             handle(), "F32DIRECT_SMALL_GROUP");
 }
+
+TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_NCHW44_S2) {
+    check_conv_bias(get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false,
+                                              false, false, true),
+                    handle(), "F32_CONV_NCHW44_DIRECT_S2");
+}
+
 TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_DIRECT_FP32_STR1_LARGE_GROUP) {
     check_conv_bias(get_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
                     handle(), "F32STRD1_LARGE_GROUP");
```