Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
ed7fa104
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
ed7fa104
编写于
4月 07, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(fallback): move direct multi_thread_common helper to fallback
GitOrigin-RevId: 27ed93e4c1d56d550c006a470bb4c95ee5ff2032
上级
8871ad74
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
122 addition
and
91 deletion
+122
-91
dnn/src/aarch64/conv_bias/fp16/algos.cpp
dnn/src/aarch64/conv_bias/fp16/algos.cpp
+7
-7
dnn/src/aarch64/conv_bias/fp32/algos.cpp
dnn/src/aarch64/conv_bias/fp32/algos.cpp
+9
-8
dnn/src/arm_common/conv_bias/f16/algos.cpp
dnn/src/arm_common/conv_bias/f16/algos.cpp
+37
-30
dnn/src/arm_common/conv_bias/fp32/algos.cpp
dnn/src/arm_common/conv_bias/fp32/algos.cpp
+45
-33
dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
dnn/src/fallback/conv_bias/direct/multi_thread_common.cpp
+8
-6
dnn/src/fallback/conv_bias/direct/multi_thread_common.h
dnn/src/fallback/conv_bias/direct/multi_thread_common.h
+13
-4
dnn/src/fallback/general_intrinsic/gi_float.h
dnn/src/fallback/general_intrinsic/gi_float.h
+2
-2
dnn/src/fallback/general_intrinsic/gi_int.h
dnn/src/fallback/general_intrinsic/gi_int.h
+1
-1
未找到文件。
dnn/src/aarch64/conv_bias/fp16/algos.cpp
浏览文件 @
ed7fa104
...
...
@@ -12,8 +12,8 @@
#include "src/aarch64/conv_bias/fp16/algos.h"
#include "src/aarch64/conv_bias/fp16/stride2_kern.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/fallback/conv_bias/direct/multi_thread_common.h"
using
namespace
megdnn
;
using
namespace
aarch64
;
...
...
@@ -43,7 +43,7 @@ size_t ConvBiasImpl::AlgoF16DirectStride2::get_workspace(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_aarch64_conv_bias_stride2_conv2357_fp16
,
0
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
wbundle
=
arm_common
::
MultithreadDirectConvCommon
<
auto
wbundle
=
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
return
wbundle
.
total_size_in_bytes
();
}
...
...
@@ -83,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
conv
=
fp16
::
conv_stride2
::
do_conv_7x7_stride2
;
}
WorkspaceBundle
bundle
=
arm_common
::
MultithreadDirectConvCommon
<
WorkspaceBundle
bundle
=
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
...
...
@@ -98,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
size_t
OC
=
fm
.
ocpg
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
arm_common
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
arm_common
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv
,
{
ncb_index
.
thread_id
,
0
,
oc
});
...
...
@@ -116,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
arm_common
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
...
...
@@ -125,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride2::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
arm_common
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv
,
ncb_index
.
ndrange_id
);
};
...
...
dnn/src/aarch64/conv_bias/fp32/algos.cpp
浏览文件 @
ed7fa104
...
...
@@ -11,9 +11,9 @@
#include "src/aarch64/conv_bias/fp32/algos.h"
#include "src/aarch64/conv_bias/fp32/stride2_kern.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/direct/multi_thread_common.h"
#include "midout.h"
...
...
@@ -42,8 +42,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_aarch64_conv_bias_stride2_conv2357_fp32
,
0
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
wbundle
=
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
auto
wbundle
=
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
return
wbundle
.
total_size_in_bytes
();
}
MIDOUT_END
();
...
...
@@ -82,7 +83,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
}
WorkspaceBundle
bundle
=
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
...
...
@@ -97,13 +98,13 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
size_t
OC
=
fm
.
ocpg
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv
,
{
ncb_index
.
thread_id
,
0
,
oc
});
...
...
@@ -115,7 +116,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
...
...
@@ -124,7 +125,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
do_conv
,
{
group
,
N
,
OC
}});
...
...
dnn/src/arm_common/conv_bias/f16/algos.cpp
浏览文件 @
ed7fa104
...
...
@@ -10,7 +10,6 @@
*/
#include "src/arm_common/conv_bias/f16/algos.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/f16/direct.h"
#include "src/arm_common/conv_bias/f16/do_conv_stride1.h"
#include "src/arm_common/conv_bias/f16/strategy.h"
...
...
@@ -18,6 +17,7 @@
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/direct/multi_thread_common.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "midout.h"
MIDOUT_DECL
(
megdnn_arm_common_winograd_fp16
)
...
...
@@ -187,8 +187,9 @@ bool ConvBiasImpl::AlgoF16Direct::usable(
size_t
ConvBiasImpl
::
AlgoF16Direct
::
get_workspace
(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_arm_common_conv_bias_fp16_kimpl
,
0
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
wbundle
=
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle
(
param
,
large_group
);
auto
wbundle
=
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle
(
param
,
large_group
);
return
wbundle
.
total_size_in_bytes
();
}
MIDOUT_END
();
...
...
@@ -204,7 +205,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
size_t
group
=
fm
.
group
;
bool
large_group
=
group
>=
param
.
nr_threads
;
WorkspaceBundle
bundle
=
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle
(
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
//! When group >= nr_threads, treat it as large_group, each thread process
...
...
@@ -220,17 +221,20 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
bundle
.
set
(
kern_param
.
workspace_ptr
);
if
(
fm
.
should_flip
)
{
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
oc
});
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
}
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern
(
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern
(
bundle
,
kern_param
,
ncb_index
,
fp16
::
conv_bias
::
kern_direct_f16
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
...
...
@@ -242,8 +246,9 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
weight_flip
,
{
group
,
1
_z
,
OC
}});
}
...
...
@@ -251,15 +256,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16Direct::get_kimpls(
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
copy_padding
,
{
group
,
N
,
IC
}});
auto
do_conv
=
[
bundle
](
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern
(
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern
(
bundle
,
kern_param
,
ncb_index
,
fp16
::
conv_bias
::
kern_direct_f16
,
ncb_index
.
ndrange_id
);
};
...
...
@@ -324,9 +330,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
}
SWITCH_KERN
();
WorkspaceBundle
bundle
=
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
WorkspaceBundle
bundle
=
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
//! When group >= nr_threads, treat it as large_group, each thread process
//! one group for better performance
...
...
@@ -340,15 +345,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
size_t
OC
=
fm
.
ocpg
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
};
ret_kerns
.
push_back
({
exec_one_group
,
{
group
,
N
,
1
_z
}});
...
...
@@ -357,17 +363,19 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF16DirectStride1::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
copy_padding
,
{
group
,
N
,
IC
}});
auto
do_conv
=
[
bundle
,
conv_kern_function
](
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
do_conv
,
{
group
,
N
,
OC
}});
}
...
...
@@ -378,9 +386,8 @@ size_t ConvBiasImpl::AlgoF16DirectStride1::get_workspace(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_arm_common_conv_bias_fp16_kimpl
,
1
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
bundle
=
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
auto
bundle
=
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>::
get_bundle_stride
(
param
,
large_group
);
return
bundle
.
total_size_in_bytes
();
}
MIDOUT_END
();
...
...
dnn/src/arm_common/conv_bias/fp32/algos.cpp
浏览文件 @
ed7fa104
...
...
@@ -11,7 +11,6 @@
*/
#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/fp32/direct.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
...
...
@@ -20,6 +19,7 @@
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/direct/multi_thread_common.h"
#include "midout.h"
...
...
@@ -343,7 +343,7 @@ bool ConvBiasImpl::AlgoF32Direct::usable(
size_t
ConvBiasImpl
::
AlgoF32Direct
::
get_workspace
(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_arm_common_conv_bias_f32_kimpl
,
0
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
wbundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle
(
auto
wbundle
=
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle
(
param
,
large_group
);
return
wbundle
.
total_size_in_bytes
();
}
...
...
@@ -359,7 +359,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
size_t
group
=
fm
.
group
;
bool
large_group
=
group
>=
param
.
nr_threads
;
WorkspaceBundle
bundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle
(
param
,
large_group
);
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
//! When group >= nr_threads, treat it as large_group, each thread process
//! one group for better performance
...
...
@@ -374,17 +375,18 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
bundle
.
set
(
kern_param
.
workspace_ptr
);
if
(
fm
.
should_flip
)
{
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
oc
});
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
}
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern
(
bundle
,
kern_param
,
ncb_index
,
fp32
::
conv_bias
::
kern_direct
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
...
...
@@ -396,7 +398,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
weight_flip_kern
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
weight_flip_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
weight_flip
,
{
group
,
1
_z
,
OC
}});
...
...
@@ -405,7 +407,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
copy_padding
,
{
group
,
N
,
IC
}});
...
...
@@ -413,7 +415,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern
(
bundle
,
kern_param
,
ncb_index
,
fp32
::
conv_bias
::
kern_direct
,
ncb_index
.
ndrange_id
);
};
...
...
@@ -452,8 +454,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_arm_common_conv_bias_f32_kimpl
,
1
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
bundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
auto
bundle
=
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
return
bundle
.
total_size_in_bytes
();
}
MIDOUT_END
();
...
...
@@ -492,7 +495,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
SWITCH_KERN_STR1
();
WorkspaceBundle
bundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
//! When group >= nr_threads, treat it as large_group, each thread process
...
...
@@ -507,13 +510,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
size_t
OC
=
fm
.
ocpg
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
};
ret_kerns
.
push_back
({
exec_one_group
,
{
group
,
N
,
1
_z
}});
...
...
@@ -522,15 +528,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride1::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
copy_padding
,
{
group
,
N
,
IC
}});
auto
do_conv
=
[
bundle
,
conv_kern_function
](
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
ncb_index
.
ndrange_id
);
};
...
...
@@ -570,8 +577,9 @@ size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
const
NCBKernSizeParam
&
param
)
const
{
MIDOUT_BEGIN
(
megdnn_arm_common_conv_bias_f32_kimpl
,
2
,
1
)
{
bool
large_group
=
param
.
filter_meta
.
group
>=
param
.
nr_threads
;
auto
bundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
auto
bundle
=
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
return
bundle
.
total_size_in_bytes
();
}
MIDOUT_END
();
...
...
@@ -609,7 +617,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
SWITCH_KERN_STR2
();
WorkspaceBundle
bundle
=
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
get_bundle_stride
(
param
,
large_group
);
SmallVector
<
NCBKern
>
ret_kerns
;
//! When group >= nr_threads, treat it as large_group, each thread process
...
...
@@ -624,13 +632,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
size_t
OC
=
fm
.
ocpg
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
for
(
size_t
ic
=
0
;
ic
<
IC
;
ic
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
{
ncb_index
.
thread_id
,
0
,
ic
});
}
for
(
size_t
oc
=
0
;
oc
<
OC
;
oc
++
)
{
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
{
ncb_index
.
thread_id
,
0
,
oc
});
}
};
ret_kerns
.
push_back
({
exec_one_group
,
{
group
,
N
,
1
_z
}});
...
...
@@ -639,15 +650,16 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectStride2::get_kimpl
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
copy_padding_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
ncb_index
.
ndrange_id
);
};
ret_kerns
.
push_back
({
copy_padding
,
{
group
,
N
,
IC
}});
auto
do_conv
=
[
bundle
,
conv_kern_function
](
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
mutable
{
bundle
.
set
(
kern_param
.
workspace_ptr
);
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>::
do_conv_kern_stride
(
bundle
,
kern_param
,
ncb_index
,
conv_kern_function
,
ncb_index
.
ndrange_id
);
};
...
...
dnn/src/
arm_common
/conv_bias/direct/multi_thread_common.cpp
→
dnn/src/
fallback
/conv_bias/direct/multi_thread_common.cpp
浏览文件 @
ed7fa104
/**
* \file dnn/src/
arm_common
/conv_bias/direct/multi_thread_common.cpp
* \file dnn/src/
fallback
/conv_bias/direct/multi_thread_common.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
...
...
@@ -9,12 +9,14 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "multi_thread_common.h"
#include "src/fallback/matrix_mul/opr_impl.h"
using
namespace
megdnn
;
using
namespace
arm_common
;
using
namespace
fallback
;
#if MEGDNN_X86
using
namespace
x86
;
#endif
namespace
{
bool
need_dst_copy
(
const
megdnn
::
fallback
::
ConvBiasImpl
::
NCBKernSizeParam
&
param
)
{
...
...
@@ -354,8 +356,8 @@ void MultithreadDirectConvCommon<io_ctype, compute_ctype>::do_conv_kern_stride(
kern_param
.
nonlineMode
,
kern_param
.
bias_type
,
kern_param
.
dst_type
,
1
_z
,
1
_z
,
OH
,
OW
);
};
template
class
megdnn
::
arm_common
::
MultithreadDirectConvCommon
<
float
,
float
>;
template
class
megdnn
::
fallback
::
MultithreadDirectConvCommon
<
float
,
float
>;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template
class
megdnn
::
arm_common
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>;
template
class
megdnn
::
fallback
::
MultithreadDirectConvCommon
<
dt_float16
,
__fp16
>;
#endif
// vim: syntax=cpp.doxygen
dnn/src/
arm_common
/conv_bias/direct/multi_thread_common.h
→
dnn/src/
fallback
/conv_bias/direct/multi_thread_common.h
浏览文件 @
ed7fa104
/**
* \file dnn/src/
arm_common
/conv_bias/direct/multi_thread_common.h
* \file dnn/src/
fallback
/conv_bias/direct/multi_thread_common.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
...
...
@@ -10,11 +10,20 @@
*/
#pragma once
#include "src/
arm_common
/conv_bias/opr_impl.h"
#include "src/
fallback
/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#elif (MEGDNN_ARMV7 || MEGDNN_AARCH64)
#include "src/arm_common/conv_bias/postprocess_helper.h"
#else
//! TODO: optimize common postprocess_helper with general intrinsic
#include "src/common/postprocess_helper.h"
#endif
namespace
megdnn
{
namespace
arm_common
{
namespace
fallback
{
template
<
class
io_ctype
,
class
compute_ctype
>
class
MultithreadDirectConvCommon
{
...
...
@@ -53,7 +62,7 @@ public:
const
CpuNDRange
&
workspace_ids
);
};
}
// namespace
arm_common
}
// namespace
fallback
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/fallback/general_intrinsic/gi_float.h
浏览文件 @
ed7fa104
...
...
@@ -42,7 +42,7 @@ GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
#elif defined(GI_SSE2_INTRINSICS)
return
_mm_castsi128_ps
(
Vector
);
#else
return
(
GI_FLOAT32_t
)
In
;
return
(
GI_FLOAT32_t
)
Vector
;
#endif
}
...
...
@@ -53,7 +53,7 @@ GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
#elif defined(GI_SSE2_INTRINSICS)
return
_mm_castsi128_ps
(
Vector
);
#else
return
(
GI_FLOAT32_t
)
In
;
return
(
GI_FLOAT32_t
)
Vector
;
#endif
}
...
...
dnn/src/fallback/general_intrinsic/gi_int.h
浏览文件 @
ed7fa104
/**
* \file dnn/src/fallback/general_intrinsic/gi_
floa
t.h
* \file dnn/src/fallback/general_intrinsic/gi_
in
t.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录