Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
d968942f
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
d968942f
编写于
4月 05, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
perf(cuda): speedup direct large kernel conv
GitOrigin-RevId: 3ff6a9caebbd1dc4c5c1c23b51945f7574f186ca
上级
b2cffdde
变更
2
展开全部
显示空白变更内容
内联
并排
Showing
2 changed files
with
611 additions
and
176 deletions
+611
-176
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
+21
-5
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
...c/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
+590
-171
未找到文件。
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
浏览文件 @
d968942f
...
@@ -59,14 +59,15 @@ struct ConvTraitInner {
...
@@ -59,14 +59,15 @@ struct ConvTraitInner {
static
int
const
smem_src_h
=
static
int
const
smem_src_h
=
(
OutTileConfig
::
block_h
-
1
)
*
stride_h
+
FilterTileConfig
::
unroll_h
;
(
OutTileConfig
::
block_h
-
1
)
*
stride_h
+
FilterTileConfig
::
unroll_h
;
static
int
const
smem_buff_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_buff_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_load_h
=
smem_src_h
+
smem_buff_h
;
static
int
const
smem_load_h
=
smem_src_h
+
smem_buff_h
*
FilterTileConfig
::
unroll_w
*
ThreadConfig
::
thread_x
;
static
int
const
smem_h
=
smem_load_h
+
smem_buff_h
;
static
int
const
smem_h
=
smem_load_h
+
smem_buff_h
;
static
int
const
smem_w
=
static
int
const
smem_w
=
DIVUP
((
OutTileConfig
::
block_w
-
1
)
*
stride_w
+
DIVUP
((
OutTileConfig
::
block_w
-
1
)
*
stride_w
+
FilterTileConfig
::
unroll_w
*
ThreadConfig
::
thread_x
,
FilterTileConfig
::
unroll_w
*
ThreadConfig
::
thread_x
,
2
)
*
2
)
*
2
;
2
;
static
int
const
smem_size
=
smem_h
*
smem_w
;
static
int
const
load_w
=
static
int
const
load_w
=
smem_w
>
ThreadConfig
::
nr_threads
?
ThreadConfig
::
nr_threads
:
smem_w
;
smem_w
>
ThreadConfig
::
nr_threads
?
ThreadConfig
::
nr_threads
:
smem_w
;
static
int
const
load_h
=
1
;
static
int
const
load_h
=
1
;
...
@@ -74,21 +75,36 @@ struct ConvTraitInner {
...
@@ -74,21 +75,36 @@ struct ConvTraitInner {
static
int
const
reg_w
=
DIVUP
(
smem_w
,
load_w
);
static
int
const
reg_w
=
DIVUP
(
smem_w
,
load_w
);
static
bool
constexpr
check_bounds_h
=
smem_h
%
load_h
!=
0
;
static
bool
constexpr
check_bounds_h
=
smem_h
%
load_h
!=
0
;
static
bool
constexpr
check_bounds_w
=
smem_w
%
load_w
!=
0
;
static
bool
constexpr
check_bounds_w
=
smem_w
%
load_w
!=
0
;
// to avoid bank conflict, every bank_offset_line in 8 lines, add one offset
static
int
const
bank_w
=
smem_w
/
(
4
/
sizeof
(
CompType
));
static
int
const
bank_offset_line
=
(
bank_w
%
32
==
0
||
bank_w
%
FilterTileConfig
::
unroll_w
==
0
)
?
1
:
(
bank_w
%
16
==
0
?
2
:
4
);
static
int
const
smem_size
=
smem_h
*
smem_w
+
DIVUP
(
smem_h
,
bank_offset_line
)
*
(
4
/
sizeof
(
CompType
));
};
};
struct
FilterTileCount
{
struct
FilterTileCount
{
static
int
const
smem_flt_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_flt_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_buff_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_buff_h
=
FilterTileConfig
::
unroll_h
;
static
int
const
smem_load_h
=
smem_flt_h
+
smem_buff_h
;
static
int
const
smem_h
=
smem_load_h
+
smem_buff_h
;
static
int
const
smem_w
=
FilterTileConfig
::
unroll_w
*
ThreadConfig
::
thread_x
;
static
int
const
smem_w
=
FilterTileConfig
::
unroll_w
*
ThreadConfig
::
thread_x
;
static
int
const
smem_size
=
smem_h
*
smem_w
;
static
int
const
smem_load_h
=
smem_flt_h
+
smem_buff_h
*
smem_w
;
static
int
const
smem_h
=
smem_load_h
+
smem_buff_h
;
static
int
const
load_w
=
smem_w
>
32
?
32
:
smem_w
;
static
int
const
load_w
=
smem_w
>
32
?
32
:
smem_w
;
static
int
const
load_h
=
ThreadConfig
::
nr_threads
/
load_w
;
static
int
const
load_h
=
ThreadConfig
::
nr_threads
/
load_w
;
static
int
const
reg_h
=
1
;
static
int
const
reg_h
=
1
;
static
int
const
reg_w
=
DIVUP
(
smem_w
,
load_w
);
static
int
const
reg_w
=
DIVUP
(
smem_w
,
load_w
);
static
bool
constexpr
check_bounds_h
=
smem_h
%
load_h
!=
0
;
static
bool
constexpr
check_bounds_h
=
smem_h
%
load_h
!=
0
;
static
bool
constexpr
check_bounds_w
=
smem_w
%
load_w
!=
0
;
static
bool
constexpr
check_bounds_w
=
smem_w
%
load_w
!=
0
;
// to avoid bank conflict, every bank_offset_line in 8 lines, add one offset
static
int
const
bank_w
=
smem_w
/
(
4
/
sizeof
(
CompType
));
static
int
const
bank_offset_line
=
(
bank_w
%
32
==
0
||
bank_w
%
FilterTileConfig
::
unroll_w
==
0
)
?
1
:
(
bank_w
%
16
==
0
?
2
:
4
);
static
int
const
smem_size
=
smem_h
*
smem_w
+
DIVUP
(
smem_h
,
bank_offset_line
)
*
(
4
/
sizeof
(
CompType
));
};
};
};
};
...
...
dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
浏览文件 @
d968942f
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录