Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
4e9b1c4e
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
4e9b1c4e
编写于
9月 21, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn): add rrconv wgrad, support int32 and uint8 region mask
GitOrigin-RevId: 0da9b3bca86ba6ae11289e451b224aca348f647b
上级
9e020d23
变更
14
显示空白变更内容
内联
并排
Showing
14 changed file
with
1059 addition
and
6 deletion
+1059
-6
dnn/scripts/cutlass_generator/BUILD
dnn/scripts/cutlass_generator/BUILD
+1
-0
dnn/scripts/cutlass_generator/conv2d_operation.py
dnn/scripts/cutlass_generator/conv2d_operation.py
+221
-1
dnn/scripts/cutlass_generator/gen_list.py
dnn/scripts/cutlass_generator/gen_list.py
+1
-0
dnn/scripts/cutlass_generator/generator.py
dnn/scripts/cutlass_generator/generator.py
+263
-2
dnn/scripts/cutlass_generator/library.py
dnn/scripts/cutlass_generator/library.py
+7
-0
dnn/scripts/cutlass_generator/list.bzl
dnn/scripts/cutlass_generator/list.bzl
+0
-0
dnn/src/CMakeLists.txt
dnn/src/CMakeLists.txt
+1
-0
dnn/src/cuda/cutlass/convolution_operation.h
dnn/src/cuda/cutlass/convolution_operation.h
+80
-0
dnn/src/cuda/cutlass/initialize_all.cu
dnn/src/cuda/cutlass/initialize_all.cu
+2
-0
dnn/src/cuda/cutlass/library.h
dnn/src/cuda/cutlass/library.h
+8
-0
dnn/src/cuda/cutlass/operation_table.cpp
dnn/src/cuda/cutlass/operation_table.cpp
+5
-0
dnn/src/cuda/cutlass/operation_table.h
dnn/src/cuda/cutlass/operation_table.h
+18
-2
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
+133
-1
dnn/test/cuda/region_restricted_convolution.cpp
dnn/test/cuda/region_restricted_convolution.cpp
+319
-0
未找到文件。
dnn/scripts/cutlass_generator/BUILD
浏览文件 @
4e9b1c4e
...
...
@@ -19,6 +19,7 @@ genrule(
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D)
CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations rrconv2d_wgrad --type simt $(@D)
"""
,
tools
=
[
"//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"
],
visibility
=
[
"//visibility:public"
],
...
...
dnn/scripts/cutlass_generator/conv2d_operation.py
浏览文件 @
4e9b1c4e
...
...
@@ -35,6 +35,8 @@ class Conv2dOperation:
without_shared_load
=
False
,
required_cuda_ver_major
=
9
,
required_cuda_ver_minor
=
2
,
rin
=
None
,
rout
=
None
,
):
self
.
operation_kind
=
OperationKind
.
Conv2d
...
...
@@ -54,6 +56,8 @@ class Conv2dOperation:
self
.
without_shared_load
=
without_shared_load
self
.
required_cuda_ver_major
=
required_cuda_ver_major
self
.
required_cuda_ver_minor
=
required_cuda_ver_minor
self
.
rin
=
rin
self
.
rout
=
rout
#
def
accumulator_type
(
self
):
...
...
@@ -95,6 +99,8 @@ class Conv2dOperation:
conv_type_name
=
""
if
self
.
conv_type
==
ConvType
.
DepthwiseConvolution
:
conv_type_name
=
"dw"
elif
self
.
conv_type
==
ConvType
.
RegionRestrictedConvolution
:
conv_type_name
=
"rr"
return
"%s%s%s%s%s%s%s_%s"
%
(
ShortDataTypeNames
[
self
.
accumulator_type
()],
...
...
@@ -125,6 +131,9 @@ class Conv2dOperation:
elif
self
.
src
.
element
==
self
.
flt
.
element
:
extended_name
=
"${core_name}_${element_src}"
if
self
.
rin
!=
None
:
extended_name
+=
"_${element_rin}"
extended_name
=
SubstituteTemplate
(
extended_name
,
{
...
...
@@ -132,6 +141,7 @@ class Conv2dOperation:
"element_flt"
:
DataTypeNames
[
self
.
flt
.
element
],
"element_dst"
:
DataTypeNames
[
self
.
dst
.
element
],
"core_name"
:
self
.
core_name
(),
"element_rin"
:
DataTypeNames
[
self
.
rin
.
element
],
},
)
...
...
@@ -512,6 +522,115 @@ using Convolution_${operation_name} =
return
SubstituteTemplate
(
self
.
template
,
values
)
class EmitRegionRestrictedConvolutionBackwardFilterInstance:
    """Renders the C++ type alias for one region-restricted wgrad kernel.

    ``emit`` fills ``self.template`` with values read from a convolution
    operation description. Besides src/diff/grad, the template takes the two
    mask tensors, which are read from ``operation.rin`` (source mask) and
    ``operation.rout`` (output mask).
    """

    def __init__(self):
        # Placeholders are written as ${name}; emit() supplies every one of them.
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Convolution_${operation_name} =
typename cutlass::conv::device::RegionRestrictedConvolutionBackwardFilter<
${element_src},
${layout_src},
${element_diff},
${layout_diff},
${element_src_mask},
${layout_src_mask},
${element_output_mask},
${layout_output_mask},
${element_grad},
${layout_grad},
${element_accumulator},
${conv_type},
${opcode_class},
${arch},
cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
${epilogue_functor}<
${element_grad},
${epilogue_vector_length},
${element_accumulator},
${element_epilogue}
>,
${swizzling_functor},
${stages},
${alignment_src},
${alignment_diff},
${alignment_src_mask},
${alignment_output_mask},
${special_optimization},
${math_operator},
${implicit_gemm_mode}>;
"""

    def emit(self, operation):
        """Return the kernel-instance source text for ``operation``.

        :param operation: operation description providing ``src``, ``flt``,
            ``dst``, ``rin``, ``rout``, ``tile_description`` and the other
            attributes read below.
        :return: ``self.template`` with every placeholder substituted.
        """
        tile = operation.tile_description
        instruction = tile.math_instruction

        # Per-warp tile = threadblock tile divided by the warp count on each axis.
        warp_shape = [
            int(tile.threadblock_shape[axis] / tile.warp_count[axis])
            for axis in range(3)
        ]

        # Epilogue vector width in elements, capped at 128 bits.
        grad_bits = DataTypeSize[operation.dst.element]
        epilogue_vector_length = int(
            min(operation.dst.alignment * grad_bits, 128) / grad_bits
        )

        values = {
            "operation_name": operation.procedural_name(),
            "conv_type": ConvTypeTag[operation.conv_type],
        }

        # (template stem, tensor description); the wgrad template calls the
        # filter operand "diff" and the destination "grad".
        tensors = (
            ("src", operation.src),
            ("diff", operation.flt),
            ("src_mask", operation.rin),
            ("output_mask", operation.rout),
            ("grad", operation.dst),
        )
        for stem, tensor in tensors:
            values["element_" + stem] = DataTypeTag[tensor.element]
            values["layout_" + stem] = LayoutTag[tensor.layout]

        values["element_accumulator"] = DataTypeTag[operation.accumulator_type()]
        values["opcode_class"] = OpcodeClassTag[instruction.opcode_class]
        values["arch"] = "cutlass::arch::Sm%d" % operation.arch

        for axis, suffix in enumerate("mnk"):
            values["threadblock_shape_" + suffix] = str(tile.threadblock_shape[axis])
            values["warp_shape_" + suffix] = str(warp_shape[axis])
            values["instruction_shape_" + suffix] = str(
                instruction.instruction_shape[axis]
            )

        values["epilogue_vector_length"] = str(epilogue_vector_length)
        values["epilogue_functor"] = EpilogueFunctorTag[operation.epilogue_functor]
        values["element_epilogue"] = str(DataTypeTag[operation.element_epilogue])
        values["swizzling_functor"] = SwizzlingFunctorTag[operation.swizzling_functor]
        values["stages"] = str(tile.stages)

        # The template has no alignment slot for the grad tensor.
        values["alignment_src"] = str(operation.src.alignment)
        values["alignment_diff"] = str(operation.flt.alignment)
        values["alignment_src_mask"] = str(operation.rin.alignment)
        values["alignment_output_mask"] = str(operation.rout.alignment)

        values["special_optimization"] = SpecialOptimizeDescTag[
            operation.special_optimization
        ]
        values["math_operator"] = MathOperationTag[instruction.math_operation]
        values["implicit_gemm_mode"] = ImplicitGemmModeTag[operation.implicit_gemm_mode]

        return SubstituteTemplate(self.template, values)
###################################################################################################
#
# Generator functions for all layouts
...
...
@@ -540,7 +659,10 @@ def GenerateConv2d(
operations
=
[]
element_epilogue
=
DataType
.
f32
if
conv_type
==
ConvType
.
DepthwiseConvolution
:
if
(
conv_type
==
ConvType
.
DepthwiseConvolution
or
conv_type
==
ConvType
.
RegionRestrictedConvolution
):
if
conv_kind
==
ConvKind
.
Fprop
:
swizzling_functor
=
SwizzlingFunctor
.
DepthwiseConvolutionFprop
elif
conv_kind
==
ConvKind
.
Dgrad
:
...
...
@@ -680,6 +802,16 @@ def GenerateConv2d(
flt_layout
,
int
(
flt_align
/
DataTypeSize
[
tile
.
math_instruction
.
element_a
]),
)
rin
=
TensorDescription
(
tile
.
math_instruction
.
element_rin
,
src_layout
,
int
(
src_align
/
DataTypeSize
[
tile
.
math_instruction
.
element_rin
]),
)
rout
=
TensorDescription
(
tile
.
math_instruction
.
element_rout
,
dst_layout
,
int
(
dst_align
/
DataTypeSize
[
tile
.
math_instruction
.
element_rout
]),
)
bias
=
TensorDescription
(
bias_type
,
dst_layout
,
max
(
1
,
int
(
32
/
DataTypeSize
[
bias_type
]))
)
...
...
@@ -704,6 +836,8 @@ def GenerateConv2d(
without_shared_load
,
required_cuda_ver_major
,
required_cuda_ver_minor
,
rin
,
rout
,
)
operations
.
append
(
new_operation
)
if
use_special_optimization
!=
SpecialOptimizeDesc
.
NoneSpecialOpt
:
...
...
@@ -724,6 +858,8 @@ def GenerateConv2d(
without_shared_load
,
required_cuda_ver_major
,
required_cuda_ver_minor
,
rin
,
rout
,
)
operations
.
append
(
new_operation
)
return
operations
...
...
@@ -955,5 +1091,89 @@ void initialize_${operation_name}(Manifest &manifest) {
self
.
kernel_file
.
close
()
class EmitRegionRestrictedConvSingleKernelWrapper:
    """Context manager that writes one region-restricted conv kernel file.

    On entry a ``.cu`` file is opened under ``kernel_path``; ``emit`` writes
    the kernel instance followed by its manifest registration helper; on exit
    the file is closed. ``header_template`` and ``epilogue_template`` are only
    stored on the instance here; this class never writes them itself.
    """

    def __init__(self, kernel_path, operation, short_path=False):
        """
        :param kernel_path: directory the kernel file is created in.
        :param operation: operation description; its ``conv_kind`` must be
            ``ConvKind.Wgrad``.
        :param short_path: when true the file is named after the running
            ``GlobalCnt.cnt`` counter instead of the operation name.
        """
        self.kernel_path = kernel_path
        self.operation = operation
        self.short_path = short_path

        # Now only support wgrad
        assert self.operation.conv_kind == ConvKind.Wgrad
        self.instance_emitter = EmitRegionRestrictedConvolutionBackwardFilterInstance()
        self.convolution_name = "RegionRestrictedConvolutionBackwardFilterOperation"

        self.header_template = """
#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "cutlass/convolution/device/convolution.h"
#include "src/cuda/cutlass/manifest.h"
#include "src/cuda/cutlass/convolution_operation.h"
"""
        self.instance_template = """
${operation_instance}
"""
        self.manifest_template = """
namespace cutlass {
namespace library {
void initialize_${operation_name}(Manifest &manifest) {
manifest.append(new ${convolution_name}<Convolution_${operation_name}>(
"${operation_name}"
));
}
} // namespace library
} // namespace cutlass
"""
        self.epilogue_template = """
#pragma GCC diagnostic pop
#endif
"""

    def __enter__(self):
        """Pick the output file name, open it for writing and return self."""
        if self.short_path:
            # Short names come from the counter, which is advanced once per file.
            file_stem = GlobalCnt.cnt
            GlobalCnt.cnt += 1
        else:
            file_stem = self.operation.procedural_name()
        self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % file_stem)
        self.kernel_file = open(self.kernel_path, "w")
        return self

    def emit(self):
        """Write the kernel instance and then the manifest helper."""
        instance_text = SubstituteTemplate(
            self.instance_template,
            {"operation_instance": self.instance_emitter.emit(self.operation)},
        )
        self.kernel_file.write(instance_text)

        # emit manifest helper
        manifest_text = SubstituteTemplate(
            self.manifest_template,
            {
                "operation_name": self.operation.procedural_name(),
                "convolution_name": self.convolution_name,
            },
        )
        self.kernel_file.write(manifest_text)

    def __exit__(self, exception_type, exception_value, traceback):
        """Close the kernel file; exceptions are not suppressed."""
        self.kernel_file.close()
###################################################################################################
###################################################################################################
dnn/scripts/cutlass_generator/gen_list.py
浏览文件 @
4e9b1c4e
...
...
@@ -64,4 +64,5 @@ if __name__ == "__main__":
write_merge_file_name
(
f
,
"dwconv2d_dgrad"
,
"tensorop884"
,
4
)
write_merge_file_name
(
f
,
"dwconv2d_wgrad"
,
"simt"
,
2
)
write_merge_file_name
(
f
,
"dwconv2d_wgrad"
,
"tensorop884"
,
4
)
write_merge_file_name
(
f
,
"rrconv2d_wgrad"
,
"simt"
,
2
)
f
.
write
(
"]"
)
dnn/scripts/cutlass_generator/generator.py
浏览文件 @
4e9b1c4e
...
...
@@ -1260,6 +1260,218 @@ def GenerateDwconv2d_Simt(args, conv_kind):
return
operations
def GenerateRegionRestrictedconv2d_Simt(args, conv_kind):
    """Build the SIMT region-restricted conv2d operations for ``conv_kind``.

    The function enumerates (warps per threadblock, warp shape) combinations
    from the module-level tuning tables, drops combinations that violate the
    shape, thread-tile and shared-memory limits below, and keeps only tiles
    that coincide with one of the hand-picked tile descriptions. Every
    surviving tile is handed to ``GenerateConv2d`` with
    ``ConvType.RegionRestrictedConvolution``.

    :param args: parsed command-line arguments (not read by this function).
    :param conv_kind: ``ConvKind`` member; selects the alignment constraints
        and the implicit GEMM mode.
    :return: list of generated operations.
    """
    warp_size = 32

    def _same_tile(lhs, rhs):
        # Tiles coincide when threadblock shape, warp count and stages agree.
        return (
            all(lhs.threadblock_shape[i] == rhs.threadblock_shape[i] for i in range(3))
            and all(lhs.warp_count[i] == rhs.warp_count[i] for i in range(3))
            and lhs.stages == rhs.stages
        )

    ################################################################################
    # warps per threadblock
    ################################################################################
    warpsPerThreadblocks = [
        [warps_m, warps_n]
        for warps_m in warpsPerThreadblockEdge
        for warps_n in warpsPerThreadblockEdge
        if (
            warps_m / warps_n <= warpsPerThreadblockRatio
            and warps_n / warps_m <= warpsPerThreadblockRatio
            and warps_m * warps_n <= warpsPerThreadblockMax
        )
    ]

    ################################################################################
    # warp shapes
    ################################################################################
    warpShapes = [
        [warp_m, warp_n]
        for warp_m in warpShapeEdges
        for warp_n in warpShapeEdges
        if (
            warp_m / warp_n <= warpShapeRatio
            and warp_n / warp_m <= warpShapeRatio
            and warp_m * warp_n <= warpShapeMax
            and warp_m * warp_n > warpShapeMin
        )
    ]

    # sgemm: only the bit width and the element budget of the entry are used.
    (
        _precision_type,
        precisionBits,
        threadblockMaxElements,
        _threadblock_tiles_l0,
    ) = precisions["s"]

    layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)]

    # One f32 instruction per mask element type (rin and rout share the type).
    # NOTE(review): the second variant uses DataType.s8 for the masks; confirm
    # this is the intended 8-bit mask type.
    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32,
            DataType.f32,
            DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
            mask_type,
            mask_type,
        )
        for mask_type in (DataType.s32, DataType.s8)
    ]

    min_cc = 50
    max_cc = 1024

    dst_layouts = [LayoutType.TensorNCHW]
    dst_types = [DataType.f32]

    alignment_constraints = [32] if conv_kind == ConvKind.Wgrad else [128, 32]
    gemm_mode = (
        ImplicitGemmMode.GemmNT
        if conv_kind == ConvKind.Wgrad
        else ImplicitGemmMode.GemmTN
    )

    # (threadblock shape, warp count) pairs that are allowed to be generated.
    allowed_tiles = (
        ([128, 128, 8], [4, 2, 1]),
        ([128, 64, 8], [2, 2, 1]),
        ([64, 128, 8], [2, 2, 1]),
        ([128, 32, 8], [2, 1, 1]),
        ([32, 128, 8], [1, 2, 1]),
        ([64, 64, 8], [2, 1, 1]),
        ([32, 64, 8], [1, 1, 1]),
        ([64, 32, 8], [1, 1, 1]),
        ([32, 32, 8], [1, 1, 1]),
    )

    operations = []
    for math_inst in math_instructions:
        tile_descriptions = [
            TileDescription(list(tb_shape), 1, list(warp_count), math_inst, min_cc, max_cc)
            for tb_shape, warp_count in allowed_tiles
        ]
        for warpsPerThreadblock in warpsPerThreadblocks:
            for warpShape in warpShapes:
                warpThreadsM = 8 if warpShape[0] > warpShape[1] else 4
                warpThreadsN = warp_size / warpThreadsM

                # skip shapes with conflicting rectangularity
                # they are unlikely to be fastest
                blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
                blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
                warpG = warpShape[0] > warpShape[1]
                warpL = warpShape[0] < warpShape[1]

                blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
                blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
                warpG2 = warpShape[0] > warpShape[1] * 2
                warpL2 = warpShape[0] * 2 < warpShape[1]

                if (
                    (blockG2 and warpL)
                    or (blockL2 and warpG)
                    or (warpG2 and blockL)
                    or (warpL2 and blockG)
                ):
                    continue

                # check threadblock ratios and max
                threadblockTile = [
                    warpShape[0] * warpsPerThreadblock[0],
                    warpShape[1] * warpsPerThreadblock[1],
                ]
                if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements:
                    continue
                if (
                    threadblockTile[0] > threadblockEdgeMax
                    or threadblockTile[1] > threadblockEdgeMax
                ):
                    continue

                totalThreads = (
                    warp_size * warpsPerThreadblock[0] * warpsPerThreadblock[1]
                )

                # calculate unroll
                # ensure that every iteration at least a full load of A,B are done
                unroll = max(
                    8,
                    totalThreads // threadblockTile[0],
                    totalThreads // threadblockTile[1],
                )

                threadTileM = warpShape[0] // warpThreadsM
                threadTileN = warpShape[1] // warpThreadsN
                if threadTileM < 2 or threadTileN < 2:
                    continue
                if threadTileM * threadTileN * precisionBits > 8 * 8 * 32:
                    continue

                # skip tiles whose N extent is below the warp size
                # (epilogue limitation)
                if threadblockTile[1] < warp_size:
                    continue

                # limit smem
                smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
                smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
                smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
                if smemKBytes > 48:
                    continue

                tile = TileDescription(
                    [threadblockTile[0], threadblockTile[1], unroll],
                    1,
                    [
                        threadblockTile[0] // warpShape[0],
                        threadblockTile[1] // warpShape[1],
                        1,
                    ],
                    math_inst,
                    min_cc,
                    max_cc,
                )

                # Only tiles from the allow-list above are generated.
                if not any(
                    candidate
                    for candidate in tile_descriptions
                    if _same_tile(candidate, tile)
                ):
                    continue

                for layout in layouts:
                    for dst_type, dst_layout in zip(dst_types, dst_layouts):
                        for alignment_src in alignment_constraints:
                            operations += GenerateConv2d(
                                ConvType.RegionRestrictedConvolution,
                                conv_kind,
                                [tile],
                                layout[0],
                                layout[1],
                                dst_layout,
                                dst_type,
                                min_cc,
                                alignment_src,
                                32,
                                32,
                                SpecialOptimizeDesc.NoneSpecialOpt,
                                gemm_mode,
                            )
    return operations
#
def
GenerateDwconv2d_TensorOp_884
(
args
,
conv_kind
):
layouts
=
[(
LayoutType
.
TensorNCHW
,
LayoutType
.
TensorNCHW
)]
...
...
@@ -1644,6 +1856,14 @@ def GenerateDwconv2dWgradOperations(args):
return
GenerateDwconv2d_TensorOp_884
(
args
,
ConvKind
.
Wgrad
)
def GenerateRegionRestrictedconv2dWgradOperations(args):
    """Generate the region-restricted conv2d wgrad operations.

    :param args: parsed command-line arguments; ``args.type`` must be
        ``"simt"``, the only kernel type generated for this operation.
    :return: the operations produced by ``GenerateRegionRestrictedconv2d_Simt``
        for ``ConvKind.Wgrad``.
    :raises AssertionError: if ``args.type`` is not ``"simt"``.
    """
    # The two literals are concatenated by the parser, so the separating
    # space has to live inside one of them.
    assert args.type == "simt", (
        "operation RegionRestrictedconv2d wgrad only support "
        "simt. (got:{})".format(args.type)
    )
    return GenerateRegionRestrictedconv2d_Simt(args, ConvKind.Wgrad)
def
GenerateGemmOperations
(
args
):
if
args
.
type
==
"tensorop884"
:
return
GeneratesGemm_TensorOp_884
(
args
)
...
...
@@ -1698,6 +1918,8 @@ def ConcatFile(
sub_string_1
=
sub_string_2
=
"simt"
if
"dwconv2d_"
in
operations
:
filtered_operations
=
operations
[:
2
]
+
operations
[
9
:]
if
"rrconv2d_"
in
operations
:
filtered_operations
=
operations
[:
2
]
+
operations
[
9
:]
elif
(
"conv2d"
in
operations
)
or
(
"deconv"
in
operations
):
filtered_operations
=
"cutlass"
else
:
...
...
@@ -1893,6 +2115,7 @@ if __name__ == "__main__":
"dwconv2d_fprop"
,
"dwconv2d_dgrad"
,
"dwconv2d_wgrad"
,
"rrconv2d_wgrad"
,
],
required
=
True
,
help
=
"Specifies the operation to generate (gemm, gemv, conv2d, deconv, dwconv2d_fprop, dwconv2d_dgrad, dwconv2d_wgrad)"
,
...
...
@@ -1928,9 +2151,11 @@ if __name__ == "__main__":
operations
=
GenerateDwconv2dFpropOperations
(
args
)
elif
args
.
operations
==
"dwconv2d_dgrad"
:
operations
=
GenerateDwconv2dDgradOperations
(
args
)
else
:
assert
args
.
operations
==
"dwconv2d_wgrad"
,
"invalid operation"
elif
args
.
operations
==
"dwconv2d_wgrad"
:
operations
=
GenerateDwconv2dWgradOperations
(
args
)
else
:
assert
args
.
operations
==
"rrconv2d_wgrad"
,
"invalid operation"
operations
=
GenerateRegionRestrictedconv2dWgradOperations
(
args
)
if
(
args
.
operations
==
"conv2d"
...
...
@@ -1974,6 +2199,42 @@ if __name__ == "__main__":
required_cuda_ver_minor
,
epilogue
,
)
elif
args
.
operations
==
"rrconv2d_wgrad"
:
for
operation
in
operations
:
with
EmitRegionRestrictedConvSingleKernelWrapper
(
args
.
output
,
operation
,
short_path
)
as
emitter
:
emitter
.
emit
()
head
=
EmitRegionRestrictedConvSingleKernelWrapper
(
args
.
output
,
operations
[
0
],
short_path
).
header_template
required_cuda_ver_major
=
operations
[
0
].
required_cuda_ver_major
required_cuda_ver_minor
=
operations
[
0
].
required_cuda_ver_minor
epilogue
=
EmitRegionRestrictedConvSingleKernelWrapper
(
args
.
output
,
operations
[
0
],
short_path
).
epilogue_template
if
"tensorop"
in
args
.
type
:
ConcatFile
(
4
,
args
.
output
,
args
.
operations
,
args
.
type
,
head
,
required_cuda_ver_major
,
required_cuda_ver_minor
,
epilogue
,
)
else
:
ConcatFile
(
2
,
args
.
output
,
args
.
operations
,
args
.
type
,
head
,
required_cuda_ver_major
,
required_cuda_ver_minor
,
epilogue
,
)
elif
args
.
operations
==
"gemm"
:
for
operation
in
operations
:
with
EmitGemmSingleKernelWrapper
(
...
...
dnn/scripts/cutlass_generator/library.py
浏览文件 @
4e9b1c4e
...
...
@@ -532,6 +532,7 @@ class ConvType(enum.Enum):
Local
=
enum_auto
()
LocalShare
=
enum_auto
()
DepthwiseConvolution
=
enum_auto
()
RegionRestrictedConvolution
=
enum_auto
()
ConvTypeTag
=
{
...
...
@@ -540,6 +541,8 @@ ConvTypeTag = {
ConvType
.
Local
:
"cutlass::conv::ConvType::kLocal"
,
ConvType
.
LocalShare
:
"cutlass::conv::ConvType::kLocalShare"
,
ConvType
.
DepthwiseConvolution
:
"cutlass::conv::ConvType::kDepthwiseConvolution"
,
# RegionRestrictedConvolution uses the same conv type as Depthwise
ConvType
.
RegionRestrictedConvolution
:
"cutlass::conv::ConvType::kDepthwiseConvolution"
,
}
#
...
...
@@ -640,6 +643,8 @@ class MathInstruction:
element_accumulator
,
opcode_class
,
math_operation
=
MathOperation
.
multiply_add
,
element_rin
=
DataType
.
s32
,
element_rout
=
DataType
.
s32
,
):
self
.
instruction_shape
=
instruction_shape
self
.
element_a
=
element_a
...
...
@@ -647,6 +652,8 @@ class MathInstruction:
self
.
element_accumulator
=
element_accumulator
self
.
opcode_class
=
opcode_class
self
.
math_operation
=
math_operation
self
.
element_rin
=
element_rin
self
.
element_rout
=
element_rout
#
...
...
dnn/scripts/cutlass_generator/list.bzl
浏览文件 @
4e9b1c4e
此差异由.gitattributes 抑制。
dnn/src/CMakeLists.txt
浏览文件 @
4e9b1c4e
...
...
@@ -188,6 +188,7 @@ if(MGE_WITH_CUDA)
gen_cutlass_kimpl
(
dwconv2d_dgrad tensorop884 CUTLASS_SOURCES
)
gen_cutlass_kimpl
(
dwconv2d_wgrad simt CUTLASS_SOURCES
)
gen_cutlass_kimpl
(
dwconv2d_wgrad tensorop884 CUTLASS_SOURCES
)
gen_cutlass_kimpl
(
rrconv2d_wgrad simt CUTLASS_SOURCES
)
list
(
PREPEND CUSOURCES
${
CUTLASS_SOURCES
}
)
# Compile the following file first, the priority_compile_opr.txt is generated by
...
...
dnn/src/cuda/cutlass/convolution_operation.h
浏览文件 @
4e9b1c4e
...
...
@@ -452,6 +452,86 @@ public:
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Library wrapper around a region-restricted backward-filter kernel.
///
/// Extends ConvolutionBackwardFilterOperationBase by recording the two mask
/// tensors (rin / rout) in the operation description and by forwarding their
/// pointers to the kernel arguments in run().
template <typename Operator_>
class RegionRestrictedConvolutionBackwardFilterOperation
        : public ConvolutionBackwardFilterOperationBase<Operator_> {
public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementDiff = typename Operator::ElementDiff;
    using LayoutDiff = typename Operator::LayoutDiff;
    using ElementGrad = typename Operator::ElementGrad;
    using LayoutGrad = typename Operator::LayoutGrad;
    using ElementAccumulator = typename Operator::ElementAccumulator;
    using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
    using OperatorArguments = typename Operator::Arguments;
    using ElementRin = typename Operator::ElementMaskInput;
    using LayoutRin = typename Operator::LayoutMaskInput;
    using ElementRout = typename Operator::ElementMaskOutput;
    using LayoutRout = typename Operator::LayoutMaskOutput;

    /// @param name operation name passed through to the base class.
    RegionRestrictedConvolutionBackwardFilterOperation(
            char const* name = "unknown_gemm")
            : ConvolutionBackwardFilterOperationBase<Operator_>(name) {
        auto& description = this->m_description;
        /// rin in description -> rin in C++ template
        description.rin = make_TensorDescription<ElementRin, LayoutRin>(
                Operator::kAlignmentMaskInput);
        /// rout in description -> rout in C++ template
        description.rout = make_TensorDescription<ElementRout, LayoutRout>(
                Operator::kAlignmentMaskOutput);
        description.without_shared_load = false;
    }

    /// Translates a ConvolutionArguments blob into the kernel's argument
    /// struct, initializes the kernel and launches it on @p stream.
    ///
    /// @param arguments_ptr    reinterpreted as ConvolutionArguments const*.
    /// @param device_workspace forwarded to Operator::initialize.
    /// @param stream           stream handed to Operator::run.
    /// @return the initialize() status when it is not kSuccess, otherwise the
    ///         status returned by Operator::run.
    virtual Status run(
            void const* arguments_ptr, void* device_workspace = nullptr,
            cudaStream_t stream = nullptr) const {
        auto const* conv_args =
                reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
        cutlass::conv::Operator const conv_op = this->m_description.conv_op;
        auto const& problem = conv_args->problem_size;

        // The argument struct stores non-const element pointers, hence the
        // const_cast on the read-only inputs.
        auto* src_ptr = static_cast<ElementSrc*>(const_cast<void*>(conv_args->src));
        auto* diff_ptr =
                static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter));
        auto* grad_ptr = static_cast<ElementGrad*>(conv_args->dst);
        auto* rin_ptr = static_cast<ElementRin*>(const_cast<void*>(conv_args->rin));
        auto* rout_ptr =
                static_cast<ElementRout*>(const_cast<void*>(conv_args->rout));

        OperatorArguments args;
        args.problem_size = problem;

        /// src in convolution arguments -> ref_src
        args.ref_src = {
                src_ptr,
                LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, problem))};
        /// filter in convolution arguments -> ref_diff
        args.ref_diff = {
                diff_ptr,
                LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, problem))};
        /// dst in convolution arguments -> ref_grad
        args.ref_grad = {
                grad_ptr,
                LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, problem))};
        /// rin in convolution arguments -> ref_mask_input
        args.ref_mask_input = {
                rin_ptr,
                LayoutRin::packed(implicit_gemm_tensor_rin_extent(conv_op, problem))};
        /// rout in convolution arguments -> ref_mask_output
        args.ref_mask_output = {
                rout_ptr,
                LayoutRout::packed(
                        implicit_gemm_tensor_rout_extent(conv_op, problem))};

        args.output_op =
                init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
                        conv_args);

        Operator op;
        Status const init_status = op.initialize(args, device_workspace);
        if (init_status == Status::kSuccess) {
            return op.run(stream);
        }
        return init_status;
    }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
}
// namespace library
}
// namespace cutlass
...
...
dnn/src/cuda/cutlass/initialize_all.cu
浏览文件 @
4e9b1c4e
...
...
@@ -50,6 +50,7 @@ void initialize_all_deconv_simt_operations(Manifest& manifest);
void
initialize_all_dwconv2d_fprop_simt_operations
(
Manifest
&
manifest
);
void
initialize_all_dwconv2d_dgrad_simt_operations
(
Manifest
&
manifest
);
void
initialize_all_dwconv2d_wgrad_simt_operations
(
Manifest
&
manifest
);
void
initialize_all_rrconv2d_wgrad_simt_operations
(
Manifest
&
manifest
);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
void
initialize_all_gemm_tensorop884_operations
(
Manifest
&
manifest
);
void
initialize_all_dwconv2d_fprop_tensorop884_operations
(
Manifest
&
manifest
);
...
...
@@ -70,6 +71,7 @@ void initialize_all(Manifest& manifest) {
initialize_all_dwconv2d_fprop_simt_operations
(
manifest
);
initialize_all_dwconv2d_dgrad_simt_operations
(
manifest
);
initialize_all_dwconv2d_wgrad_simt_operations
(
manifest
);
initialize_all_rrconv2d_wgrad_simt_operations
(
manifest
);
#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
initialize_all_gemm_tensorop884_operations
(
manifest
);
initialize_all_dwconv2d_fprop_tensorop884_operations
(
manifest
);
...
...
dnn/src/cuda/cutlass/library.h
浏览文件 @
4e9b1c4e
...
...
@@ -471,6 +471,10 @@ struct ConvolutionDescription : public OperationDescription {
conv
::
SpecialOptimizeDesc
special_optimization
;
conv
::
ImplicitGemmMode
gemm_mode
;
bool
without_shared_load
;
// only used by rrconv
TensorDescription
rin
;
TensorDescription
rout
;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -499,6 +503,10 @@ struct ConvolutionArguments {
/// Host pointer to extra param struct
void
const
*
extra_param
;
// only used by rrconv, default: nullptr
void
const
*
rin
=
nullptr
;
void
const
*
rout
=
nullptr
;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
...
...
dnn/src/cuda/cutlass/operation_table.cpp
浏览文件 @
4e9b1c4e
...
...
@@ -118,6 +118,11 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
key
.
alignment_filter
=
desc
.
filter
.
alignment
;
key
.
without_shared_load
=
desc
.
without_shared_load
;
key
.
element_rin
=
desc
.
rin
.
element
;
key
.
layout_rin
=
desc
.
rin
.
layout
;
key
.
element_rout
=
desc
.
rout
.
element
;
key
.
layout_rout
=
desc
.
rout
.
layout
;
return
key
;
}
...
...
dnn/src/cuda/cutlass/operation_table.h
浏览文件 @
4e9b1c4e
...
...
@@ -201,6 +201,12 @@ struct ConvolutionKey {
bool
without_shared_load
;
// only used by rrconv
library
::
NumericTypeID
element_rin
=
library
::
NumericTypeID
::
kInvalid
;
library
::
LayoutTypeID
layout_rin
=
library
::
LayoutTypeID
::
kInvalid
;
library
::
NumericTypeID
element_rout
=
library
::
NumericTypeID
::
kInvalid
;
library
::
LayoutTypeID
layout_rout
=
library
::
LayoutTypeID
::
kInvalid
;
inline
bool
operator
==
(
ConvolutionKey
const
&
rhs
)
const
{
return
(
conv_op
==
rhs
.
conv_op
)
&&
(
element_src
==
rhs
.
element_src
)
&&
(
layout_src
==
rhs
.
layout_src
)
&&
...
...
@@ -223,7 +229,9 @@ struct ConvolutionKey {
(
special_optimization
==
rhs
.
special_optimization
)
&&
(
alignment_src
==
rhs
.
alignment_src
)
&&
(
alignment_filter
==
rhs
.
alignment_filter
)
&&
(
without_shared_load
==
rhs
.
without_shared_load
);
(
without_shared_load
==
rhs
.
without_shared_load
)
&&
(
element_rin
==
rhs
.
element_rin
)
&&
(
layout_rin
==
rhs
.
layout_rin
)
&&
(
element_rout
==
rhs
.
element_rout
)
&&
(
layout_rout
==
rhs
.
layout_rout
);
}
inline
bool
operator
!=
(
ConvolutionKey
const
&
rhs
)
const
{
return
!
(
*
this
==
rhs
);
}
...
...
@@ -260,7 +268,11 @@ struct ConvolutionKey {
"
\n
special_optimization: "
+
to_string
(
special_optimization
)
+
"
\n
alignment_src: "
+
std
::
to_string
(
alignment_src
)
+
"
\n
alignment_filter: "
+
std
::
to_string
(
alignment_filter
)
+
"
\n
without_shared_load: "
+
to_string
(
without_shared_load
)
+
"
\n
}"
;
"
\n
without_shared_load: "
+
to_string
(
without_shared_load
)
+
"
\n
element_rin: "
+
to_string
(
element_rin
)
+
"
\n
layout_rin: "
+
to_string
(
layout_rin
)
+
"
\n
element_rout: "
+
to_string
(
element_rout
)
+
"
\n
layout_rout: "
+
to_string
(
layout_rout
)
+
"
\n
}"
;
}
};
...
...
@@ -293,6 +305,10 @@ struct ConvolutionKeyHasher {
.
update
(
&
key
.
alignment_src
,
sizeof
(
key
.
alignment_src
))
.
update
(
&
key
.
alignment_filter
,
sizeof
(
key
.
alignment_filter
))
.
update
(
&
key
.
without_shared_load
,
sizeof
(
key
.
without_shared_load
))
.
update
(
&
key
.
element_rin
,
sizeof
(
key
.
element_rin
))
.
update
(
&
key
.
layout_rin
,
sizeof
(
key
.
layout_rin
))
.
update
(
&
key
.
element_rout
,
sizeof
(
key
.
element_rout
))
.
update
(
&
key
.
layout_rout
,
sizeof
(
key
.
layout_rout
))
.
digest
();
}
};
...
...
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
浏览文件 @
4e9b1c4e
#include "src/cuda/region_restricted_convolution/opr_impl.h"
#include "src/cuda/cutlass/singleton.h"
#include "src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter.cuh"
#include "src/cuda/region_restricted_convolution/chanwise/kern.cuh"
#include "src/cuda/utils.h"
...
...
@@ -6,6 +7,7 @@
// File-scope using-directives: make the names of megdnn, its cuda backend,
// the region_restricted_convolution kernels and cutlass::library (e.g.
// ConvolutionKey, NumericTypeID, LayoutTypeID used below) visible unqualified.
using
namespace
megdnn
;
using
namespace
cuda
;
using
namespace
region_restricted_convolution
;
using
namespace
cutlass
::
library
;
/* ============== RegionRestrictedConvolutionForwardImpl ============== */
void
RegionRestrictedConvolutionForwardImpl
::
exec
(
...
...
@@ -113,7 +115,137 @@ size_t RegionRestrictedConvolutionBackwardFilterImpl::get_workspace_in_bytes(
/**
 * Backward-filter (wgrad) pass of the region restricted convolution.
 *
 * Validates the layouts through check_exec(), restricts the supported
 * configuration to float, stride-1, non-dilated, non-flipped 2D depthwise
 * convolution (group > 1, icpg == ocpg == 1), then looks up a CUTLASS wgrad
 * operation in the singleton operation table and runs it on the CUDA stream
 * of this operator's handle.
 *
 * \param src       input feature map; dims 2 and 3 are read as height/width
 * \param diff      output gradient; dim 0 is read as batch, dims 2/3 as
 *                  height/width
 * \param rin       region mask paired with src; dtype Int32 or Uint8
 * \param rout      region mask paired with diff; must share rin's dtype
 * \param grad      destination of the filter gradient
 * \param workspace only its size is consulted (forwarded to check_exec)
 *
 * Raises through megdnn_throw when rin/rout are not both Int32 or both Uint8,
 * and through megdnn_assert when the configuration is unsupported or no
 * matching kernel is registered.
 */
void RegionRestrictedConvolutionBackwardFilterImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_in rin,
        _megdnn_tensor_in rout, _megdnn_tensor_out grad, _megdnn_workspace workspace) {
    auto fm = check_exec(
            src.layout, diff.layout, rin.layout, rout.layout, grad.layout,
            workspace.size);

    // Only channel-wise fp32 convolution with unit stride/dilation is handled.
    megdnn_assert(
            fm.group > 1 && src.layout.dtype.category() == DTypeCategory::FLOAT &&
            param().compute_mode == Param::ComputeMode::DEFAULT &&
            fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
            fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
            param().stride_h == 1 && param().stride_w == 1);

    int hi = src.layout.operator[](2), wi = src.layout.operator[](3);
    int n = diff.layout.operator[](0), ho = diff.layout.operator[](2),
        wo = diff.layout.operator[](3);
    // Depthwise: one input and one output channel per group.
    int co = fm.group, ci = co, groups = co;
    int fh = fm.spatial[0], fw = fm.spatial[1];
    int sh = fm.stride[0], sw = fm.stride[1];
    int ph = fm.padding[0], pw = fm.padding[1];
    // NOTE(review): dilation is passed to CUTLASS as 0 although fm.dilation is
    // asserted to be 1 above; presumably the kernel ignores it -- confirm.
    int dh = 0, dw = 0;

    auto stream = cuda_stream(handle());

    float alpha = 1.f;
    float beta = 0.f;

    // Tile shape selection, keyed on dim 3 of the grad layout.
    // NOTE(review): for a {g, 1, 1, fh, fw} grad layout dim 3 is fh, not fw --
    // confirm this is the intended dimension.
    int threadblock_shape_n = 128;
    int warp_shape_m = 32;
    int warp_shape_n = 64;
    if (grad.layout.operator[](3) % 8 < 4) {
        threadblock_shape_n = 64;
        warp_shape_m = 64;
        warp_shape_n = 32;
    }

    // Element type under which the region masks are looked up. Uint8 masks
    // are looked up as kS8, exactly as in the original key.
    NumericTypeID mask_type = NumericTypeID::kS32;
    if (rin.layout.dtype == dtype::Int32() && rout.layout.dtype == dtype::Int32()) {
        mask_type = NumericTypeID::kS32;
    } else if (
            rin.layout.dtype == dtype::Uint8() && rout.layout.dtype == dtype::Uint8()) {
        mask_type = NumericTypeID::kS8;
    } else {
        megdnn_throw(ssprintf(
                             "don't support region restricted type rin: %s, rout: %s",
                             rin.layout.dtype.name(), rout.layout.dtype.name())
                             .c_str());
    }

    // The two supported mask dtypes differ only in the trailing
    // (element, layout) pairs for rin and rout, so the key is built once.
    ConvolutionKey key = {
            cutlass::conv::Operator::kWgrad,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            cutlass::conv::ConvType::kDepthwiseConvolution,
            128,
            threadblock_shape_n,
            8,
            warp_shape_m,
            warp_shape_n,
            8,
            1,
            1,
            1,
            cutlass::epilogue::EpilogueType::kLinearCombination,
            1,
            cutlass::conv::SpecialOptimizeDesc::NONE,
            1,
            1,
            false,
            mask_type,
            LayoutTypeID::kTensorNCHW,
            mask_type,
            LayoutTypeID::kTensorNCHW,
    };

    const Operation* op =
            (const Operation*)Singleton::get().operation_table.find_op(key);
    // Fail with a diagnostic instead of dereferencing a null operation when no
    // kernel matches the key.
    megdnn_assert(
            op != nullptr,
            "no cutlass wgrad kernel found for region restricted convolution");

    cutlass::conv::Conv2dProblemSize problem_size{
            n,  hi, wi, ci, co, fh, fw, ho,
            wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
            1,       // split k slices, always 1
            groups,  // groups
    };

    cutlass::library::ConvolutionArguments conv_args{
            problem_size, src.raw_ptr(), diff.raw_ptr(), nullptr,
            nullptr,      grad.raw_ptr(), &alpha,        &beta,
            nullptr,      nullptr,        nullptr,       nullptr,
            nullptr,      nullptr,        rin.raw_ptr(), rout.raw_ptr()};

    cutlass_check(op->run(&conv_args, nullptr, stream));

    after_kernel_launch();
}
// vim: syntax=cpp.doxygen
dnn/test/cuda/region_restricted_convolution.cpp
浏览文件 @
4e9b1c4e
...
...
@@ -465,6 +465,206 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
);
}
// Benchmarks RegionRestrictedConvolutionBackwardFilter with Int32 region masks
// against ConvolutionBackwardFilter (pinned to the implicit batched GEMM
// algorithm) on a 64x384x32x32 depthwise problem for odd square filters from
// 3x3 to 31x31, printing time, throughput and bandwidth for both.
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
    require_compute_capability(7, 5);

    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));

    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardFilter::Param param;
    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;

    // Region ids for both masks are drawn from [1, 3].
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;

        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.proxy()->target_execution_policy = {};
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Int32())
                .set_dtype(3, dtype::Int32());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo},
                rin{batch, hi, wi}, rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};

        // Element counts scaled to Gi-elements * 1e3; multiplied by 4 and
        // divided by milliseconds when printed below.
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + diff.total_nr_elems() +
                                  grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        float rr_bandwidth = static_cast<float>(
                                     src.total_nr_elems() + diff.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     grad.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;

        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;

        // This test measures the backward-filter pass, hence the WGRAD label.
        printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "src=%s, "
               "diff=%s, grad=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
               src.to_string().c_str(), diff.to_string().c_str(),
               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };

    // Odd square filters 3x3, 5x5, ..., 31x31, stride 1, 1000 runs each.
    for (size_t filter = 3; filter <= 31; filter += 2) {
        run_bench(64, 384, 32, 32, filter, filter, 1, 1, 1000);
    }
}
// Benchmarks RegionRestrictedConvolutionBackwardFilter with Uint8 region masks
// against ConvolutionBackwardFilter (pinned to the implicit batched GEMM
// algorithm) on a 64x384x32x32 depthwise problem for odd square filters from
// 3x3 to 31x31, printing time, throughput and bandwidth for both.
TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
    require_compute_capability(7, 5);

    Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage"));

    Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda());
    rr_bencher.set_display(false);

    ConvolutionBackwardFilter::Param param;
    param.format = ConvolutionBackwardFilter::Param::Format::NCHW;
    param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP;

    RegionRestrictedConvolutionBackwardFilter::Param rr_param;
    rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW;
    rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;

    // Region ids for both masks are drawn from [1, 3].
    UniformIntRNG r_rng{1, 3};

    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;

        rr_param.pad_h = fh / 2;
        rr_param.pad_w = fw / 2;
        rr_param.stride_h = sh;
        rr_param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.proxy()->target_execution_policy = {};
        bencher.set_times(nr_times);

        rr_bencher.set_param(rr_param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Uint8())
                .set_dtype(3, dtype::Uint8());
        rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng);
        rr_bencher.set_times(nr_times);

        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo},
                rin{batch, hi, wi}, rout{batch, ho, wo}, grad{g, 1, 1, fh, fw};

        // Element counts scaled to Gi-elements * 1e3; multiplied by 4 and
        // divided by milliseconds when printed below.
        float bandwidth = static_cast<float>(
                                  src.total_nr_elems() + diff.total_nr_elems() +
                                  grad.total_nr_elems()) /
                          (1024 * 1024 * 1024) * 1e3;

        float rr_bandwidth = static_cast<float>(
                                     src.total_nr_elems() + diff.total_nr_elems() +
                                     rin.total_nr_elems() + rout.total_nr_elems() +
                                     grad.total_nr_elems()) /
                             (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times;
        auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12;

        auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times;
        auto rr_ops =
                2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12;

        // This test measures the backward-filter pass, hence the WGRAD label.
        printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
               "src=%s, "
               "diff=%s, grad=%s\n"
               "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n"
               "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n",
               src.to_string().c_str(), diff.to_string().c_str(),
               grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops,
               bandwidth * 4 / time_in_ms, rr_bandwidth * 4 / rr_time_in_ms,
               time_in_ms / rr_time_in_ms);
    };

    // Odd square filters 3x3, 5x5, ..., 31x31, stride 1, 1000 runs each.
    for (size_t filter = 3; filter <= 31; filter += 2) {
        run_bench(64, 384, 32, 32, filter, filter, 1, 1, 1000);
    }
}
#endif
TEST_F
(
CUDA
,
REGION_RESTRICTED_CONV_BWD_DATA_FP32
)
{
...
...
@@ -585,6 +785,125 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
}
}
// Checks RegionRestrictedConvolutionBackwardFilter on CUDA for fp32 data with
// Int32 and Uint8 region masks whose ids are drawn from [1, 2], over square
// inputs/filters in depthwise (GROUP, one channel per group) configuration.
TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());

    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);

            // Input magnitude shrinks as the filter grows.
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 2};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);

            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;

            // Standard convolution output size. The previous form
            // (ih + 2p - fh + 1) / stride only agrees with it for stride == 1.
            size_t oh = (ih + 2 * padding - fh) / stride + 1;
            checker.set_param(cur_param).execs({
                    {n, g * 1, ih, ih},  // src
                    {n, g * 1, oh, oh},  // diff
                    {n, ih, ih},         // rin
                    {n, oh, oh},         // rout
                    {g, 1, 1, fh, fh}    // grad
            });
        };

        // Small shapes are exercised for Int32 masks only.
        if (dt == dtype::Int32()) {
            run(4, 8, 32, 5, 5 / 2, 1);
            for (size_t size = 2; size <= 7; ++size) {
                // Filter as large as the unpadded input.
                run(1, 2, size, size, 0, 1);
            }
        }

        // Odd filters 7..31 on a 32x32 input with "same" padding.
        for (size_t filter = 7; filter <= 31; filter += 2) {
            run(4, 8, 32, filter, filter / 2, 1);
        }
    }
}
// Same coverage as the generic backward-filter check, but both region masks
// are filled with the single id 1 (UniformIntRNG{1, 1}), so rin and rout hold
// identical region ids everywhere.
TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
    Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda());

    for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) {
        auto run = [&checker, &dt](
                           size_t n, size_t g, size_t ih, size_t fh, size_t padding,
                           size_t stride) {
            RegionRestrictedConvolutionBackwardFilter::Param cur_param;
            cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode::
                    CROSS_CORRELATION;
            cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param::
                    ComputeMode::DEFAULT;
            cur_param.sparse =
                    RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_dtype(2, dt)
                    .set_dtype(3, dt);

            // Input magnitude shrinks as the filter grows.
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            UniformIntRNG r_rng{1, 1};
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng(
                    3, &r_rng);

            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;

            // Standard convolution output size. The previous form
            // (ih + 2p - fh + 1) / stride only agrees with it for stride == 1.
            size_t oh = (ih + 2 * padding - fh) / stride + 1;
            checker.set_param(cur_param).execs({
                    {n, g * 1, ih, ih},  // src
                    {n, g * 1, oh, oh},  // diff
                    {n, ih, ih},         // rin
                    {n, oh, oh},         // rout
                    {g, 1, 1, fh, fh}    // grad
            });
        };

        // Small shapes are exercised for Int32 masks only.
        if (dt == dtype::Int32()) {
            run(4, 8, 32, 5, 5 / 2, 1);
            for (size_t size = 2; size <= 7; ++size) {
                // Filter as large as the unpadded input.
                run(1, 2, size, size, 0, 1);
            }
        }

        // Odd filters 7..31 on a 32x32 input with "same" padding.
        for (size_t filter = 7; filter <= 31; filter += 2) {
            run(4, 8, 32, filter, filter / 2, 1);
        }
    }
}
}
// namespace test
}
// namespace megdnn
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录