MegEngine / MegEngine — commit 11f022ff
Authored Aug 11, 2021 by Megvii Engine Team
feat(dnn/cuda): add nhwc int8 imma conv and conv fuse typecvt
GitOrigin-RevId: 229e1eb4be0fe29932117f226523b446c9ca665e
Parent: 03e80759

Showing 33 changed files with 1042 additions and 271 deletions (+1042, −271).
Changed files:

- dnn/scripts/cutlass_generator/conv2d_operation.py (+19, −17)
- dnn/scripts/cutlass_generator/generator.py (+102, −29)
- dnn/scripts/cutlass_generator/library.py (+18, −0)
- dnn/scripts/cutlass_generator/list.bzl (+0, −0)
- dnn/src/common/convolution.cpp (+4, −1)
- dnn/src/cuda/conv_bias/algo.cpp (+30, −1)
- dnn/src/cuda/conv_bias/algo.h (+45, −1)
- dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp (+16, −5)
- dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu (+45, −26)
- dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh (+3, −3)
- dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp (+16, −2)
- dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp (+10, −2)
- dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp (+34, −27)
- dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp (+10, −2)
- dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp (+12, −5)
- dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp (+294, −0, new file)
- dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp (+29, −10)
- dnn/src/cuda/conv_bias/opr_impl.h (+1, −0)
- dnn/src/cuda/convolution/backward_data/algo.h (+2, −0)
- dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp (+41, −30)
- dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp (+42, −27)
- dnn/src/cuda/cutlass/convolution_operation.h (+2, −2)
- dnn/src/cuda/cutlass/library.h (+1, −1)
- dnn/src/cuda/cutlass/operation_table.cpp (+15, −13)
- dnn/src/cuda/cutlass/operation_table.h (+6, −6)
- dnn/src/cuda/cutlass/util.cu (+29, −0)
- dnn/src/cuda/cutlass/util.h (+4, −0)
- dnn/src/cuda/matrix_mul/algos.h (+4, −0)
- dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp (+36, −28)
- dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp (+36, −29)
- dnn/test/cuda/conv_bias_int8.cpp (+130, −1)
- dnn/test/cuda/conv_test_utils.cpp (+6, −2)
- dnn/test/cuda/convolution.cpp (+0, −1)
dnn/scripts/cutlass_generator/conv2d_operation.py

@@ -19,8 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
                epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-               need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
-               required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
+               special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \
+               without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind

@@ -34,7 +34,7 @@ class Conv2dOperation:
     self.element_epilogue = element_epilogue
     self.epilogue_functor = epilogue_functor
     self.swizzling_functor = swizzling_functor
-    self.need_load_from_const = need_load_from_const
+    self.special_optimization = special_optimization
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
     self.required_cuda_ver_major = required_cuda_ver_major

@@ -60,16 +60,18 @@ class Conv2dOperation:
     else:
       inst_shape = ''
-    unity_kernel = ''
-    if not self.need_load_from_const:
-      unity_kernel = '_1x1'
+    special_opt = ''
+    if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
+      special_opt = '_1x1'
+    elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
+      special_opt = '_s2'
     reorder_k = ''
     if self.without_shared_load:
       reorder_k = '_roc'
     return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
-      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
+      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \
       reorder_k, ShortEpilogueNames[self.epilogue_functor])
   #

@@ -183,7 +185,7 @@ using Convolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
    ${math_operator},
     ${implicit_gemm_mode},
     ${without_shared_load}>;

@@ -226,7 +228,7 @@ using Convolution =
     'stages': str(operation.tile_description.stages),
     'alignment_src': str(operation.src.alignment),
     'alignment_filter': str(operation.flt.alignment),
-    'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+    'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
     'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
     'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
     'without_shared_load': str(operation.without_shared_load).lower()

@@ -266,7 +268,7 @@ using Deconvolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
     ${math_operator},
     ${implicit_gemm_mode}>;
 """

@@ -308,7 +310,7 @@ using Deconvolution =
     'stages': str(operation.tile_description.stages),
     'alignment_src': str(operation.src.alignment),
     'alignment_filter': str(operation.flt.alignment),
-    'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+    'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
     'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
     'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
   }

@@ -323,9 +325,9 @@
 ###################################################################################################
 #
-def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
-                   required_cuda_ver_minor = 2):
+def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \
+                   use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+                   required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
   operations = []
   element_epilogue = DataType.f32

@@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
     bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
     dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
-    new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+    new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
     operations.append(new_operation)
-    if not skip_unity_kernel:
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+    if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
       operations.append(new_operation)
   return operations
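The rename above generalizes the old boolean 1x1 flag into a three-way descriptor that also covers the deconv double-upsampling case. A minimal standalone sketch of the new name-suffix logic (enum members and suffixes are taken from the diff; everything else is simplified for illustration):

    import enum

    class SpecialOptimizeDesc(enum.Enum):
        NoneSpecialOpt = enum.auto()
        ConvFilterUnity = enum.auto()
        DeconvDoubleUpsampling = enum.auto()

    def special_opt_suffix(desc):
        # '_1x1' marks kernels specialized for unity (1x1) filters;
        # '_s2' marks deconv kernels specialized for double upsampling.
        if desc == SpecialOptimizeDesc.ConvFilterUnity:
            return '_1x1'
        if desc == SpecialOptimizeDesc.DeconvDoubleUpsampling:
            return '_s2'
        return ''

    assert special_opt_suffix(SpecialOptimizeDesc.ConvFilterUnity) == '_1x1'
    assert special_opt_suffix(SpecialOptimizeDesc.NoneSpecialOpt) == ''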
dnn/scripts/cutlass_generator/generator.py

@@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args):
     for dst_type, dst_layout in zip(dst_types, dst_layouts):
       if dst_type == DataType.s4 or dst_type == DataType.u4:
         min_cc = 75
-        skip_unity_kernel = True
+        use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt
       else:
         min_cc = 61
-        skip_unity_kernel = False
+        use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
       tile_descriptions = [
         TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),

@@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args):
         TileDescription([ 64,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
       ]
-      operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                   dst_layout, dst_type, min_cc, 32, 32, 32,
-                                   skip_unity_kernel)
+      for tile in tile_descriptions:
+        if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32:
+          continue
+        if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \
+          and tile.threadblock_shape[0] > 16:
+          continue
+        operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                     dst_layout, dst_type, min_cc, 32, 32, 32,
+                                     use_special_optimization)
   return operations

@@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args):
       DataType.s8,
   ]

+  use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
+
   min_cc = 75
   max_cc = 1024

@@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args):
         TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
       ]
       operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                    dst_layout, dst_type, min_cc, 128, 128, 64,
-                                   False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                   use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
     else:
       assert dst_layout == LayoutType.TensorNC4HW4
       tile_descriptions = [
         TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
         TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([128,  32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
       ]
       operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                    dst_layout, dst_type, min_cc, 128, 128, 64,
-                                   False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+                                   use_special_optimization, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+
+  layouts_nhwc = [
+      (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
+      (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
+      (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
+  ]
+
+  dst_layouts_nhwc = [
+      LayoutType.TensorNHWC,
+  ]
+
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        dst_type = math_inst.element_b
+        tile_descriptions = [
+          TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([ 64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+        ]
+        for tile in tile_descriptions:
+          dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                       dst_type, min_cc, layout[2], layout[2], dst_align,
+                                       use_special_optimization, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+          if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32:
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         dst_type, min_cc, layout[2], layout[2], dst_align,
+                                         use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+  out_dtypes = [DataType.s4, DataType.u4, DataType.f32]
+
+  # INT8x8x4 and INT8x8x32
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        for out_dtype in out_dtypes:
+          tile_descriptions = [
+            TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([ 64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+          ]
+          for tile in tile_descriptions:
+            dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \
+              else 8 * DataTypeSize[out_dtype]
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         out_dtype, min_cc, layout[2], layout[2], dst_align,
+                                         use_special_optimization, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+            if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32):
+              operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                           out_dtype, min_cc, layout[2], layout[2], dst_align,
+                                           use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   return operations

@@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args):
       LayoutType.TensorNC64HW64,
   ]

+  use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
+
   min_cc = 75
   max_cc = 1024

@@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args):
         TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
       ]
       operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                    dst_layout, dst_type, min_cc, 128, 128, 64,
-                                   False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                   use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   layouts_nhwc = [
       (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),

@@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args):
     for dst_layout in dst_layouts_nhwc:
       dst_type = math_inst.element_b
       tile_descriptions = [
+        TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
         TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
       ]
       for tile in tile_descriptions:
-        operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
-                                     dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
-                                     False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+        dst_align = 16 if tile.threadblock_shape[1] == 16 else 32
+        operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                     dst_type, min_cc, layout[2], layout[2], dst_align,
+                                     use_special_optimization, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
         if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
           dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
           operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
                                        dst_type, min_cc, layout[2], layout[2], dst_align,
-                                       False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                       use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+  # INT4x4x8
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        tile_descriptions = [
+          TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+        ]
+        for tile in tile_descriptions:
+          dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                       DataType.s8, min_cc, layout[2], layout[2], dst_align,
+                                       use_special_optimization, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+          if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
+            dst_align = 64 if tile.threadblock_shape[1] == 32 else 128
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         DataType.s8, min_cc, layout[2], layout[2], dst_align,
+                                         use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   return operations

@@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args):
       DataType.s8,
   ]

+  use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling
+
   min_cc = 61
   max_cc = 1024

@@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args):
   for layout in layouts:
     for dst_type, dst_layout in zip(dst_types, dst_layouts):
       tile_descriptions = [
-        TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
         TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([ 16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
         TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),

@@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args):
       ]
       operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
                                    dst_layout, dst_type, min_cc, 32, 32, 32,
-                                   True)
+                                   use_special_optimization)
   return operations

################################################################################
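In the NHWC branches above, dst_align is chosen in bits and GenerateConv2d later divides it by the element width to obtain an alignment in elements. A small sketch of that arithmetic (the DataTypeSize values are assumed to mirror library.py):

    # Bit widths assumed to mirror DataTypeSize in library.py.
    DataTypeSize = {'s4': 4, 'u4': 4, 's8': 8, 'f32': 32}

    def dst_alignment_elements(out_dtype, threadblock_n):
        # 4 elements per access for narrow (n == 16) tiles or f32 output,
        # 8 elements per access otherwise; dst_align is expressed in bits.
        dst_align = (4 * DataTypeSize[out_dtype]
                     if threadblock_n == 16 or out_dtype == 'f32'
                     else 8 * DataTypeSize[out_dtype])
        return dst_align // DataTypeSize[out_dtype]

    assert dst_alignment_elements('s4', 32) == 8    # 8 x int4 = one 32-bit access
    assert dst_alignment_elements('f32', 32) == 4   # 4 x f32 = one 128-bit access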
dnn/scripts/cutlass_generator/library.py

@@ -562,6 +562,24 @@ StrideSupportNames = {
     StrideSupport.Unity: 'unity_stride',
 }

+class SpecialOptimizeDesc(enum.Enum):
+  NoneSpecialOpt = enum_auto()
+  ConvFilterUnity = enum_auto()
+  DeconvDoubleUpsampling = enum_auto()
+
+SpecialOptimizeDescNames = {
+  SpecialOptimizeDesc.NoneSpecialOpt: 'none',
+  SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity',
+  SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling',
+}
+
+SpecialOptimizeDescTag = {
+  SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
+  SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
+  SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
+}
+
 class ImplicitGemmMode(enum.Enum):
   GemmNT = enum_auto()
   GemmTN = enum_auto()
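The tag table is what the emitters in conv2d_operation.py substitute into the ${special_optimization} slot of the C++ kernel templates. A toy illustration of that substitution step (the template line is abbreviated from the diff above):

    SpecialOptimizeDescTag = {
        'NoneSpecialOpt': 'cutlass::conv::SpecialOptimizeDesc::NONE',
        'ConvFilterUnity': 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
        'DeconvDoubleUpsampling':
            'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
    }

    template = "    ${special_optimization},"
    emitted = template.replace('${special_optimization}',
                               SpecialOptimizeDescTag['ConvFilterUnity'])
    assert emitted == "    cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY,"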
dnn/scripts/cutlass_generator/list.bzl

This diff is suppressed by .gitattributes.
dnn/src/common/convolution.cpp

@@ -553,7 +553,10 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
                 dst.valid() && (dst.enumv() == src.enumv() ||
                                 ((dst.enumv() == DTypeEnum::QuantizedS4 ||
                                   dst.enumv() == DTypeEnum::Quantized4Asymm) &&
-                                 src.enumv() == DTypeEnum::QuantizedS8));
+                                 src.enumv() == DTypeEnum::QuantizedS8) ||
+                                ((src.enumv() == DTypeEnum::QuantizedS4 ||
+                                  src.enumv() == DTypeEnum::Quantized4Asymm) &&
+                                 dst.enumv() == DTypeEnum::QuantizedS8));
         if (cond_dst) {
             supported_dst_dtype.push_back(dst);
         }
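With this change the deduction accepts a destination dtype that differs from the source in both directions (s8 source with 4-bit output, and 4-bit source with s8 output), which is what lets the new kernels fuse the type conversion into the conv epilogue. A boolean model of the updated condition (dtypes are strings here, while the real code compares DTypeEnum values):

    def dst_supported(src, dst):
        int4 = {'QuantizedS4', 'Quantized4Asymm'}
        return (dst == src
                or (dst in int4 and src == 'QuantizedS8')
                or (src in int4 and dst == 'QuantizedS8'))  # newly allowed

    assert dst_supported('QuantizedS4', 'QuantizedS8')
    assert not dst_supported('QuantizedS8', 'Float32')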
dnn/src/cuda/conv_bias/algo.cpp

@@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     for (auto&& algo : int8_nchw32_imma) {
         all_algos.push_back(&algo);
     }
+    for (auto&& algo : int8_nhwc_imma) {
+        all_algos.push_back(&algo);
+    }
     for (auto&& algo : int4_int4_nchw64_imma) {
         all_algos.push_back(&algo);
     }

@@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
         int8_nchw32_imma.emplace_back(
                 AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1});
     }
+    {
+        using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam;
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4});
+    }
     {
         using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam;
         int4_int4_nchw64_imma.emplace_back(

@@ -261,6 +278,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
     }
     {
         using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
         int4_int4_nhwc_imma.emplace_back(
                 AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
         int4_int4_nhwc_imma.emplace_back(

@@ -276,6 +299,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
     }
     {
         using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
         uint4_int4_nhwc_imma.emplace_back(
                 AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
         uint4_int4_nhwc_imma.emplace_back(
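The AlgoParam braces above are positional. Read against the fields the algorithms use elsewhere in this diff (threadblock_n, warp_k, instruction_k, stage, access_size), the 11 values appear to decompose as sketched below; the exact field list is an assumption, not quoted from algo.h:

    from collections import namedtuple

    # Assumed decomposition of the 11-value AlgoParam initializers:
    # threadblock tile m/n/k, warp tile m/n/k, mma instruction shape m/n/k,
    # pipeline stage count, and the NHWC vector access size (in elements).
    AlgoParam = namedtuple('AlgoParam', [
        'threadblock_m', 'threadblock_n', 'threadblock_k',
        'warp_m', 'warp_n', 'warp_k',
        'instruction_m', 'instruction_n', 'instruction_k',
        'stage', 'access_size'])

    p = AlgoParam(128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4)
    assert p.threadblock_m % p.warp_m == 0 and p.threadblock_n % p.warp_n == 0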
dnn/src/cuda/conv_bias/algo.h

@@ -72,6 +72,7 @@ public:
         CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
         CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
+        CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
         CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4,

@@ -524,6 +525,7 @@ public:
  * +
  * +--- AlgoInt8NCHW4DotProdImplicitGemm
  * +--- AlgoInt8NCHW32IMMAImplicitGemm
+ * +--- AlgoInt8NHWCIMMAImplicitGemm
  * +
  * +--- AlgoInt4NCHW64IMMAImplicitGemmBase
  * +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm

@@ -582,7 +584,7 @@ public:
     // operation (cutlass kernel) from the global OperationTable
     const cutlass::library::Operation* get_cutlass_conv_op(
             const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
-            bool load_from_const, bool without_shared_load) const;
+            bool use_conv_filter_unity_opt, bool without_shared_load) const;

     // execute the cutlass kernel found by get_cutlass_conv_op. we give
     // subclasses full freedom to decide where and how these arguments are

@@ -829,6 +831,47 @@ private:
     std::string m_name;
 };

+class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final
+        : public AlgoCutlassConvolutionBase {
+public:
+    AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
+            : AlgoCutlassConvolutionBase(algo_param) {
+        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
+                ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s",
+                         to_string(m_algo_param).c_str()),
+                ConvBias::DirectParam{});
+    }
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
+    }
+    static std::string to_string(AlgoParam algo_param);
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8)
+
+    std::string param() const override {
+        std::string ret;
+        serialize_write_pod(m_algo_param, ret);
+        return ret;
+    }
+
+private:
+    std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const;
+
+    void reorder_filter(const ExecArgs& args, int interleaved,
+                        void* reordered_filter) const;
+
+    std::string m_name;
+};
+
 class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
         : public AlgoCutlassConvolutionBase {
 public:

@@ -1087,6 +1130,7 @@ public:
 #endif
 #if CUDA_VERSION >= 10020
     std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
+    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
     std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
     std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
     std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp

@@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format,
                     LayoutTypeID::kTensorNC64HW64};
         case Format::NHWC:
             switch (access_type) {
+                case 4:
+                    return {LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNC4HW4,
+                            LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNHWC};
                 case 8:
                     return {LayoutTypeID::kTensorNHWC,
                             LayoutTypeID::kTensorNC8HW8,

@@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode,
 const Operation*
 ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
         const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
-        bool load_from_const, bool without_shared_load) const {
+        bool use_conv_filter_unity_opt, bool without_shared_load) const {
+    using Format = param::ConvBias::Format;
     auto&& param = args.opr->param();
     auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
-    auto epilogue_type = get_epilogue_type(param.nonlineMode,
-                                           param.format != Format::NCHW4_NCHW);
+    auto epilogue_type = get_epilogue_type(
+            param.nonlineMode,
+            args.dst_layout->dtype.enumv() != DTypeEnum::Float32);
+
+    cutlass::conv::SpecialOptimizeDesc special_optimization =
+            (use_conv_filter_unity_opt)
+                    ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
+                    : cutlass::conv::SpecialOptimizeDesc::NONE;
     ConvolutionKey key{convert_conv_op(conv_op),
                        convert_dtype(args.src_layout->dtype.enumv()),
                        layouts.src,

@@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
                        m_algo_param.instruction_k,
                        epilogue_type,
                        m_algo_param.stage,
-                       load_from_const,
+                       special_optimization,
                        without_shared_load};
     return Singleton::get().operation_table.find_op(key);
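Note the epilogue selection now keys on the destination dtype instead of the tensor format: with fused typecvt the destination may be Float32, and only then is the quantizing epilogue skipped. A sketch of the decision (the meaning of get_epilogue_type's second argument is inferred from this call site, not quoted from its definition):

    # Inferred model: the second argument of get_epilogue_type selects the
    # clamping/saturating (quantized-output) variant of the epilogue.
    def needs_clamping_epilogue(dst_dtype):
        return dst_dtype != 'Float32'

    assert needs_clamping_epilogue('QuantizedS8')
    assert not needs_clamping_epilogue('Float32')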
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu

@@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter(
             IC, FH, FW, trans_oc);
     after_kernel_launch();
 }

-template <uint32_t size_bits, uint32_t alignbits>
+template <uint32_t size_bits>
 void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter(
         int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC,
-        uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved,
-        cudaStream_t stream) {
-    static constexpr uint32_t elements_per_access = alignbits / size_bits;
-    uint32_t nr_threads =
-            query_blocksize_for_kernel(reinterpret_cast<const void*>(
-                    reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>));
+        uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits,
+        uint32_t interleaved, cudaStream_t stream) {
+    const uint32_t elements_per_access = alignbits / size_bits;
+
+    void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t,
+                 uint32_t, uint32_t, uint32_t, bool);
+    kern = nullptr;
+    auto get_kern = [&kern](const uint32_t alignbits,
+                            const uint32_t interleaved) {
+#define DISPATCH_KERNEL(alignbits_, interleaved_)                         \
+    if (alignbits == alignbits_ && interleaved == interleaved_) {         \
+        kern = reorder_nhwc_imma_filter_kernel<size_bits, alignbits_,     \
+                                               interleaved_>;             \
+        return;                                                           \
+    }
+        DISPATCH_KERNEL(128, 16);
+        DISPATCH_KERNEL(64, 16);
+        DISPATCH_KERNEL(32, 16);
+        DISPATCH_KERNEL(128, 32);
+        DISPATCH_KERNEL(64, 32);
+        DISPATCH_KERNEL(32, 32);
+        DISPATCH_KERNEL(128, 64);
+        DISPATCH_KERNEL(64, 64);
+        DISPATCH_KERNEL(32, 64);
+#undef DISPATCH_KERNEL
+    };
+    get_kern(alignbits, interleaved);
+
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
     uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access);
     nr_threads = std::min(nr_threads, vthreads);
     uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
-    if (oc_interleaved == 32) {
-        reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>
-                <<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter,
-                                                       OC, IC, FH, FW,
-                                                       trans_oc);
-    } else {
-        reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 64>
-                <<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter,
-                                                       OC, IC, FH, FW,
-                                                       trans_oc);
-    }
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter, OC, IC,
+                                               FH, FW, trans_oc);
     after_kernel_launch();
 }

@@ -180,15 +200,14 @@ INST(8, 32)
 INST(4, 64)
 #undef INST

-#define INST(_size_bits, _alignbits)                                       \
-    template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \
-            _size_bits, _alignbits>(                                       \
+#define INST(_size_bits)                                                   \
+    template void                                                          \
+    megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>(   \
             int8_t * dst_filter, const int8_t* src_filter, uint32_t OC,    \
             uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc,          \
-            uint32_t oc_interleaved, cudaStream_t stream);
+            uint32_t alignbits, uint32_t interleaved, cudaStream_t stream);

-INST(4, 32)
-INST(4, 64)
-INST(4, 128)
+INST(4)
+INST(8)
 #undef INST
 // vim: syntax=cuda.doxygen
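With alignbits and interleaved now runtime arguments, the host function picks one of nine template instantiations through the DISPATCH_KERNEL table. A Python model of that selection:

    def select_kernel(size_bits, alignbits, interleaved):
        # Mirrors the DISPATCH_KERNEL table: 3 alignments x 3 interleavings.
        table = {(a, i): 'reorder_nhwc_imma_filter_kernel<%d, %d, %d>'
                         % (size_bits, a, i)
                 for a in (128, 64, 32) for i in (16, 32, 64)}
        return table[(alignbits, interleaved)]

    assert (select_kernel(4, 128, 32)
            == 'reorder_nhwc_imma_filter_kernel<4, 128, 32>')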
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh

@@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                                 uint32_t FW, bool trans_oc,
                                 cudaStream_t stream);

-template <uint32_t size_bits, uint32_t alignbits>
+template <uint32_t size_bits>
 void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                               uint32_t OC, uint32_t IC, uint32_t FH,
-                              uint32_t FW, bool trans_oc,
-                              uint32_t oc_interleaved, cudaStream_t stream);
+                              uint32_t FW, bool trans_oc, uint32_t alignbits,
+                              uint32_t interleaved, cudaStream_t stream);
 }  // namespace cutlass_wrapper
 }  // namespace cuda
 }  // namespace megdnn
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp

@@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants(
           args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
           bias_scale =
                   args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
-          dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+          dst_scale;
+
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+    } else {  // DTypeEnum::QuantizedS8
+        megdnn_assert(args.dst_layout->dtype.enumv() ==
+                      DTypeEnum::QuantizedS8);
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+    }

     float alpha = src_scale * filter_scale / dst_scale,
           beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;

     if (args.z_layout->ndim > 0) {
-        float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        float z_scale;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        } else {  // DTypeEnum::QuantizedS8
+            megdnn_assert(args.z_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+        }
         gamma = z_scale / dst_scale;
     }
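These scales feed the epilogue constants; with fused typecvt only the dst/z scale lookups change, while the arithmetic is unchanged. A compact sketch of how alpha, beta and gamma are formed from the quantization scales:

    def epilogue_constants(src_scale, filter_scale, bias_scale, dst_scale,
                           z_scale=None):
        alpha = src_scale * filter_scale / dst_scale  # scales the conv result
        beta = bias_scale / dst_scale                 # scales the bias
        gamma = z_scale / dst_scale if z_scale is not None else 0.0  # residual z
        return alpha, beta, gamma

    alpha, beta, gamma = epilogue_constants(0.02, 0.01, 0.0002, 0.05)
    assert abs(alpha - 0.004) < 1e-12 and gamma == 0.0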
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp

@@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
     if (fh * fw > kMaxFilterPixels)
         return false;
+
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = true;
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    if (op == nullptr)
+        return false;
+
     return true;
 }

@@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
     float dst_scale = 0.f;
     float threshold = 0.f;
     uint8_t src_zero = 0;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = true;

     if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {

@@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
     const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                          ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+                                         use_conv_filter_unity_opt, without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
                             z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
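A recurring change in this commit: is_available() now also probes the global operation table with the same key that exec() will use, so shapes whose kernel was never generated are rejected up front instead of failing at launch. A reduced model of that check (table contents and key fields are illustrative only):

    # Reduced model: availability == presence of a matching kernel in the table.
    OPERATION_TABLE = {
        ('fprop', 'conv_filter_unity', True): 'int4_fprop_1x1_roc',
        ('fprop', 'none', True): 'int4_fprop_roc',
    }

    def is_available(fh, fw, without_shared_load=True):
        special_opt = 'conv_filter_unity' if (fh == 1 and fw == 1) else 'none'
        return ('fprop', special_opt, without_shared_load) in OPERATION_TABLE

    assert is_available(1, 1) and is_available(3, 3)
    assert not is_available(3, 3, without_shared_load=False)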
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp

@@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
     if (args.src_layout->dtype.enumv() != src_dtype() ||
         args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
-        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
-        args.dst_layout->dtype.enumv() != src_dtype())
+        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32)
+        return false;
+
+    if (!(args.dst_layout->dtype.enumv() == src_dtype() ||
+          args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8))
         return false;

     // uint4 do not support H_SWISH activition

@@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
     if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0))
         return false;
+
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
+                                (m_algo_param.threadblock_n == 32 ||
+                                 m_algo_param.threadblock_n == 64));
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    if (op == nullptr)
+        return false;
+
     return true;
 }

@@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(
     float dst_scale = 0.f;
     float threshold = 0.f;
     uint8_t src_zero = 0;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                 (m_algo_param.threadblock_n == 32 ||
                                  m_algo_param.threadblock_n == 64));

-    if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
-        dst_scale =
-                args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-        src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
-                           .zero_point;
-    } else {  // DTypeEnum::QuantizedS4
-        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
-    }
+    if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
+                           .zero_point;
+    }
+
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        dst_scale =
+                args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+    } else {  // DTypeEnum::QuantizedS8
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+    }

     cudaStream_t stream = cuda_stream(args.opr->handle());

     const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                          ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+                                         use_conv_filter_unity_opt, without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
                             z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,

@@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter(
     cudaStream_t stream = cuda_stream(args.opr->handle());

     // reformat filter from nhwc to ncxhwx and reorder oc
-    // use trans_oc threadblock_n must be 32 or 64
+    // use trans_oc threadblock_n must be 32 or 64 and src dtype == dest dtype
     bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
                      (m_algo_param.threadblock_n == 32 ||
                       m_algo_param.threadblock_n == 64));
-    uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32;
+    uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32;

-    if (iterleaved == 8) {
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    } else if (iterleaved == 16) {
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    } else {
-        megdnn_assert(iterleaved == 32);
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    }
+    uint32_t alignbits = iterleaved * 4;
+
+    cutlass_wrapper::reorder_nhwc_imma_filter<4>(
+            reinterpret_cast<int8_t*>(reordered_filter),
+            reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
+            fw, trans_oc, alignbits, oc_iterleaved, stream);
 }
 #endif
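The three hard-coded template instantiations collapse into one call because, for 4-bit filters, the access width in bits is simply the interleaving times the element width. The relationship (the variable name keeps the source's spelling):

    def alignbits_for(iterleaved, size_bits=4):
        # iterleaved elements of size_bits bits per vector access.
        return iterleaved * size_bits

    assert [alignbits_for(i) for i in (8, 16, 32)] == [32, 64, 128]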
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp
浏览文件 @
11f022ff
...
@@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
...
@@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
// FIXME: too large filter size is not supported now
// FIXME: too large filter size is not supported now
size_t
kMaxFilterPixels
=
848
/
(
2
*
m_algo_param
.
warp_k
/
32
)
-
2
;
size_t
kMaxFilterPixels
=
848
/
(
2
*
m_algo_param
.
warp_k
/
32
)
-
2
;
available
&=
fh
*
fw
<=
kMaxFilterPixels
;
available
&=
fh
*
fw
<=
kMaxFilterPixels
;
bool
use_conv_filter_unity_opt
=
(
fh
==
1
&&
fw
==
1
);
bool
without_shared_load
=
(
param
.
format
==
Format
::
NCHW32
);
const
auto
*
op
=
get_cutlass_conv_op
(
args
,
ConvOperator
::
kFprop
,
ConvType
::
kConvolution
,
use_conv_filter_unity_opt
,
without_shared_load
);
available
&=
(
op
!=
nullptr
);
return
available
;
return
available
;
}
}
...
@@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
         gamma = z_scale / dst_scale;
     }
     float delta = 0.f, theta = 0.f, threshold = 0.f;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = (param.format == Format::NCHW32);
-    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
-                                         ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
     execute_cutlass_conv_op(
             op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,
...
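Note that the rename above also flips the flag's polarity: load_from_const was true unless the filter was 1x1, while use_conv_filter_unity_opt is true exactly for 1x1 filters. Restated in two lines (illustrative only):

    bool fh_is_1 = true, fw_is_1 = true;                    // a 1x1 filter
    bool load_from_const = !(fh_is_1 && fw_is_1);           // old flag: false
    bool use_conv_filter_unity_opt = (fh_is_1 && fw_is_1);  // new flag: true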
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View file @ 11f022ff
...
@@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     // FIXME: too large filter size is not supported now
     size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2;
     available &= fh * fw <= kMaxFilterPixels;
+
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = false;
+    const auto* op = get_cutlass_conv_op(
+            args, ConvOperator::kFprop, ConvType::kConvolution,
+            use_conv_filter_unity_opt, without_shared_load);
+    available &= (op != nullptr);
+
     return available;
 }
...
@@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
         }
     }
     float threshold = 0.f;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = false;
-    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
-                                         ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+    const auto* op = get_cutlass_conv_op(
+            args, ConvOperator::kFprop, ConvType::kConvolution,
+            use_conv_filter_unity_opt, without_shared_load);
     execute_cutlass_conv_op(
             op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr,
...
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp 0 → 100644 View file @ 11f022ff
/**
* \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"
using namespace megdnn;
using namespace cuda;
using namespace convolution;

#if CUDA_VERSION >= 10020
bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available(
        const SizeArgs& args) const {
    if (args.bias_layout->ndim <= 0)
        return false;

    using Param = param::ConvBias;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using NonlineMode = megdnn::param::ConvBias::NonlineMode;

    auto&& param = args.opr->param();

    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
        return false;

    if (param.format != Format::NHWC || param.sparse != Sparse::DENSE ||
        param.mode != Mode::CROSS_CORRELATION)
        return false;

    if (param.nonlineMode != NonlineMode::IDENTITY &&
        param.nonlineMode != NonlineMode::RELU &&
        param.nonlineMode != NonlineMode::H_SWISH)
        return false;

    if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 ||
        args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8)
        return false;

    auto dst_dtype = args.dst_layout->dtype.enumv();
    if (!(dst_dtype == DTypeEnum::QuantizedS8 ||
          dst_dtype == DTypeEnum::QuantizedS4 ||
          dst_dtype == DTypeEnum::Quantized4Asymm ||
          dst_dtype == DTypeEnum::Float32))
        return false;

    if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 ||
          (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
           dst_dtype == DTypeEnum::Float32)))
        return false;

    if (!is_compute_capability_required(7, 5))
        return false;

    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](3),
           fh = args.filter_layout->operator[](1),
           fw = args.filter_layout->operator[](2);

    // param buffer size is 4K, use 3.4K to store precomputed offset
    size_t kMaxFilterPixels =
            848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1;
    if (fh * fw > kMaxFilterPixels)
        return false;

    // co should be aligned with 4, and ci should be aligned with
    // algo_param.access_size
    if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0))
        return false;

    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                (m_algo_param.threadblock_n == 16 ||
                                 (m_algo_param.threadblock_n == 32 &&
                                  dst_dtype != DTypeEnum::Float32)));

    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kConvolution,
            use_conv_filter_unity_opt, without_shared_load);
    if (op == nullptr)
        return false;

    return true;
}
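As an aside, the NHWC availability rules reduce to two alignment predicates that can be checked standalone. A sketch, assuming access_size = 16 int8 elements, as in the algorithms exercised by the unit tests further down:

    #include <cstddef>

    bool nhwc_alignment_ok(std::size_t co, std::size_t ci,
                           std::size_t access_size) {
        // oc must be a multiple of 4; ic a multiple of the vector load width.
        return (co % 4 == 0) && (ci % access_size == 0);
    }

    int main() {
        // The 32x3x3x16 filter used in the tests passes both checks.
        return nhwc_alignment_ok(32, 16, 16) ? 0 : 1;
    }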
size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes(
        const SizeArgs& args) const {
    if (args.preprocessed_filter) {
        return 0;
    } else {
        return args.filter_layout->span().dist_byte();
    }
}

size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
    return 0;
}

SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
        deduce_preprocessed_filter_layout(const SizeArgs& args) const {
    return {args.filter_layout->collapse_contiguous()};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess(
        const ExecArgs& args) const {
    void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
    reorder_filter(args, m_algo_param.access_size, filter_ptr);
}
std::tuple<float, float, float, float, float>
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants(
        const ExecArgs& args) const {
    float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
          filter_scale =
                  args.filter_layout->dtype.param<dtype::QuantizedS8>().scale,
          bias_scale = 1.f, dst_scale;

    if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
        bias_scale = args.bias_layout->dtype.param<dtype::QuantizedS32>().scale;
    }

    uint8_t dst_zero = 0;

    if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        dst_scale = args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
        dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
                           .zero_point;
    } else {  // DTypeEnum::Float32
        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
        dst_scale = 1.f;
    }

    float alpha = src_scale * filter_scale / dst_scale,
          beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
          theta = dst_zero;

    if (args.z_layout->ndim > 0) {
        float z_scale;
        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
            gamma = z_scale / dst_scale;
        } else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
            z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
            gamma = z_scale / dst_scale;
        } else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
            z_scale = args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
            uint8_t z_zero = args.z_layout->dtype.param<dtype::Quantized4Asymm>()
                                     .zero_point;
            gamma = z_scale / dst_scale;
            delta = -z_zero * gamma;
        } else {  // DTypeEnum::Float32
            megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32);
            gamma = 1.f;
        }
    }

    if (args.opr->param().nonlineMode ==
        param::ConvBias::NonlineMode::IDENTITY) {
        delta += theta;
        theta = 0.f;
    }

    return {alpha, beta, gamma, delta, theta};
}
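For reference, this is my reading of how the five constants combine in the fused epilogue, sketched as scalar float math; the actual CUTLASS epilogue functor is not part of this commit:

    // acc: int32 conv accumulator; bias: int32 bias; z_q: quantized z residual
    float epilogue_sketch(int acc, int bias, float z_q, float alpha, float beta,
                          float gamma, float delta, float theta) {
        // alpha = src_scale * filter_scale / dst_scale   rescales the conv sum
        // beta  = bias_scale / dst_scale                 rescales the bias
        // gamma = z_scale / dst_scale                    rescales the z input
        // delta = -z_zero * gamma (+ theta for IDENTITY) cancels z's zero point
        // theta = dst_zero                               re-centers the output
        return alpha * acc + beta * bias + gamma * z_q + delta + theta;
    }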
void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    size_t n = args.src_layout->operator[](0),
           ci = args.src_layout->operator[](3),
           hi = args.src_layout->operator[](1),
           wi = args.src_layout->operator[](2);
    size_t co = args.dst_layout->operator[](3),
           ho = args.dst_layout->operator[](1),
           wo = args.dst_layout->operator[](2);
    UNPACK_CONV_PARAMETER(fm, param);
    MARK_USED_VAR
    void* filter_ptr = nullptr;
    void* bias_ptr = nullptr;
    void* z_ptr = nullptr;

    if (args.preprocessed_filter) {
        filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
    } else {
        filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
        reorder_filter(args, m_algo_param.access_size, filter_ptr);
    }
    bias_ptr = args.bias_tensor->raw_ptr;

    if (args.z_layout->ndim > 0)
        z_ptr = args.z_tensor->raw_ptr;

    // \note these constants of cutlass epilogue will be passed to method
    // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
    // a different dtype here results in undefined epilogue behaviors
    float alpha, beta, gamma, delta, theta;
    std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);

    float dst_scale = 1.f;
    float threshold = 0.f;
    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

    auto dst_dtype = args.dst_layout->dtype.enumv();

    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                (m_algo_param.threadblock_n == 16 ||
                                 (m_algo_param.threadblock_n == 32 &&
                                  dst_dtype != DTypeEnum::Float32)));

    if (dst_dtype == DTypeEnum::QuantizedS8) {  // DTypeEnum::QuantizedS8
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
    } else if (dst_dtype == DTypeEnum::QuantizedS4) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
    } else if (dst_dtype == DTypeEnum::Quantized4Asymm) {
        dst_scale = args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
    } else {  // DTypeEnum::Float32
        dst_scale = 1.f;
    }

    cudaStream_t stream = cuda_stream(args.opr->handle());

    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kConvolution,
            use_conv_filter_unity_opt, without_shared_load);

    execute_cutlass_conv_op(
            op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, z_ptr,
            args.dst_tensor->raw_ptr, nullptr, n, hi, wi, ci, co, fh, fw, ho,
            wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
            &threshold, &dst_scale, stream);

    after_kernel_launch();
}
std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string(
        AlgoParam algo_param) {
    return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m,
                    algo_param.threadblock_n, algo_param.threadblock_k,
                    algo_param.warp_m, algo_param.warp_n, algo_param.warp_k,
                    algo_param.stage, algo_param.access_size);
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter(
        const ExecArgs& args, const int iterleaved,
        void* reordered_filter) const {
    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](3),
           fh = args.filter_layout->operator[](1),
           fw = args.filter_layout->operator[](2);

    cudaStream_t stream = cuda_stream(args.opr->handle());

    // reformat filter from nhwc to ncxhwx and reorder oc
    // use trans_oc threadblock_n must be 16 or 32 and src dtype == dest dtype
    bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
                     (m_algo_param.threadblock_n == 16 ||
                      (m_algo_param.threadblock_n == 32 &&
                       args.dst_layout->dtype.enumv() != DTypeEnum::Float32)));
    uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16;

    uint32_t alignbits = iterleaved * 8;

    cutlass_wrapper::reorder_nhwc_imma_filter<8>(
            reinterpret_cast<int8_t*>(reordered_filter),
            reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
            fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif

// vim: syntax=cpp.doxygen
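To make the reorder step concrete, here is a CPU reference of the plain NHWC to NCxHWx reformat without the optional oc reordering. The index math is my assumption about the NCxHWx layout; the real reorder_nhwc_imma_filter is a CUDA kernel not shown in this commit:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // x = interleave factor in elements, i.e. alignbits / 8 for int8 filters.
    std::vector<int8_t> nhwc_to_ncxhwx(const std::vector<int8_t>& src,
                                       std::size_t co, std::size_t fh,
                                       std::size_t fw, std::size_t ci,
                                       std::size_t x) {
        std::vector<int8_t> dst(src.size());
        for (std::size_t o = 0; o < co; ++o)
            for (std::size_t h = 0; h < fh; ++h)
                for (std::size_t w = 0; w < fw; ++w)
                    for (std::size_t c = 0; c < ci; ++c) {
                        std::size_t s = ((o * fh + h) * fw + w) * ci + c;
                        std::size_t d =
                                (((o * (ci / x) + c / x) * fh + h) * fw + w) * x +
                                c % x;
                        dst[d] = src[s];
                    }
        return dst;
    }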
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp View file @ 11f022ff
...
@@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants(
          args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
          bias_scale =
                  args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
-         dst_scale =
-                 args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-    uint8_t dst_zero =
-            args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
+         dst_scale;
+
+    uint8_t dst_zero = 0;
+
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        dst_scale =
+                args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+        dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
+                           .zero_point;
+    } else {  // DTypeEnum::QuantizedS8
+        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+    }
+
     float alpha = src_scale * filter_scale / dst_scale,
           beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
           theta = dst_zero;

     if (args.z_layout->ndim > 0) {
-        float z_scale =
-                args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-        gamma = z_scale / dst_scale;
-        uint8_t z_zero =
-                args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
-        delta = -z_zero * gamma;
+        float z_scale;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+            z_scale =
+                    args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+            uint8_t z_zero =
+                    args.z_layout->dtype.param<dtype::Quantized4Asymm>()
+                            .zero_point;
+            gamma = z_scale / dst_scale;
+            delta = -z_zero * gamma;
+        } else {  // DTypeEnum::QuantizedS8
+            megdnn_assert(args.z_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+            gamma = z_scale / dst_scale;
+        }
     }

     // identity epilogue has no theta:
...
dnn/src/cuda/conv_bias/opr_impl.h View file @ 11f022ff
...
@@ -65,6 +65,7 @@ public:
     class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
     class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
     class AlgoInt8NCHW32IMMAImplicitGemm;
+    class AlgoInt8NHWCIMMAImplicitGemm;
     class AlgoInt4NCHW64IMMAImplicitGemmBase;
     class AlgoInt4Int4NCHW64IMMAImplicitGemm;
     class AlgoUInt4Int4NCHW64IMMAImplicitGemm;
...
dnn/src/cuda/convolution/backward_data/algo.h View file @ 11f022ff
...
@@ -275,6 +275,7 @@ public:
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                          const SizeArgs& args) const;
+    const void* get_available_op(const SizeArgs& args) const;
     AlgoParam m_algo_param;
     std::string m_name;
 };
...
@@ -295,6 +296,7 @@ public:
 private:
     WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                          const SizeArgs& args) const;
+    const void* get_available_op(const SizeArgs& args) const;
 };

 class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
...
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp View file @ 11f022ff
...
@@ -20,6 +20,43 @@
 using namespace megdnn;
 using namespace cuda;

+const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
+        get_available_op(const SizeArgs& args) const {
+    using namespace cutlass::library;
+    auto&& fm = args.filter_meta;
+    size_t sh = fm.stride[0], sw = fm.stride[1];
+    cutlass::conv::SpecialOptimizeDesc special_optimization =
+            (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
+                                           DECONV_DOUBLE_UPSAMPLING
+                                 : cutlass::conv::SpecialOptimizeDesc::NONE;
+    ConvolutionKey key{cutlass::conv::Operator::kDgrad,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorNC4HW4,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorK4RSC4,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorNC4HW4,
+                       NumericTypeID::kS32,
+                       LayoutTypeID::kTensorNC4HW4,
+                       cutlass::conv::ConvType::kConvolution,
+                       m_algo_param.threadblock_m,
+                       m_algo_param.threadblock_n,
+                       m_algo_param.threadblock_k,
+                       m_algo_param.warp_m,
+                       m_algo_param.warp_n,
+                       m_algo_param.warp_k,
+                       1,
+                       1,
+                       4,
+                       cutlass::epilogue::EpilogueType::
+                               kBiasAddLinearCombinationClamp,
+                       m_algo_param.stage,
+                       special_optimization,
+                       false};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
 bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
         is_available(const SizeArgs& args) const {
     auto&& fm = args.filter_meta;
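The stride-2 branch above picks the dedicated double-upsampling dgrad kernel and everything else takes the generic path; restated in isolation, with a local enum standing in for the cutlass one:

    #include <cstddef>

    enum class SpecialOptimizeDesc {
        NONE,
        CONV_FILTER_UNITY,
        DECONV_DOUBLE_UPSAMPLING
    };

    SpecialOptimizeDesc pick_dgrad_optimization(std::size_t sh, std::size_t sw) {
        return (sh == 2 && sw == 2)
                       ? SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                       : SpecialOptimizeDesc::NONE;
    }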
...
@@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
     // FIXME: too large filter size is not supported now
     available &= fm.spatial[0] * fm.spatial[1] <=
                  (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2);
+    available &= (get_available_op(args) != nullptr);
     // only support sm_61 or later, platform should have fast native int8
     // support
     available &= is_compute_capability_required(6, 1);
...
@@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
             args.grad_layout->dtype.param<dtype::QuantizedS8>().scale;

     // \note these constants of cutlass epilogue will be passed to struct
-    // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a
-    // different dtype here results in undefined epilogue behaviors
+    // `ConvolutionArguments` by pointer and interpreted as ElementCompute*,
+    // a different dtype here results in undefined epilogue behaviors
     float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f,
           gamma = 0.f, delta = 0.f;

     using namespace cutlass::library;

-    // only use 16x64x8_16x64x8_2stages impl
-    ConvolutionKey key{cutlass::conv::Operator::kDgrad,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorNC4HW4,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorK4RSC4,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorNC4HW4,
-                       NumericTypeID::kS32,
-                       LayoutTypeID::kTensorNC4HW4,
-                       cutlass::conv::ConvType::kConvolution,
-                       m_algo_param.threadblock_m,
-                       m_algo_param.threadblock_n,
-                       m_algo_param.threadblock_k,
-                       m_algo_param.warp_m,
-                       m_algo_param.warp_n,
-                       m_algo_param.warp_k,
-                       1,
-                       1,
-                       4,
-                       cutlass::epilogue::EpilogueType::
-                               kBiasAddLinearCombinationClamp,
-                       m_algo_param.stage,
-                       true,
-                       false};
-    const Operation* op = Singleton::get().operation_table.find_op(key);
+    const Operation* op = (const Operation*)get_available_op(args);

     // gcc prints warnings when size_t values are implicitly narrowed to int
     cutlass::conv::Conv2dProblemSize problem_size{
...
@@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() {
     int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
     int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
     int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
-    int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});
 }

 // vim: syntax=cpp.doxygen
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp View file @ 11f022ff
...
@@ -19,6 +19,44 @@
 using namespace megdnn;
 using namespace cuda;

+const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
+        get_available_op(const SizeArgs& args) const {
+    using namespace cutlass::library;
+    auto&& fm = args.filter_meta;
+    size_t sh = fm.stride[0], sw = fm.stride[1];
+    cutlass::conv::SpecialOptimizeDesc special_optimization =
+            (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc::
+                                           DECONV_DOUBLE_UPSAMPLING
+                                 : cutlass::conv::SpecialOptimizeDesc::NONE;
+    // only use 16x64x8_16x64x8_2stages impl
+    ConvolutionKey key{cutlass::conv::Operator::kDgrad,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorNC4HW4,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorK4RSC4,
+                       NumericTypeID::kS8,
+                       LayoutTypeID::kTensorNC4HW4,
+                       NumericTypeID::kS32,
+                       LayoutTypeID::kTensorNC4HW4,
+                       cutlass::conv::ConvType::kConvolution,
+                       16,
+                       64,
+                       8,
+                       16,
+                       64,
+                       8,
+                       1,
+                       1,
+                       4,
+                       cutlass::epilogue::EpilogueType::
+                               kBiasAddLinearCombinationClamp,
+                       2,
+                       special_optimization,
+                       false};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
 bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
         const SizeArgs& args) const {
     auto&& fm = args.filter_meta;
...
@@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
     available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1);
     // FIXME: too large filter size is not supported now
     available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2);
+
+    available &= (get_available_op(args) != nullptr);
+
     // only support sm_61 or later, platform should have fast native int8
     // support
     available &= is_compute_capability_required(6, 1);
...
@@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(
     using namespace cutlass::library;

-    // only use 16x64x8_16x64x8_2stages impl
-    ConvolutionKey key{cutlass::conv::Operator::kDgrad,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorNC4HW4,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorK4RSC4,
-                       NumericTypeID::kS8,
-                       LayoutTypeID::kTensorNC4HW4,
-                       NumericTypeID::kS32,
-                       LayoutTypeID::kTensorNC4HW4,
-                       cutlass::conv::ConvType::kConvolution,
-                       16,
-                       64,
-                       8,
-                       16,
-                       64,
-                       8,
-                       1,
-                       1,
-                       4,
-                       cutlass::epilogue::EpilogueType::
-                               kBiasAddLinearCombinationClamp,
-                       2,
-                       true,
-                       false};
-    const Operation* op = Singleton::get().operation_table.find_op(key);
+    const Operation* op = (const Operation*)get_available_op(args);

     // gcc prints warnings when size_t values are implicitly narrowed to int
     cutlass::conv::Conv2dProblemSize problem_size{
...
dnn/src/cuda/cutlass/convolution_operation.h View file @ 11f022ff
...
@@ -119,8 +119,8 @@ public:
         m_description.threadblock_swizzle = ThreadblockSwizzleMap<
                 typename Operator::ThreadblockSwizzle>::kId;

-        m_description.need_load_from_const_mem =
-                Operator::kNeedLoadFromConstMem;
+        m_description.special_optimization =
+                Operator::kSpecialOpt;
         m_description.gemm_mode = Operator::kGemmMode;
         m_description.without_shared_load = Operator::kWithoutSharedLoad;
     }
...
dnn/src/cuda/cutlass/library.h View file @ 11f022ff
...
@@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription {
     ThreadblockSwizzleID threadblock_swizzle;

-    bool need_load_from_const_mem;
+    conv::SpecialOptimizeDesc special_optimization;
     conv::ImplicitGemmMode gemm_mode;
     bool without_shared_load;
 };
...
dnn/src/cuda/cutlass/operation_table.cpp View file @ 11f022ff
...
@@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc(
     key.epilogue_type = desc.epilogue_type;
     key.stages = desc.tile_description.threadblock_stages;
-    key.need_load_from_const_mem = desc.need_load_from_const_mem;
+    key.special_optimization = desc.special_optimization;
     key.without_shared_load = desc.without_shared_load;

     return key;
...
@@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) {
 /////////////////////////////////////////////////////////////////////////////////////////////////

 Operation const* OperationTable::find_op(GemmKey const& key) const {
-    megdnn_assert(gemm_operations.count(key) > 0,
-                  "key not found in cutlass operation table");
-    auto const& ops = gemm_operations.at(key);
-    megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
-                  ops.size());
-    return ops[0];
+    if (gemm_operations.count(key)) {
+        auto const& ops = gemm_operations.at(key);
+        megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
+                      ops.size());
+        return ops[0];
+    }
+    return nullptr;
 }

 /////////////////////////////////////////////////////////////////////////////////////////////////

 Operation const* OperationTable::find_op(ConvolutionKey const& key) const {
-    megdnn_assert(convolution_operations.count(key) > 0,
-                  "key not found in cutlass operation table");
-    auto const& ops = convolution_operations.at(key);
-    megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
-                  ops.size());
-    return ops[0];
+    if (convolution_operations.count(key) > 0) {
+        auto const& ops = convolution_operations.at(key);
+        megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu",
+                      ops.size());
+        return ops[0];
+    }
+    return nullptr;
 }

 /////////////////////////////////////////////////////////////////////////////////////////////////
...
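Since find_op now returns nullptr instead of asserting, a table lookup doubles as an availability probe. A self-contained analogue of the new contract (generic types, not the cutlass classes):

    #include <cstdio>
    #include <map>
    #include <string>

    struct Operation { std::string name; };

    const Operation* find_op(const std::map<std::string, Operation>& table,
                             const std::string& key) {
        auto it = table.find(key);
        return it == table.end() ? nullptr : &it->second;  // no assert on miss
    }

    int main() {
        std::map<std::string, Operation> table{
                {"fprop_s8_nhwc", {"fprop_s8_nhwc"}}};
        if (const Operation* op = find_op(table, "dgrad_s8_nchw4"))
            std::printf("found %s\n", op->name.c_str());
        else
            std::printf("not available\n");  // taken here: key is absent
        return 0;
    }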
dnn/src/cuda/cutlass/operation_table.h View file @ 11f022ff
...
@@ -211,7 +211,7 @@ struct ConvolutionKey {
     epilogue::EpilogueType epilogue_type;
     int stages;
-    bool need_load_from_const_mem;
+    conv::SpecialOptimizeDesc special_optimization;
     bool without_shared_load;

     inline bool operator==(ConvolutionKey const& rhs) const {
...
@@ -234,7 +234,7 @@ struct ConvolutionKey {
                (instruction_shape_n == rhs.instruction_shape_n) &&
                (instruction_shape_k == rhs.instruction_shape_k) &&
                (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
-               (need_load_from_const_mem == rhs.need_load_from_const_mem) &&
+               (special_optimization == rhs.special_optimization) &&
                (without_shared_load == rhs.without_shared_load);
     }
...
@@ -270,8 +270,8 @@ struct ConvolutionKey {
                "\n    instruction_shape: " + instruction_shape_str +
                "\n    epilogue_type: " + to_string(epilogue_type) +
                "\n    stages: " + std::to_string(stages) +
-               "\n    need_load_from_const_mem: " +
-               to_string(need_load_from_const_mem) +
+               "\n    special_optimization: " +
+               to_string(special_optimization) +
                "\n    without_shared_load: " + to_string(without_shared_load) +
                "\n}";
     }
...
@@ -308,8 +308,8 @@ struct ConvolutionKeyHasher {
                         sizeof(key.instruction_shape_k))
                 .update(&key.epilogue_type, sizeof(key.epilogue_type))
                 .update(&key.stages, sizeof(key.stages))
-                .update(&key.need_load_from_const_mem,
-                        sizeof(key.need_load_from_const_mem))
+                .update(&key.special_optimization,
+                        sizeof(key.special_optimization))
                 .update(&key.without_shared_load,
                         sizeof(key.without_shared_load))
                 .digest();
...
dnn/src/cuda/cutlass/util.cu View file @ 11f022ff
...
@@ -1566,6 +1566,35 @@ char const* to_string(MathOperationID math_op, bool pretty) {
 ///////////////////////////////////////////////////////////////////////////////////////////////////

+static struct {
+    char const* text;
+    char const* pretty;
+    conv::SpecialOptimizeDesc enumerant;
+} SpecialOptimizeDesc_enumerants[] = {
+        {"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE},
+        {"conv_filter_unity", "ConvFilterUnity",
+         conv::SpecialOptimizeDesc::CONV_FILTER_UNITY},
+        {"deconv_double_upsampling", "DeconvDoubleUpsampling",
+         conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING},
+};
+
+/// Converts an SpecialOptimizeDesc enumerant to a string
+char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) {
+    for (auto const& possible : SpecialOptimizeDesc_enumerants) {
+        if (special_opt == possible.enumerant) {
+            if (pretty) {
+                return possible.pretty;
+            } else {
+                return possible.text;
+            }
+        }
+    }
+    return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 static struct {
     char const* text;
     char const* pretty;
...
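Usage of the new conversion, with the signature as declared in util.h below (fragment; assumes the cutlass library headers are included):

    char const* s = to_string(conv::SpecialOptimizeDesc::CONV_FILTER_UNITY);
    // s == "conv_filter_unity"; passing pretty = true yields "ConvFilterUnity".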
dnn/src/cuda/cutlass/util.h View file @ 11f022ff
...
@@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false);
 /// Converts a MathOperationID enumerant to a string
 char const* to_string(MathOperationID math_op, bool pretty = false);

+/// Converts a SpecialOptimizeDesc enumerant to a string
+char const* to_string(conv::SpecialOptimizeDesc special_opt,
+                      bool pretty = false);
+
 /// Converts an ImplicitGemmMode enumerant to a string
 char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false);
...
dnn/src/cuda/matrix_mul/algos.h View file @ 11f022ff
...
@@ -235,6 +235,7 @@ public:
               m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s",
                               m_algo_param.to_string().c_str())} {}
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     const char* name() const override { return m_name.c_str(); }
     AlgoAttribute attribute() const override {
...
@@ -260,6 +261,7 @@ private:
     void do_exec(const ExecArgs& args) const override;
     int min_alignment_requirement() const override { return 1; }
     std::string m_name;
+    const void* get_available_op(const SizeArgs& args) const;
 };

 class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final
...
@@ -270,6 +272,7 @@ public:
               m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
                               m_algo_param.to_string().c_str())} {}
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     const char* name() const override { return m_name.c_str(); }
     AlgoAttribute attribute() const override {
...
@@ -297,6 +300,7 @@ private:
     void do_exec(const ExecArgs& args) const override;
     int min_alignment_requirement() const override { return 1; }
     std::string m_name;
+    const void* get_available_op(const SizeArgs& args) const;
 };

 class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final
...
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp View file @ 11f022ff
...
@@ -19,6 +19,39 @@
 using namespace megdnn;
 using namespace cuda;

+const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op(
+        const SizeArgs& args) const {
+    using namespace cutlass::library;
+    auto&& param = args.opr->param();
+    auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
+                                    : LayoutTypeID::kRowMajor;
+    auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
+                                    : LayoutTypeID::kRowMajor;
+    int alignment = min_alignment_requirement();
+    GemmKey key{NumericTypeID::kF32,        layoutA,
+                NumericTypeID::kF32,        layoutB,
+                NumericTypeID::kF32,        LayoutTypeID::kRowMajor,
+                NumericTypeID::kF32,        m_algo_param.threadblock_m,
+                m_algo_param.threadblock_n, m_algo_param.threadblock_k,
+                m_algo_param.warp_m,        m_algo_param.warp_n,
+                m_algo_param.warp_k,        1,
+                1,                          1,
+                2,                          alignment,
+                alignment,                  SplitKMode::kNone};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
 bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
         const SizeArgs& args) const {
     bool available =
...
@@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
                           m_algo_param.threadblock_n <=
                   y_grid_limit);

+    available &= (get_available_op(args) != nullptr);
+
     return available;
 }
...
@@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec(
     using namespace cutlass::library;

-    auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
-                                    : LayoutTypeID::kRowMajor;
-    auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
-                                    : LayoutTypeID::kRowMajor;
-    int alignment = min_alignment_requirement();
-    GemmKey key{NumericTypeID::kF32,        layoutA,
-                NumericTypeID::kF32,        layoutB,
-                NumericTypeID::kF32,        LayoutTypeID::kRowMajor,
-                NumericTypeID::kF32,        m_algo_param.threadblock_m,
-                m_algo_param.threadblock_n, m_algo_param.threadblock_k,
-                m_algo_param.warp_m,        m_algo_param.warp_n,
-                m_algo_param.warp_k,        1,
-                1,                          1,
-                2,                          alignment,
-                alignment,                  SplitKMode::kNone};
-    const Operation* op = Singleton::get().operation_table.find_op(key);
+    const Operation* op = (const Operation*)get_available_op(args);

     GemmArguments gemm_args{problem_size,
                             args.tensor_a.raw_ptr,
...
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp View file @ 11f022ff
...
@@ -19,6 +19,39 @@
 using namespace megdnn;
 using namespace cuda;

+const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op(
+        const SizeArgs& args) const {
+    using namespace cutlass::library;
+    auto&& param = args.opr->param();
+    auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
+                                    : LayoutTypeID::kRowMajor;
+    auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
+                                    : LayoutTypeID::kRowMajor;
+    int alignment = min_alignment_requirement();
+    GemmKey key{NumericTypeID::kF32,        layoutA,
+                NumericTypeID::kF32,        layoutB,
+                NumericTypeID::kF32,        LayoutTypeID::kRowMajor,
+                NumericTypeID::kF32,        m_algo_param.threadblock_m,
+                m_algo_param.threadblock_n, m_algo_param.threadblock_k,
+                m_algo_param.warp_m,        m_algo_param.warp_n,
+                m_algo_param.warp_k,        1,
+                1,                          1,
+                2,                          alignment,
+                alignment,                  SplitKMode::kParallel};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
 bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
         const SizeArgs& args) const {
     auto&& param = args.opr->param();
...
@@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
     available &= ((m + m_algo_param.threadblock_m - 1) /
                           m_algo_param.threadblock_m <=
                   y_grid_limit);

+    available &= (get_available_op(args) != nullptr);
+
     return available;
 }
...
@@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec(
     float alpha = 1.f, beta = 0.f;

     using namespace cutlass::library;

-    auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor
-                                    : LayoutTypeID::kRowMajor;
-    auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor
-                                    : LayoutTypeID::kRowMajor;
-    int alignment = min_alignment_requirement();
-    GemmKey key{NumericTypeID::kF32,        layoutA,
-                NumericTypeID::kF32,        layoutB,
-                NumericTypeID::kF32,        LayoutTypeID::kRowMajor,
-                NumericTypeID::kF32,        m_algo_param.threadblock_m,
-                m_algo_param.threadblock_n, m_algo_param.threadblock_k,
-                m_algo_param.warp_m,        m_algo_param.warp_n,
-                m_algo_param.warp_k,        1,
-                1,                          1,
-                2,                          alignment,
-                alignment,                  SplitKMode::kParallel};
-    Operation const* op = Singleton::get().operation_table.find_op(key);
+    const Operation* op = (const Operation*)get_available_op(args);

     GemmArguments gemm_args{problem_size,
                             args.tensor_a.raw_ptr,
...
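Both GEMM algorithms now build their GemmKey in a single helper and reuse it for the availability check and for execution, so the two paths cannot drift apart. The pattern in miniature (sketch with generic types):

    #include <cassert>
    #include <map>

    struct Op {};

    const Op* get_available_op(int key) {
        static std::map<int, Op> table{{0, {}}, {1, {}}};
        auto it = table.find(key);
        return it == table.end() ? nullptr : &it->second;  // nullptr: unavailable
    }

    bool is_available(int key) { return get_available_op(key) != nullptr; }

    void do_exec(int key) {
        const Op* op = get_available_op(key);  // same lookup as is_available
        assert(op != nullptr);
        // ... launch with op ...
    }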
dnn/test/cuda/conv_bias_int8.cpp View file @ 11f022ff
...
@@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
             ConvBias::DirectParam{});
     check(algo);
 }
+
+TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
+    require_compute_capability(7, 5);
+    Checker<ConvBiasForward> checker(handle_cuda());
+    auto check = [&checker](const std::string& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-8, 8};
+        UniformIntRNG bias_rng{-50, 50};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng)
+                .set_dtype(0, dtype::QuantizedS8{1.2f})
+                .set_dtype(1, dtype::QuantizedS8{1.3f})
+                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
+                .set_dtype(3, dtype::QuantizedS8{19.990229f})
+                .set_dtype(4, dtype::QuantizedS8{19.990228f})
+                .set_epsilon(1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 1;
+        param.format = param::ConvBias::Format::NHWC;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
+        param.pad_h = param.pad_w = 0;
+        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
+    };
+    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
+            ConvBias::DirectParam{});
+    check(algo);
+    algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
+            ConvBias::DirectParam{});
+    check(algo);
+}
+
+TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
+    require_compute_capability(7, 5);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle_cuda());
+    auto check = [&checker](const std::string& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-8, 8};
+        UniformIntRNG bias_rng{-50, 50};
+        UniformIntRNG rng_u4{0, 15};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &bias_rng)
+                .set_rng(3, &rng_u4)
+                .set_dtype(0, dtype::QuantizedS8{0.2f})
+                .set_dtype(1, dtype::QuantizedS8{0.3f})
+                .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
+                .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
+                .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
+                .set_epsilon(1 + 1e-3);
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 1;
+        param.format = param::ConvBias::Format::NHWC;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
+        param.pad_h = param.pad_w = 0;
+        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
+    };
+    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
+            ConvBias::DirectParam{});
+    check(algo);
+    algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
+            ConvBias::DirectParam{});
+    check(algo);
+}
+
+TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
+    require_compute_capability(7, 5);
+    Checker<ConvBiasForward> checker(handle_cuda());
+    auto check = [&checker](const std::string& algo) {
+        checker.set_before_exec_callback(
+                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
+        UniformIntRNG rng{-8, 8};
+        UniformFloatRNG float_rng{-50, 50};
+        checker.set_rng(0, &rng)
+                .set_rng(1, &rng)
+                .set_rng(2, &float_rng)
+                .set_rng(3, &float_rng)
+                .set_dtype(0, dtype::QuantizedS8(1.9980618f))
+                .set_dtype(1, dtype::QuantizedS8(1.9980927f))
+                .set_dtype(2, dtype::Float32())
+                .set_dtype(3, dtype::Float32())
+                .set_dtype(4, dtype::Float32());
+        param::ConvBias param;
+        param.pad_h = param.pad_w = 1;
+        param.stride_h = param.stride_w = 1;
+        param.format = param::ConvBias::Format::NHWC;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
+        param.pad_h = param.pad_w = 0;
+        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+        checker.set_param(param).execs(
+                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
+    };
+    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
+            ConvBias::DirectParam{});
+    check(algo);
+    algo = ConvBias::algo_name<ConvBias::DirectParam>(
+            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
+            ConvBias::DirectParam{});
+    check(algo);
+}
 #endif

 TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
...
@@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
                                      ConvBiasForward>(
             ConvBias::algo_name<ConvBias::DirectParam>(
-                    "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2",
+                    "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
                     ConvBias::DirectParam{})
                     .c_str()));
     checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
...
@@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
             "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
             param::ConvBias::Format::NCHW32);
 }
+
+TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) {
+    require_compute_capability(7, 5);
+    benchmark_target_algo_with_cudnn_tsc(
+            handle_cuda(), get_det_first_bench_args(16),
+            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
+            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
+            "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM",
+            param::ConvBias::Format::NHWC);
+}
 #endif

 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
...
dnn/test/cuda/conv_test_utils.cpp View file @ 11f022ff
...
@@ -102,9 +102,7 @@ std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
     args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
     args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
     args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
-    args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2});
     args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
-    args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2});
     return args;
 }
...
@@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc(
                           .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                     shape[3]})
                           .dimshuffle({1, 3, 4, 0, 2}));
+        } else if (format == Format::NHWC) {
+            ret = static_cast<TensorShape>(
+                    TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
         }
         return ret;
     };
...
@@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc(
         if ((format == Format::CHWN4 || format == Format::NCHW4) &&
             (arg.ci % 16 != 0))
             continue;
+        // skip testcase which cannot enable nhwc tensorcore
+        if ((format == Format::NHWC) &&
+            (arg.ci % 4 != 0 || arg.co % 4 != 0))
+            continue;
         Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                       ? Format::NCHW32
                                       : Format::NCHW4;
...
dnn/test/cuda/convolution.cpp View file @ 11f022ff
...
@@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
     all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
     all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
     all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
-    all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});
     for (auto algo_param : all_params) {
         Checker<ConvolutionBackwardData> checker(handle_cuda());
...