Commit 4eda3388 authored by Megvii Engine Team, committed by huangxinda

feat(dnn/cuda): generate cutlass kimpls using cmake and bazel

GitOrigin-RevId: da3bcfb85af3f148116b8b6a51f214c6cef8684e
Parent 8d248a6a
# Mark generated files as binary so git diff ignores them.
# dnn
dnn/scripts/cutlass_generator/list.bzl binary
dnn/src/cuda/conv_bias/int4/kimpl/* binary
dnn/src/cuda/conv_bias/int8/kimpl/* binary
dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
......
load("list.bzl", "cutlass_gen_list")
genrule(
name = "cutlass_kimpls",
outs = cutlass_gen_list,
cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py)
python3 $$GEN --operations gemm --type simt $(@D)
python3 $$GEN --operations gemv --type simt $(@D)
python3 $$GEN --operations deconv --type simt $(@D)
python3 $$GEN --operations conv2d --type simt $(@D)
python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
""",
tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
visibility = ["//visibility:public"],
)
# Generate device kernel registration code for CUTLASS kernels
## Usage
```bash
python3 generator.py [--operations {gemm,gemv,conv2d,deconv}] [--type {simt,tensorop8816,tensorop8832}] output
```
- operations: the operation kind to generate: gemm | gemv | conv2d | deconv
- type: the opcode class: simt | tensorop8816 | tensorop8832
- output: the output directory for the generated CUTLASS kernels
## Generate file list for bazel
We generate `list.bzl` because Bazel's `genrule` requires the output file list to be known in the analysis phase. Run `gen_list.py` to regenerate it whenever new operations are added.
```bash
python3 gen_list.py
```
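For reference, the emitted `list.bzl` is a Starlark file with the following shape; the entry shown is one real kernel name from this commit, and the full list contains one entry per generated kernel:
```python
# Generated by dnn/scripts/cutlass_generator/gen_list.py

cutlass_gen_list = [
    "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu",
    # ... remaining kernels elided ...
]
```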
This diff is collapsed.
This diff is collapsed.
from generator import (
GenerateGemmOperations,
GenerateGemvOperations,
GenerateConv2dOperations,
GenerateDeconvOperations,
)
class GenArg:
def __init__(self, gen_op, gen_type):
self.operations = gen_op
self.type = gen_type
def write_op_list(f, gen_op, gen_type):
if gen_op == "gemm":
operations = GenerateGemmOperations(GenArg(gen_op, gen_type))
elif gen_op == "gemv":
operations = GenerateGemvOperations(GenArg(gen_op, gen_type))
elif gen_op == "conv2d":
operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
elif gen_op == "deconv":
operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
else:
raise ValueError("unknown operation: %s" % gen_op)
for op in operations:
f.write(' "%s.cu",\n' % op.procedural_name())
if __name__ == "__main__":
with open("list.bzl", "w") as f:
f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
f.write("cutlass_gen_list = [\n")
write_op_list(f, "gemm", "simt")
write_op_list(f, "gemv", "simt")
write_op_list(f, "deconv", "simt")
write_op_list(f, "conv2d", "simt")
write_op_list(f, "conv2d", "tensorop8816")
write_op_list(f, "conv2d", "tensorop8832")
f.write("]")
This diff is collapsed.
#
# \file lazy_file.py
#
# \brief LazyFile updates the target file only when its content changes,
# to avoid regenerating cutlass kimpls every time CMake runs
#
import io
import os
class LazyFile:
def __init__(self, filename):
self.filename = filename
self.buffer = io.StringIO()
def write(self, data):
self.buffer.write(str(data))
def close(self):
if os.path.isfile(self.filename):
old_data = open(self.filename).read()
else:
old_data = ""
new_data = self.buffer.getvalue()
if old_data != new_data:
with open(self.filename, "w") as f:
f.write(new_data)
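A minimal usage sketch (the path and list entry below are placeholders): `LazyFile` buffers every `write` in memory and compares against the file on disk at `close()`, so the target's mtime only changes when the generated content does.
```python
f = LazyFile("list.bzl")                # placeholder path
f.write("cutlass_gen_list = [\n")
f.write('    "example_kernel.cu",\n')   # hypothetical entry
f.write("]")
f.close()  # rewrites list.bzl only if the buffered content differs from disk
```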
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
import re
###################################################################################################
import enum
# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
try:
from enum import auto as enum_auto
except ImportError:
__cutlass_library_auto_enum = 0
def enum_auto() -> int:
global __cutlass_library_auto_enum
i = __cutlass_library_auto_enum
__cutlass_library_auto_enum += 1
return i
###################################################################################################
#
class GeneratorTarget(enum.Enum):
Library = enum_auto()
#
GeneratorTargetNames = {
GeneratorTarget.Library: 'library'
}
#
###################################################################################################
#
class DataType(enum.Enum):
b1 = enum_auto()
u4 = enum_auto()
u8 = enum_auto()
u16 = enum_auto()
u32 = enum_auto()
u64 = enum_auto()
s4 = enum_auto()
s8 = enum_auto()
s16 = enum_auto()
s32 = enum_auto()
s64 = enum_auto()
f16 = enum_auto()
bf16 = enum_auto()
f32 = enum_auto()
tf32 = enum_auto()
f64 = enum_auto()
cf16 = enum_auto()
cbf16 = enum_auto()
cf32 = enum_auto()
ctf32 = enum_auto()
cf64 = enum_auto()
cs4 = enum_auto()
cs8 = enum_auto()
cs16 = enum_auto()
cs32 = enum_auto()
cs64 = enum_auto()
cu4 = enum_auto()
cu8 = enum_auto()
cu16 = enum_auto()
cu32 = enum_auto()
cu64 = enum_auto()
invalid = enum_auto()
#
ShortDataTypeNames = {
DataType.s32: 'i',
DataType.f16: 'h',
DataType.f32: 's',
DataType.f64: 'd',
DataType.cf32: 'c',
DataType.cf64: 'z',
}
#
DataTypeNames = {
DataType.b1: "b1",
DataType.u4: "u4",
DataType.u8: "u8",
DataType.u16: "u16",
DataType.u32: "u32",
DataType.u64: "u64",
DataType.s4: "s4",
DataType.s8: "s8",
DataType.s16: "s16",
DataType.s32: "s32",
DataType.s64: "s64",
DataType.f16: "f16",
DataType.bf16: "bf16",
DataType.f32: "f32",
DataType.tf32: "tf32",
DataType.f64: "f64",
DataType.cf16: "cf16",
DataType.cbf16: "cbf16",
DataType.cf32: "cf32",
DataType.ctf32: "ctf32",
DataType.cf64: "cf64",
DataType.cu4: "cu4",
DataType.cu8: "cu8",
DataType.cu16: "cu16",
DataType.cu32: "cu32",
DataType.cu64: "cu64",
DataType.cs4: "cs4",
DataType.cs8: "cs8",
DataType.cs16: "cs16",
DataType.cs32: "cs32",
DataType.cs64: "cs64",
}
DataTypeTag = {
DataType.b1: "cutlass::uint1b_t",
DataType.u4: "cutlass::uint4b_t",
DataType.u8: "uint8_t",
DataType.u16: "uint16_t",
DataType.u32: "uint32_t",
DataType.u64: "uint64_t",
DataType.s4: "cutlass::int4b_t",
DataType.s8: "int8_t",
DataType.s16: "int16_t",
DataType.s32: "int32_t",
DataType.s64: "int64_t",
DataType.f16: "cutlass::half_t",
DataType.bf16: "cutlass::bfloat16_t",
DataType.f32: "float",
DataType.tf32: "cutlass::tfloat32_t",
DataType.f64: "double",
DataType.cf16: "cutlass::complex<cutlass::half_t>",
DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
DataType.cf32: "cutlass::complex<float>",
DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
DataType.cf64: "cutlass::complex<double>",
DataType.cu4: "cutlass::complex<cutlass::uint4b_t>",
DataType.cu8: "cutlass::complex<cutlass::uint8_t>",
DataType.cu16: "cutlass::complex<cutlass::uint16_t>",
DataType.cu32: "cutlass::complex<cutlass::uint32_t>",
DataType.cu64: "cutlass::complex<cutlass::uint64_t>",
DataType.cs4: "cutlass::complex<cutlass::int4b_t>",
DataType.cs8: "cutlass::complex<cutlass::int8_t>",
DataType.cs16: "cutlass::complex<cutlass::int16_t>",
DataType.cs32: "cutlass::complex<cutlass::int32_t>",
DataType.cs64: "cutlass::complex<cutlass::int64_t>",
}
DataTypeSize = {
DataType.b1: 1,
DataType.u4: 4,
DataType.u8: 8,
DataType.u16: 16,
DataType.u32: 32,
DataType.u64: 64,
DataType.s4: 4,
DataType.s8: 8,
DataType.s16: 16,
DataType.s32: 32,
DataType.s64: 64,
DataType.f16: 16,
DataType.bf16: 16,
DataType.f32: 32,
DataType.tf32: 32,
DataType.f64: 64,
DataType.cf16: 32,
DataType.cbf16: 32,
DataType.cf32: 64,
DataType.ctf32: 32,
DataType.cf64: 128,
DataType.cu4: 8,
DataType.cu8: 16,
DataType.cu16: 32,
DataType.cu32: 64,
DataType.cu64: 128,
DataType.cs4: 8,
DataType.cs8: 16,
DataType.cs16: 32,
DataType.cs32: 64,
DataType.cs64: 128,
}
###################################################################################################
#
class ComplexTransform(enum.Enum):
none = enum_auto()
conj = enum_auto()
#
ComplexTransformTag = {
ComplexTransform.none: 'cutlass::ComplexTransform::kNone',
ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate',
}
#
RealComplexBijection = [
(DataType.f16, DataType.cf16),
(DataType.f32, DataType.cf32),
(DataType.f64, DataType.cf64),
]
#
def is_complex(data_type):
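# note: only the real/complex pairs listed in RealComplexBijection are recognized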
for r, c in RealComplexBijection:
if data_type == c:
return True
return False
#
def get_complex_from_real(real_type):
for r, c in RealComplexBijection:
if real_type == r:
return c
return DataType.invalid
#
def get_real_from_complex(complex_type):
for r, c in RealComplexBijection:
if complex_type == c:
return r
return DataType.invalid
#
class ComplexMultiplyOp(enum.Enum):
multiply_add = enum_auto()
gaussian = enum_auto()
###################################################################################################
#
class MathOperation(enum.Enum):
multiply_add = enum_auto()
multiply_add_saturate = enum_auto()
xor_popc = enum_auto()
multiply_add_fast_bf16 = enum_auto()
multiply_add_fast_f16 = enum_auto()
multiply_add_complex = enum_auto()
multiply_add_complex_gaussian = enum_auto()
#
MathOperationTag = {
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
}
###################################################################################################
#
class LayoutType(enum.Enum):
ColumnMajor = enum_auto()
RowMajor = enum_auto()
ColumnMajorInterleaved2 = enum_auto()
RowMajorInterleaved2 = enum_auto()
ColumnMajorInterleaved32 = enum_auto()
RowMajorInterleaved32 = enum_auto()
ColumnMajorInterleaved64 = enum_auto()
RowMajorInterleaved64 = enum_auto()
TensorNHWC = enum_auto()
TensorNDHWC = enum_auto()
TensorNCHW = enum_auto()
TensorNGHWC = enum_auto()
TensorNC4HW4 = enum_auto()
TensorC4RSK4 = enum_auto()
TensorNC8HW8 = enum_auto()
TensorNC16HW16 = enum_auto()
TensorNC32HW32 = enum_auto()
TensorNC64HW64 = enum_auto()
TensorC32RSK32 = enum_auto()
TensorC64RSK64 = enum_auto()
TensorK4RSC4 = enum_auto()
#
LayoutTag = {
LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor',
LayoutType.RowMajor: 'cutlass::layout::RowMajor',
LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC',
LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW',
LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>',
LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>',
LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>',
LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>',
LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>',
}
#
TransposedLayout = {
LayoutType.ColumnMajor: LayoutType.RowMajor,
LayoutType.RowMajor: LayoutType.ColumnMajor,
LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2,
LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2,
LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32,
LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32,
LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64,
LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64,
LayoutType.TensorNHWC: LayoutType.TensorNHWC
}
#
ShortLayoutTypeNames = {
LayoutType.ColumnMajor: 'n',
LayoutType.ColumnMajorInterleaved2: 'n2',
LayoutType.ColumnMajorInterleaved32: 'n32',
LayoutType.ColumnMajorInterleaved64: 'n64',
LayoutType.RowMajor: 't',
LayoutType.RowMajorInterleaved2: 't2',
LayoutType.RowMajorInterleaved32: 't32',
LayoutType.RowMajorInterleaved64: 't64',
LayoutType.TensorNHWC: 'nhwc',
LayoutType.TensorNDHWC: 'ndhwc',
LayoutType.TensorNCHW: 'nchw',
LayoutType.TensorNGHWC: 'nghwc',
LayoutType.TensorNC4HW4: 'nc4hw4',
LayoutType.TensorC4RSK4: 'c4rsk4',
LayoutType.TensorNC8HW8: 'nc8hw8',
LayoutType.TensorNC16HW16: 'nc16hw16',
LayoutType.TensorNC32HW32: 'nc32hw32',
LayoutType.TensorNC64HW64: 'nc64hw64',
LayoutType.TensorC32RSK32: 'c32rsk32',
LayoutType.TensorC64RSK64: 'c64rsk64',
LayoutType.TensorK4RSC4: 'k4rsc4',
}
#
ShortComplexLayoutNames = {
(LayoutType.ColumnMajor, ComplexTransform.none): 'n',
(LayoutType.ColumnMajor, ComplexTransform.conj): 'c',
(LayoutType.RowMajor, ComplexTransform.none): 't',
(LayoutType.RowMajor, ComplexTransform.conj): 'h'
}
###################################################################################################
#
class OpcodeClass(enum.Enum):
Simt = enum_auto()
TensorOp = enum_auto()
WmmaTensorOp = enum_auto()
OpcodeClassNames = {
OpcodeClass.Simt: 'simt',
OpcodeClass.TensorOp: 'tensorop',
OpcodeClass.WmmaTensorOp: 'wmma_tensorop',
}
OpcodeClassTag = {
OpcodeClass.Simt: 'cutlass::arch::OpClassSimt',
OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
}
###################################################################################################
#
class OperationKind(enum.Enum):
Gemm = enum_auto()
Conv2d = enum_auto()
#
OperationKindNames = {
OperationKind.Gemm: 'gemm',
OperationKind.Conv2d: 'conv2d',
}
#
class Target(enum.Enum):
library = enum_auto()
ArchitectureNames = {
50: 'maxwell',
60: 'pascal',
61: 'pascal',
70: 'volta',
75: 'turing',
80: 'ampere',
}
###################################################################################################
#
def SubstituteTemplate(template, values):
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
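# Example (illustrative): placeholders are replaced until the text stops
# changing, so substitutions that produce new ${...} keys also resolve:
#   SubstituteTemplate("initialize_${configuration_name}(manifest);",
#                      {"configuration_name": "some_config"})  # name is hypothetical
#   -> "initialize_some_config(manifest);"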
###################################################################################################
#
class GemmKind(enum.Enum):
Gemm = enum_auto()
Sparse = enum_auto()
Universal = enum_auto()
PlanarComplex = enum_auto()
PlanarComplexArray = enum_auto()
SplitKParallel = enum_auto()
GemvBatchedStrided = enum_auto()
#
GemmKindNames = {
GemmKind.Gemm: "gemm",
GemmKind.Sparse: "spgemm",
GemmKind.Universal: "gemm",
GemmKind.PlanarComplex: "gemm_planar_complex",
GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
GemmKind.SplitKParallel: "gemm_split_k_parallel",
GemmKind.GemvBatchedStrided: "gemv_batched_strided",
}
#
class EpilogueFunctor(enum.Enum):
LinearCombination = enum_auto()
LinearCombinationClamp = enum_auto()
BiasAddLinearCombination = enum_auto()
BiasAddLinearCombinationRelu = enum_auto()
BiasAddLinearCombinationHSwish = enum_auto()
BiasAddLinearCombinationClamp = enum_auto()
BiasAddLinearCombinationReluClamp = enum_auto()
BiasAddLinearCombinationHSwishClamp = enum_auto()
#
EpilogueFunctorTag = {
EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination',
EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp',
EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination',
EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu',
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish',
EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp',
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp',
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp',
}
#
ShortEpilogueNames = {
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish',
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu',
EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity',
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish',
EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu',
EpilogueFunctor.BiasAddLinearCombination: 'identity',
}
#
class SwizzlingFunctor(enum.Enum):
Identity1 = enum_auto()
Identity2 = enum_auto()
Identity4 = enum_auto()
Identity8 = enum_auto()
ConvFpropNCxHWx = enum_auto()
ConvFpropNHWC = enum_auto()
ConvDgradNCxHWx = enum_auto()
#
SwizzlingFunctorTag = {
SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle',
SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle',
SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle',
}
###################################################################################################
class ConvType(enum.Enum):
Convolution = enum_auto()
BatchConvolution = enum_auto()
Local = enum_auto()
LocalShare = enum_auto()
ConvTypeTag = {
ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution',
ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution',
ConvType.Local: 'cutlass::conv::ConvType::kLocal',
ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare',
}
#
class ConvKind(enum.Enum):
Fprop = enum_auto()
Dgrad = enum_auto()
Wgrad = enum_auto()
#
ConvKindTag = {
ConvKind.Fprop: 'cutlass::conv::Operator::kFprop',
ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad',
ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad'
}
ConvKindNames = {
ConvKind.Fprop: 'fprop',
ConvKind.Dgrad: 'dgrad',
ConvKind.Wgrad: 'wgrad',
}
#
class IteratorAlgorithm(enum.Enum):
Analytic = enum_auto()
Optimized = enum_auto()
#
IteratorAlgorithmTag = {
IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
}
IteratorAlgorithmNames = {
IteratorAlgorithm.Analytic: 'analytic',
IteratorAlgorithm.Optimized: 'optimized',
}
#
class StrideSupport(enum.Enum):
Strided = enum_auto()
Unity = enum_auto()
#
StrideSupportTag = {
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}
StrideSupportNames = {
StrideSupport.Strided: '',
StrideSupport.Unity: 'unity_stride',
}
class ImplicitGemmMode(enum.Enum):
GemmNt = enum_auto()
GemmTn = enum_auto()
ImplicitGemmModeNames = {
ImplicitGemmMode.GemmNt: 'gemm_nt',
ImplicitGemmMode.GemmTn: 'gemm_tn',
}
ImplicitGemmModeTag = {
ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT',
ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN',
}
###################################################################################################
#
class MathInstruction:
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add):
self.instruction_shape = instruction_shape
self.element_a = element_a
self.element_b = element_b
self.element_accumulator = element_accumulator
self.opcode_class = opcode_class
self.math_operation = math_operation
#
class TileDescription:
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute):
self.threadblock_shape = threadblock_shape
self.stages = stages
self.warp_count = warp_count
self.math_instruction = math_instruction
self.minimum_compute_capability = min_compute
self.maximum_compute_capability = max_compute
def procedural_name(self):
return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages)
#
class TensorDescription:
def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none):
self.element = element
self.layout = layout
self.alignment = alignment
self.complex_transform = complex_transform
###################################################################################################
This diff is collapsed.
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
import enum
import os.path
import shutil
from library import *
from gemm_operation import *
from conv2d_operation import *
###################################################################################################
class EmitOperationKindLibrary:
def __init__(self, generated_path, kind, args):
self.generated_path = generated_path
self.kind = kind
self.args = args
self.emitters = {
OperationKind.Gemm: EmitGemmConfigurationLibrary,
OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
}
self.configurations = []
self.header_template = """
/*
Generated by manifest.py - Do not edit.
*/
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
"""
self.entry_template = """
//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
self.configuration_template = " initialize_${configuration_name}(manifest);\n"
self.epilogue_template = """
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
"""
#
def __enter__(self):
self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
os.mkdir(self.operation_path)
self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind])
self.top_level_file = open(self.top_level_path, "w")
self.top_level_file.write(self.header_template)
self.source_files = [self.top_level_path,]
return self
#
def emit(self, configuration_name, operations):
with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter:
for operation in operations:
configuration_emitter.emit(operation)
self.source_files.append(configuration_emitter.configuration_path)
self.configurations.append(configuration_name)
self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} ))
#
def __exit__(self, exception_type, exception_value, traceback):
self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))
for configuration_name in self.configurations:
self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))
self.top_level_file.write(self.epilogue_template)
self.top_level_file.close()
###################################################################################################
###################################################################################################
class Options:
def __init__(self):
pass
###################################################################################################
#
class Manifest:
#
def __init__(self, args):
self.operations = {}
self.args = args
architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
self.compute_capabilities = [int(x) for x in architectures]
self.selected_kernels = []
if args.operations == 'all':
self.operations_enabled = []
else:
operations_list = [
OperationKind.Gemm,
OperationKind.Conv2d,
]
self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
if args.kernels == 'all':
self.kernel_names = []
else:
self.kernel_names = [x for x in args.kernels.split(',') if x != '']
self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
if args.kernel_filter_file is None:
self.kernel_filter_list = []
else:
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
self.operation_count = 0
self.operations_by_name = {}
self.top_level_prologue = '''
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
namespace cutlass {
namespace library {
${prototypes}
void initialize_all(Manifest &manifest) {
'''
self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n'
self.top_level_epilogue = '''
}
} // namespace library
} // namespace cutlass
'''
def get_kernel_filters(self, kernelListFile):
if os.path.isfile(kernelListFile):
with open(kernelListFile, 'r') as fileReader:
lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
lines = [re.compile(line) for line in lines if line]
return lines
else:
return []
def filter_out_kernels(self, kernel_name, kernel_filter_list):
for kernel_filter_re in kernel_filter_list:
if kernel_filter_re.search(kernel_name) is not None:
return True
return False
#
def _filter_string_matches(self, filter_string, haystack):
''' Returns true if all substrings appear in the haystack in order'''
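# e.g. (illustrative) "s4_*_hswish" matches "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16"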
substrings = filter_string.split('*')
for sub in substrings:
idx = haystack.find(sub)
if idx < 0:
return False
haystack = haystack[idx + len(sub):]
return True
#
def filter(self, operation):
''' Filtering operations based on various criteria'''
# filter based on compute capability
enabled = False
for cc in self.compute_capabilities:
if cc >= operation.tile_description.minimum_compute_capability and \
cc <= operation.tile_description.maximum_compute_capability:
enabled = True
break
if not enabled:
return False
if len(self.operations_enabled) and operation.operation_kind not in self.operations_enabled:
return False
# eliminate duplicates
if operation.procedural_name() in self.operations_by_name.keys():
return False
# Filter based on list of valid substrings
if len(self.kernel_names):
name = operation.procedural_name()
enabled = False
# compare against the include list
for name_substr in self.kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = True
break
# compare against the exclude list
for name_substr in self.ignore_kernel_names:
if self._filter_string_matches(name_substr, name):
enabled = False
break
if len(self.kernel_filter_list) > 0:
enabled = False
if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
enabled = True
# todo: filter based on compute data type
return enabled
#
#
def append(self, operation):
'''
Inserts the operation.
operation_kind -> configuration_name -> []
'''
if self.filter(operation):
self.selected_kernels.append(operation.procedural_name())
self.operations_by_name[operation.procedural_name()] = operation
# add the configuration
configuration_name = operation.configuration_name()
if operation.operation_kind not in self.operations.keys():
self.operations[operation.operation_kind] = {}
if configuration_name not in self.operations[operation.operation_kind].keys():
self.operations[operation.operation_kind][configuration_name] = []
self.operations[operation.operation_kind][configuration_name].append(operation)
self.operation_count += 1
#
#
def emit(self, target = GeneratorTarget.Library):
operation_emitters = {
GeneratorTarget.Library: EmitOperationKindLibrary
}
generated_path = os.path.join(self.args.curr_build_dir, 'generated')
# create generated/
if os.path.exists(generated_path):
shutil.rmtree(generated_path)
os.mkdir(generated_path)
source_files = []
top_level_path = os.path.join(generated_path, 'initialize_all.cpp')
with open(top_level_path, 'w') as top_level_file:
if target == GeneratorTarget.Library:
source_files.append(top_level_path)
prototypes = []
for operation_kind, configurations in self.operations.items():
prototypes.append(SubstituteTemplate(
"void initialize_all_${operation_kind}_operations(Manifest &manifest);",
{'operation_kind': OperationKindNames[operation_kind]}))
top_level_file.write(SubstituteTemplate(self.top_level_prologue,
{'prototypes': "\n".join(prototypes)}))
top_level_file.write(SubstituteTemplate(
self.top_level_reserve, {'operation_count': str(self.operation_count)}))
# for each operation kind, emit initializer for all configurations
for operation_kind, configurations in self.operations.items():
with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
for configuration_name, operations in configurations.items():
operation_kind_emitter.emit(configuration_name, operations)
source_files += operation_kind_emitter.source_files
top_level_file.write(SubstituteTemplate(
" initialize_all_${operation_kind}_operations(manifest);\n",
{'operation_kind': OperationKindNames[operation_kind]}))
top_level_file.write(self.top_level_epilogue)
# write the manifest.cmake file containing paths from all targets
manifest_path = os.path.join(generated_path, "manifest.cmake")
with open(manifest_path, "w") as manifest_file:
target_name = 'cutlass_library_objs'
target_text = SubstituteTemplate("""cutlass_target_sources(
${target_name}
BATCH_SOURCES ON
PRIVATE
""", { 'target_name': target_name})
manifest_file.write(target_text)
for source_file in source_files:
manifest_file.write(" %s\n" % str(source_file.replace('\\', '/')))
manifest_file.write(")")
#
###################################################################################################
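Putting the pieces together, a rough sketch of how a driver like `generator.py` could feed a `Manifest` follows; the real driver sits in a collapsed diff above, so the `args` fields here are assumptions read off `Manifest.__init__` and `gen_list.py`:
```python
# Sketch only: field names are assumed from Manifest.__init__ and gen_list.py.
class Args:
    architectures = "75"        # ';'-separated compute capabilities
    operations = "conv2d"       # or 'all'
    type = "tensorop8832"       # consumed by the Generate*Operations helpers
    kernels = "all"
    ignore_kernels = ""
    kernel_filter_file = None
    curr_build_dir = "."        # 'generated/' is created under this directory

args = Args()
manifest = Manifest(args)
for op in GenerateConv2dOperations(args):  # same call shape as in gen_list.py
    manifest.append(op)                    # filters and de-duplicates
manifest.emit(GeneratorTarget.Library)     # writes generated sources and manifest.cmake
```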
......
@@ -113,6 +113,31 @@ if(MGE_WITH_CUDA)
list(APPEND SOURCES ${SOURCES_})
file(GLOB_RECURSE CUSOURCES cuda/*.cu)
set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py)
set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated)
function(gen_cutlass_kimpl op type)
set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type})
file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR})
execute_process(
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR}
RESULT_VARIABLE gen_cutlass_result
OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log
ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log
)
if (NOT gen_cutlass_result EQUAL 0)
message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log")
endif()
endfunction()
gen_cutlass_kimpl(gemm simt)
gen_cutlass_kimpl(gemv simt)
gen_cutlass_kimpl(deconv simt)
gen_cutlass_kimpl(conv2d simt)
gen_cutlass_kimpl(conv2d tensorop8816)
gen_cutlass_kimpl(conv2d tensorop8832)
file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu)
list(APPEND SOURCES ${CUTLASS_SOURCES})
list(APPEND SOURCES ${CUSOURCES})
endif()
......
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<32>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
32,
32,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<8>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 64>,
cutlass::gemm::GemmShape<64, 32, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
8,
8,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<16>,
cutlass::int4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
2,
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif