feat(dnn/cuda): add cudnn frontend api

GitOrigin-RevId: 9b18a57893fd7059a29edc11fc04c14d8176e547
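The V8 algos added here all follow the same pattern: ask the cudnn-frontend heuristics for an execution plan, size the workspace from that plan, then launch it. A minimal sketch of the flow, using only names declared in the new src/cuda/cudnn_wrapper_v8.h (operator and filter-meta setup are elided, and the layout/tensor variables stand in for the algo's SizeArgs / ExecArgs fields):
// 1. query the heuristics; a nullptr plan means the algo is unavailable
auto plan = get_heuristic_plan_from_opr(
conv_opr.get(), src_layout, dst_layout, filter_layout, {}, {}, fm);
// 2. the workspace requirement comes from the plan
size_t workspace_size = plan->getWorkspaceSize();
// 3. run the plan (the fused conv-bias-act path uses run_conv_bias_act_with_plan instead)
run_single_conv_with_plan(handle, *plan, src, dst, filter, workspace);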
......@@ -54,9 +54,12 @@ if(MGE_WITH_CUDA)
add_library(cutlass INTERFACE)
target_include_directories(
cutlass
INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>)
add_library(cudnn-frontend INTERFACE)
target_include_directories(
cudnn-frontend
INTERFACE
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/tools/util/include>)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cudnn-frontend/include>)
endif()
if(MGE_WITH_TEST)
......
......@@ -22,7 +22,16 @@ public:
bool operator==(const KeyStorage& k) const { return k1 == k.k1 && k2 == k.k2; }
};
struct Key {
struct Hash {
size_t operator()(const KeyStorage& k) const {
size_t h1 = k.k1;
size_t h2 = k.k2;
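// boost-style hash_combine: mix k2 into k1 via the golden-ratio constant
// 0x9e3779b9 and two shifts to spread the bits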
h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2);
return h1;
}
};
class Key {
Handle* m_handle;
uint32_t m_opr_type;
const TensorLayout* m_inp_layouts_ptr;
......@@ -62,14 +71,6 @@ public:
MGE_WIN_DECLSPEC_FUC void clear();
private:
struct Hash {
size_t operator()(const KeyStorage& k) const {
size_t h1 = k.k1;
size_t h2 = k.k2;
h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2);
return h1;
}
};
std::unordered_map<KeyStorage, Result, Hash> m_heuristic_cache;
#if __DEPLOY_ON_XP_SP2__
size_t m_mtx;
......
......@@ -222,6 +222,8 @@ target_link_libraries(megdnn PUBLIC opr_param_defs)
if(MGE_WITH_CUDA)
target_link_libraries(megdnn PRIVATE $<BUILD_INTERFACE:cutlass>)
target_include_directories(megdnn PRIVATE ${CUDNN_INCLUDE_DIR})
target_link_libraries(megdnn PRIVATE $<BUILD_INTERFACE:cudnn-frontend>)
endif()
if(MGE_WITH_ROCM)
......
......@@ -14,6 +14,12 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
non_cudnn_algos.push_back(&matmul8x8x32);
non_cudnn_algos.push_back(&batched_matmul);
non_cudnn_algos.push_back(&int1_simple);
#if CUDNN_VERSION > 8004
all_algos.push_back(&cudnn_conv_v8);
all_algos.push_back(&cudnn_conv_bias_activation_v8);
#endif
fill_cudnn_algos();
for (auto&& algo : cudnn_conv_bias_activations) {
all_algos.push_back(&algo);
......@@ -169,6 +175,30 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const {
nonlinear_mode_str.c_str());
}
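// Builds a plain param::Convolution from the ConvBias SizeArgs so that the V8
// conv-only algo can reuse ConvolutionForward's heuristic plan lookup (see
// AlgoCUDNNConvV8 below).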
param::Convolution ConvBiasForwardImpl::AlgoBase::get_param_convolution(
const SizeArgs& args) const {
param::Convolution::Mode mode;
param::Convolution::Sparse sparse = args.filter_meta.group > 1
? param::Convolution::Sparse::GROUP
: param::Convolution::Sparse::DENSE;
if (args.filter_meta.should_flip) {
mode = param::Convolution::Mode::CONVOLUTION;
} else {
mode = param::Convolution::Mode::CROSS_CORRELATION;
}
return param::Convolution{
mode,
args.filter_meta.padding[0],
args.filter_meta.padding[1],
args.filter_meta.stride[0],
args.filter_meta.stride[1],
args.filter_meta.dilation[1],
args.filter_meta.dilation[0],
sparse,
args.filter_meta.format,
args.opr->param().compute_mode};
}
void ConvBiasForwardImpl::AlgoPack::fill_cudnn_algos() {
for (auto&& algo : CudnnAlgoPack::conv_fwd_algos()) {
cudnn_conv_bias_activations.push_back(algo.first);
......
......@@ -76,6 +76,8 @@ public:
CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
CUDA_SIMPLE_INT1,
CUDA_CUDNN_CONV_V8,
CUDA_CUDNN_CONVBIAS_V8,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
......@@ -157,12 +159,40 @@ public:
}
virtual bool is_cudnn() const { return false; }
param::Convolution get_param_convolution(const SizeArgs& args) const;
};
class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase : public AlgoBase {
public:
AlgoCUDNNConvBiasActivationBase() = default;
virtual ~AlgoCUDNNConvBiasActivationBase() = default;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
bool is_cudnn() const override { return true; }
size_t get_preprocess_workspace_in_bytes(const SizeArgs& args) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const SizeArgs& args) const override;
void exec_preprocess(const ExecArgs& args) const override;
protected:
virtual size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void cudnn_execute(
const ExecArgs& args, const Workspace& workspace, float alpha,
float beta) const = 0;
protected:
std::string m_name;
};
class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase {
class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final
: public AlgoCUDNNConvBiasActivationBase {
public:
AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum)
: m_cudnn_enum(cudnn_enum) {
: AlgoCUDNNConvBiasActivationBase(), m_cudnn_enum(cudnn_enum) {
megdnn_assert(
CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
CudnnAlgoPack::conv_fwd_algos().end());
......@@ -171,9 +201,6 @@ public:
"CUDNN:ConvBiasActivation:" + m_attr.name, {});
}
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
param::Convolution get_param_convolution(const SizeArgs& args) const;
bool is_available(const SizeArgs&) const override;
const char* name() const override { return m_name.c_str(); }
......@@ -191,8 +218,6 @@ public:
cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; }
bool is_cudnn() const override { return true; }
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS)
std::string param() const override {
......@@ -202,11 +227,46 @@ public:
}
private:
std::string m_name;
size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override;
void cudnn_execute(
const ExecArgs& args, const Workspace& workspace, float alpha,
float beta) const override;
private:
cudnnConvolutionFwdAlgo_t m_cudnn_enum;
CudnnAlgoPack::Attr m_attr;
};
#if CUDNN_VERSION > 8004
class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8 final
: public AlgoCUDNNConvBiasActivationBase {
public:
AlgoCUDNNConvBiasActivationV8() : AlgoCUDNNConvBiasActivationBase() {
m_name = ConvBiasForward::algo_name<DefaultParam>(
"CUDNN:ConvBiasActivationV8", {});
}
~AlgoCUDNNConvBiasActivationV8() = default;
bool is_available(const SizeArgs& args) const override;
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
}
const char* name() const override { return m_name.c_str(); }
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS_V8)
std::string param() const override { return ""; }
private:
size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override;
void cudnn_execute(
const ExecArgs& args, const Workspace& workspace, float alpha,
float beta) const override;
};
#endif
class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
......@@ -284,9 +344,34 @@ private:
mutable std::string m_name;
};
class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase {
class ConvBiasForwardImpl::AlgoCUDNNConvBase : public AlgoBase {
public:
AlgoCUDNNConvBase() = default;
virtual ~AlgoCUDNNConvBase() = default;
size_t get_workspace_in_bytes(const SizeArgs& args) const override {
return get_workspace_bundle(nullptr, args).total_size_in_bytes();
}
void exec(const ExecArgs& args) const override;
bool is_cudnn() const override { return true; }
protected:
virtual size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void cudnn_execute(
const ExecArgs& args, const Workspace& workspace) const = 0;
private:
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
protected:
std::string m_name;
};
class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoCUDNNConvBase {
public:
AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum) : m_cudnn_enum(cudnn_enum) {
AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum)
: AlgoCUDNNConvBase(), m_cudnn_enum(cudnn_enum) {
megdnn_assert(
CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
CudnnAlgoPack::conv_fwd_algos().end());
......@@ -296,8 +381,6 @@ public:
}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
AlgoAttribute attribute() const override {
auto ret = static_cast<AlgoAttribute>(0);
......@@ -314,8 +397,6 @@ public:
cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; }
bool is_cudnn() const override { return true; }
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV)
std::string param() const override {
......@@ -325,12 +406,38 @@ public:
}
private:
std::string m_name;
size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override;
void cudnn_execute(const ExecArgs& args, const Workspace& workspace) const override;
private:
cudnnConvolutionFwdAlgo_t m_cudnn_enum;
CudnnAlgoPack::Attr m_attr;
};
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
#if CUDNN_VERSION > 8004
class ConvBiasForwardImpl::AlgoCUDNNConvV8 final : public AlgoCUDNNConvBase {
public:
AlgoCUDNNConvV8() : AlgoCUDNNConvBase() {
m_name = ConvBiasForward::algo_name<DefaultParam>("CUDNN:ConvolutionV8", {});
}
bool is_available(const SizeArgs& args) const override;
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
}
const char* name() const override { return m_name.c_str(); }
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV_V8)
std::string param() const override { return ""; }
private:
size_t cudnn_get_workspace_in_bytes(const SizeArgs& args) const override;
void cudnn_execute(const ExecArgs& args, const Workspace& workspace) const override;
};
#endif
//! compute small matmul in the kernel
class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
......@@ -1140,6 +1247,10 @@ public:
AlgoGroupConvGeneral group;
AlgoBFloat16 bfloat16;
AlgoSimpleInt1 int1_simple;
#if CUDNN_VERSION > 8004
AlgoCUDNNConvV8 cudnn_conv_v8;
AlgoCUDNNConvBiasActivationV8 cudnn_conv_bias_activation_v8;
#endif
AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
......
......@@ -56,99 +56,33 @@ bool ConvBiasForwardImpl::AlgoCUDNNConv::is_available(const SizeArgs& args) cons
return status == CUDNN_STATUS_SUCCESS;
}
WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
auto dst_layout = *args.dst_layout;
SmallVector<size_t> sizes;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
sizes.push_back(dst_layout.span().dist_byte());
}
if (args.z_layout->ndim > 0 &&
args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
auto z_layout = *args.z_layout;
z_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype);
sizes.push_back(z_layout.span().dist_byte());
}
SizeArgs conv_args = args;
conv_args.dst_layout = &dst_layout;
size_t ConvBiasForwardImpl::AlgoCUDNNConv::cudnn_get_workspace_in_bytes(
const SizeArgs& args) const {
CUDNNForwardDescs D;
conv_args.init_conv_desc(D);
args.init_conv_desc(D);
size_t conv_workspace_size;
auto status = cudnnGetConvolutionForwardWorkspaceSize(
conv_args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc,
D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum, &conv_workspace_size);
megdnn_assert(
status == CUDNN_STATUS_SUCCESS,
"conv fwd get workspace failed: %s; info: %s", cudnnGetErrorString(status),
args.to_string().c_str());
sizes.insert(sizes.begin(), conv_workspace_size);
return {ptr, std::move(sizes)};
}
size_t ConvBiasForwardImpl::AlgoCUDNNConv::get_workspace_in_bytes(
const SizeArgs& args) const {
return get_workspace_bundle(nullptr, args).total_size_in_bytes();
cudnn_check(cudnnGetConvolutionForwardWorkspaceSize(
args.handle->cudnn_handle(), D.src_desc.desc, D.filter_desc.desc,
D.conv_desc.conv_desc, D.dst_desc.desc, m_cudnn_enum,
&conv_workspace_size));
return conv_workspace_size;
}
void ConvBiasForwardImpl::AlgoCUDNNConv::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
conv_dst_tensor.layout.dtype);
}
ExecArgs conv_args = args;
conv_args.dst_tensor = &conv_dst_tensor;
conv_args.dst_layout = &conv_dst_tensor.layout;
{
CUDNNForwardDescs D;
conv_args.init_conv_desc(D);
auto conv_workspace = bundle.get_workspace(0);
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionForward(
conv_args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
conv_args.src_tensor->raw_ptr(), D.filter_desc.desc,
conv_args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
conv_workspace.raw_ptr, conv_workspace.size, &beta, D.dst_desc.desc,
conv_args.dst_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s",
cudnnGetErrorString(status), conv_args.to_string().c_str());
}
if (args.z_layout->ndim > 0) {
auto z_tensor = *args.z_tensor;
if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
z_tensor = TensorND{bundle.get(2), args.z_tensor->layout};
z_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
z_tensor.layout.dtype);
auto typecvt = args.handle->create_operator<TypeCvt>();
typecvt->exec(*args.z_tensor, z_tensor);
}
auto add = args.handle->create_operator<ElemwiseForward>();
add->param().mode = Elemwise::Param::Mode::ADD;
add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor);
}
handle_bias_and_nonlinear(
args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor,
args.bias_tensor);
void ConvBiasForwardImpl::AlgoCUDNNConv::cudnn_execute(
const ExecArgs& args, const Workspace& workspace) const {
CUDNNForwardDescs D;
args.init_conv_desc(D);
float alpha = 1.0f, beta = 0.0f;
auto status = cudnnConvolutionForward(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
workspace.raw_ptr, workspace.size, &beta, D.dst_desc.desc,
args.dst_tensor->raw_ptr());
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s",
cudnnGetErrorString(status), args.to_string().c_str());
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/conv_bias/cudnn_conv_base.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/utils.h"
using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
WorkspaceBundle ConvBiasForwardImpl::AlgoCUDNNConvBase::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
auto dst_layout = *args.dst_layout;
SmallVector<size_t> sizes;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
sizes.push_back(dst_layout.span().dist_byte());
}
if (args.z_layout->ndim > 0 &&
args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
auto z_layout = *args.z_layout;
z_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, z_layout.dtype);
sizes.push_back(z_layout.span().dist_byte());
}
SizeArgs conv_args = args;
conv_args.dst_layout = &dst_layout;
size_t conv_workspace_size = cudnn_get_workspace_in_bytes(conv_args);
sizes.insert(sizes.begin(), conv_workspace_size);
return {ptr, std::move(sizes)};
}
void ConvBiasForwardImpl::AlgoCUDNNConvBase::exec(const ExecArgs& args) const {
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
TensorND conv_dst_tensor = *args.dst_tensor;
if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
conv_dst_tensor = TensorND{bundle.get(1), args.dst_tensor->layout};
conv_dst_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
conv_dst_tensor.layout.dtype);
}
ExecArgs conv_args = args;
conv_args.dst_tensor = &conv_dst_tensor;
conv_args.dst_layout = &conv_dst_tensor.layout;
cudnn_execute(conv_args, bundle.get_workspace(0));
if (args.z_layout->ndim > 0) {
auto z_tensor = *args.z_tensor;
if (args.z_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
z_tensor = TensorND{bundle.get(2), args.z_tensor->layout};
z_tensor.layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype,
z_tensor.layout.dtype);
auto typecvt = args.handle->create_operator<TypeCvt>();
typecvt->exec(*args.z_tensor, z_tensor);
}
auto add = args.handle->create_operator<ElemwiseForward>();
add->param().mode = Elemwise::Param::Mode::ADD;
add->exec({conv_dst_tensor, z_tensor}, conv_dst_tensor);
}
handle_bias_and_nonlinear(
args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor,
args.bias_tensor);
}
// vim: syntax=cpp.doxygen
......@@ -124,10 +124,10 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
// forbid sigmoid for quantized
if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
return false;
MEGDNN_FALLTHRU // XXX: why?
case param::ConvBias::NonlineMode::IDENTITY
: if (args.src_layout->dtype.category() ==
DTypeCategory::QUANTIZED) break;
MEGDNN_FALLTHRU; // XXX: why?
case param::ConvBias::NonlineMode::IDENTITY:
if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
break;
if (m_cudnn_enum != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
// cudnn requires the algo to be
// CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
......@@ -149,7 +149,7 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
return status == CUDNN_STATUS_SUCCESS;
}
size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes(
size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::cudnn_get_workspace_in_bytes(
const SizeArgs& args) const {
CUDNNForwardDescs D;
......@@ -162,85 +162,18 @@ size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::get_workspace_in_bytes(
status == CUDNN_STATUS_SUCCESS,
"conv fwd get workspace failed: %s; info: %s", cudnnGetErrorString(status),
args.to_string().c_str());
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
// cudnn requires the bias to be float when executing CONFIG_INT,
// so convert the bias to float first if it is not already float
workspace_size += sizeof(float) * args.bias_layout->span().dist_elem();
}
return workspace_size;
}
void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
const ExecArgs& args) const {
void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::cudnn_execute(
const ExecArgs& args, const Workspace& workspace, float alpha,
float beta) const {
#if CUDNN_MAJOR < 7
megdnn_throw("ConvBias require cudnn 7.0 or higher");
#else
megdnn_assert(cudnnGetVersion() >= 7401);
CUDNNForwardDescs D;
args.init_conv_bias_desc(D);
float alpha = 1.0f, beta = 0.0f;
if (args.z_layout->ndim > 0)
beta = 1.0f;
auto get_scale = [](const DType& dtype) -> float {
megdnn_assert(dtype.category() == DTypeCategory::QUANTIZED);
switch (dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return dtype.param<_dt>().scale;
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
default:
megdnn_assert_internal(0);
}
};
auto src_dtype = args.src_layout->dtype, filter_dtype = args.filter_layout->dtype,
dst_dtype = args.dst_layout->dtype;
megdnn_assert(
(src_dtype.category() == dst_dtype.category()) ||
(src_dtype.enumv() == DTypeEnum::QuantizedS8 &&
dst_dtype.enumv() == DTypeEnum::Float32));
megdnn_assert(src_dtype.category() == filter_dtype.category());
if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED) {
auto expected_bias_scale = get_scale(args.src_layout->dtype) *
get_scale(args.filter_layout->dtype);
alpha = expected_bias_scale;
if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED)
alpha /= get_scale(args.dst_layout->dtype);
if (args.z_layout->ndim > 0 &&
args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) {
beta = get_scale(args.z_layout->dtype) / get_scale(args.dst_layout->dtype);
}
if (args.bias_layout->dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(
fabs(expected_bias_scale - get_scale(args.bias_layout->dtype)) <
1e-4);
}
}
auto workspace_ptr = args.workspace.raw_ptr;
auto workspace_size = args.workspace.size;
auto bias_ptr = args.bias_tensor->raw_ptr();
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
auto cvt = args.handle->create_operator<TypeCvt>();
auto float_bias_layout = *args.bias_layout;
auto converted_bias_layout = *args.bias_layout;
converted_bias_layout.dtype = dtype::QuantizedS32(alpha);
float_bias_layout.dtype = dtype::Float32();
auto bias_size_in_bytes = float_bias_layout.span().dist_byte();
megdnn_assert(args.workspace.size >= bias_size_in_bytes);
cvt->exec(
{args.bias_tensor->raw_ptr(), converted_bias_layout},
TensorND{workspace_ptr, float_bias_layout});
bias_ptr = workspace_ptr;
workspace_ptr += bias_size_in_bytes;
workspace_size -= bias_size_in_bytes;
}
cudnnStatus_t status;
if (args.z_layout->ndim == 0) {
......@@ -248,55 +181,23 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
workspace_ptr, workspace_size, &beta, D.dst_desc.desc,
args.dst_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr,
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr());
workspace.raw_ptr, workspace.size, &beta, D.dst_desc.desc,
args.dst_tensor->raw_ptr(), D.bias_desc.desc,
args.bias_tensor->raw_ptr(), D.conv_desc.act_desc, D.dst_desc.desc,
args.dst_tensor->raw_ptr());
} else {
status = cudnnConvolutionBiasActivationForward(
args.handle->cudnn_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr(), D.filter_desc.desc,
args.filter_tensor->raw_ptr(), D.conv_desc.conv_desc, m_cudnn_enum,
workspace_ptr, workspace_size, &beta, D.z_desc.desc,
args.z_tensor->raw_ptr(), D.bias_desc.desc, bias_ptr,
workspace.raw_ptr, workspace.size, &beta, D.z_desc.desc,
args.z_tensor->raw_ptr(), D.bias_desc.desc, args.bias_tensor->raw_ptr(),
D.conv_desc.act_desc, D.dst_desc.desc, args.dst_tensor->raw_ptr());
}
megdnn_assert(
status == CUDNN_STATUS_SUCCESS, "conv fwd failed: %s; info: %s, algo %s",
cudnnGetErrorString(status), args.to_string().c_str(), name());
// Nonlinear mode
switch (args.nonlinear_mode) {
case param::ConvBias::NonlineMode::RELU:
break;
case param::ConvBias::NonlineMode::SIGMOID: {
megdnn_assert(
args.dst_layout->dtype.category() != DTypeCategory::QUANTIZED);
auto&& elem_opr = args.handle->create_operator<ElemwiseForward>();
elem_opr->param().mode = Elemwise::Param::Mode::SIGMOID;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
break;
}
case param::ConvBias::NonlineMode::IDENTITY:
break;
case param::ConvBias::NonlineMode::H_SWISH: {
megdnn_assert(
args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED ||
(args.dst_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW));
if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED) {
auto&& elem_opr = args.handle->create_operator<ElemwiseMultiType>();
elem_opr->param().mode = ElemwiseMultiType::Param::Mode::QH_SWISH;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
} else {
auto&& elem_opr = args.handle->create_operator<ElemwiseForward>();
elem_opr->param().mode = ElemwiseForward::Param::Mode::H_SWISH;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
}
break;
}
default:
megdnn_throw("unsupported NonlineMode");
}
#endif
}
......
/**
* \file dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_base.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/oprs/general.h"
#include "./algo.h"
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/utils.h"
using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::get_workspace_in_bytes(
const SizeArgs& args) const {
auto workspace_size = cudnn_get_workspace_in_bytes(args);
auto&& param = args.opr->param();
if (args.preprocessed_filter == nullptr) {
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
// cudnn requires the bias to be float when executing CONFIG_INT,
// so convert the bias to float first if it is not already float
workspace_size += sizeof(float) * args.bias_layout->span().dist_elem();
}
if (param.format == param::ConvBias::Format::NCHW32) {
workspace_size += args.filter_layout->span().dist_byte() +
args.bias_layout->span().dist_byte();
}
}
return workspace_size;
}
void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::exec(
const ExecArgs& args) const {
float alpha, beta;
std::tie(alpha, beta) = cudnn_get_conv_bias_act_scale_param(
args.src_tensor->layout, args.dst_tensor->layout,
args.filter_tensor->layout, args.bias_tensor->layout,
args.z_tensor->layout);
auto workspace_ptr = args.workspace.raw_ptr;
auto workspace_size = args.workspace.size;
auto bias_ptr = args.bias_tensor->raw_ptr();
TensorND filter_tensor;
TensorND bias_tensor;
auto&& param = args.opr->param();
if (args.preprocessed_filter != nullptr) {
bias_tensor = TensorND{
args.bias_tensor->layout,
args.preprocessed_filter->tensors[0].raw_ptr()};
if (param.format == Param::Format::NCHW32) {
megdnn_assert(args.preprocessed_filter->tensors.size() == 2);
filter_tensor = TensorND{
args.filter_tensor->layout,
args.preprocessed_filter->tensors[1].raw_ptr()};
} else {
filter_tensor = *args.filter_tensor;
}
} else {
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
auto cvt = args.handle->create_operator<TypeCvt>();
auto float_bias_layout = *args.bias_layout;
auto converted_bias_layout = *args.bias_layout;
converted_bias_layout.dtype = dtype::QuantizedS32(alpha);
float_bias_layout.dtype = dtype::Float32();
auto bias_size_in_bytes = float_bias_layout.span().dist_byte();
megdnn_assert(args.workspace.size >= bias_size_in_bytes);
cvt->exec(
{args.bias_tensor->raw_ptr(), converted_bias_layout},
TensorND{workspace_ptr, float_bias_layout});
bias_ptr = workspace_ptr;
workspace_ptr += bias_size_in_bytes;
workspace_size -= bias_size_in_bytes;
}
if (param.format == Param::Format::NCHW32) {
size_t reorder_workspace_size =
args.filter_tensor->layout.span().dist_byte() +
args.bias_tensor->layout.span().dist_byte();
auto reorder_filter_ptr = workspace_ptr;
auto reorder_bias_ptr =
workspace_ptr + args.filter_tensor->layout.span().dist_byte();
cudnn_reorder_filer_and_bias_nchw32(
cudnn_handle(args.opr->handle()), args.filter_tensor->raw_ptr(),
args.filter_meta, bias_ptr, reorder_filter_ptr, reorder_bias_ptr);
filter_tensor = TensorND(args.filter_tensor->layout, reorder_filter_ptr);
bias_ptr = reorder_bias_ptr;
workspace_ptr += reorder_workspace_size;
workspace_size -= reorder_workspace_size;
} else {
filter_tensor = *args.filter_tensor;
}
}
bias_tensor = TensorND{args.bias_tensor->layout, bias_ptr};
ExecArgs exec_args{
const_cast<ConvBiasForwardImpl*>(args.opr),
*args.src_tensor,
filter_tensor,
bias_tensor,
*args.z_tensor,
*args.dst_tensor,
args.workspace};
Workspace cudnn_workspace{workspace_ptr, workspace_size};
cudnn_execute(exec_args, cudnn_workspace, alpha, beta);
// Nonlinear mode
switch (args.nonlinear_mode) {
case param::ConvBias::NonlineMode::RELU:
break;
case param::ConvBias::NonlineMode::SIGMOID: {
megdnn_assert(
args.dst_layout->dtype.category() != DTypeCategory::QUANTIZED);
auto&& elem_opr = args.handle->create_operator<ElemwiseForward>();
elem_opr->param().mode = Elemwise::Param::Mode::SIGMOID;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
break;
}
case param::ConvBias::NonlineMode::IDENTITY:
break;
case param::ConvBias::NonlineMode::H_SWISH: {
megdnn_assert(
args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED ||
(args.dst_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().format == param::ConvBias::Format::NCHW4_NCHW));
if (args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED) {
auto&& elem_opr = args.handle->create_operator<ElemwiseMultiType>();
elem_opr->param().mode = ElemwiseMultiType::Param::Mode::QH_SWISH;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
} else {
auto&& elem_opr = args.handle->create_operator<ElemwiseForward>();
elem_opr->param().mode = ElemwiseForward::Param::Mode::H_SWISH;
elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
}
break;
}
default:
megdnn_throw("unsupported NonlineMode");
}
}
size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::
get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
auto&& param = args.opr->param();
if (param.format == Param::Format::NCHW32) {
return args.bias_layout->span().dist_byte();
}
return 0_z;
}
SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::
deduce_preprocessed_filter_layout(const SizeArgs& args) const {
auto&& param = args.opr->param();
if (param.format == Param::Format::NCHW32) {
return {args.bias_layout->collapse_contiguous(),
args.filter_layout->collapse_contiguous()};
} else {
return {args.bias_layout->collapse_contiguous()};
}
}
void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationBase::exec_preprocess(
const ExecArgs& args) const {
float alpha, beta;
std::tie(alpha, beta) = cudnn_get_conv_bias_act_scale_param(
args.src_tensor->layout, args.dst_tensor->layout,
args.filter_tensor->layout, args.bias_tensor->layout,
args.z_tensor->layout);
MEGDNN_MARK_USED_VAR(beta);
auto workspace_ptr = args.workspace.raw_ptr;
auto workspace_size = args.workspace.size;
auto bias_ptr = workspace_size > 0 ? workspace_ptr
: args.preprocessed_filter->tensors[0].raw_ptr();
if (args.bias_layout && args.bias_layout->dtype != dtype::Float32() &&
args.src_layout->dtype.category() != DTypeCategory::FLOAT) {
auto cvt = args.handle->create_operator<TypeCvt>();
auto float_bias_layout = *args.bias_layout;
auto converted_bias_layout = *args.bias_layout;
converted_bias_layout.dtype = dtype::QuantizedS32(alpha);
float_bias_layout.dtype = dtype::Float32();
cvt->exec(
{args.bias_tensor->raw_ptr(), converted_bias_layout},
TensorND{bias_ptr, float_bias_layout});
}
if (args.opr->param().format == Param::Format::NCHW32) {
auto reorder_filter_ptr = args.preprocessed_filter->tensors[1].raw_ptr();
auto reorder_bias_ptr = args.preprocessed_filter->tensors[0].raw_ptr();
cudnn_reorder_filer_and_bias_nchw32(
cudnn_handle(args.opr->handle()), args.filter_tensor->raw_ptr(),
args.filter_meta, bias_ptr, reorder_filter_ptr, reorder_bias_ptr);
}
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/conv_bias/cudnn_conv_bias_activation_v8.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/oprs/general.h"
#include "./algo.h"
#include "src/common/conv_bias.h"
#include "src/cuda/cudnn_wrapper_v8.h"
#include "src/cuda/utils.h"
#if CUDNN_VERSION >= 8004
using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
namespace {
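// The fused conv_bias plan always sees the bias as a {1, C, 1, 1} tensor; a
// non-float bias dtype (e.g. QuantizedS32) is reported as Float32 here, mirroring
// the float-bias conversion done in AlgoCUDNNConvBiasActivationBase::exec.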
TensorLayout canonical_bias_layout(
const TensorLayout& bias_layout, const param::ConvBias::Format format) {
int64_t vector_count, vector_dimension;
std::tie(vector_count, vector_dimension) = get_vector_count_and_dimension(format);
size_t channel = bias_layout[vector_dimension] * vector_count;
if (bias_layout.dtype.category() != DTypeCategory::FLOAT) {
return TensorLayout{{1, channel, 1, 1}, dtype::Float32()};
}
return TensorLayout{{1, channel, 1, 1}, bias_layout.dtype};
}
} // namespace
bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::is_available(
const SizeArgs& args) const {
auto&& param = args.opr->param();
if (param.format == param::ConvBias::Format::NCHW4_NCHW32 ||
param.format == param::ConvBias::Format::NCHW32_NCHW4 ||
param.format == param::ConvBias::Format::NCHW4_NCHW ||
param.format == param::ConvBias::Format::NCHW8 ||
param.format == param::ConvBias::Format::NCHW64 ||
param.format == param::ConvBias::Format::CHWN4)
return false;
if (param.format != Param::Format::NCHW && param.format != Param::Format::NHWC) {
if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) {
return false;
}
}
if ((args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
args.filter_layout->dtype.enumv() == DTypeEnum::QuantizedS4)
return false;
if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm)
return false;
if (args.src_layout->dtype == args.filter_layout->dtype &&
args.src_layout->dtype == dtype::BFloat16()) {
return false;
}
if (args.bias_layout->ndim == 0 ||
!check_bias_share_in_channel(*(args.bias_layout), param.format)) {
return false;
}
// FIXME: cudnn cannot handle the case when the initial value of dst tensor
// contains nan and beta is zero, because the result of 0.f * nan is still
// nan
if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
param.format == param::ConvBias::Format::NCHW) {
return false;
}
if (param.format == param::ConvBias::Format::NCHW32) {
// NCHW32 requires compute capability 7.5 or higher
auto&& device_prop = current_device_prop();
if (device_prop.major < 7 || (device_prop.major == 7 && device_prop.minor < 5))
return false;
}
switch (args.nonlinear_mode) {
case param::ConvBias::NonlineMode::RELU:
case param::ConvBias::NonlineMode::IDENTITY:
break;
case param::ConvBias::NonlineMode::SIGMOID:
// forbid sigmoid for quantized
if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
return false;
break;
case param::ConvBias::NonlineMode::H_SWISH:
if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
break;
return false;
default:
megdnn_throw("unsupported NonlineMode");
}
auto bias_layout =
canonical_bias_layout(*args.bias_layout, args.opr->param().format);
auto plan = get_heuristic_plan_from_opr(
static_cast<const ConvBiasForward*>(args.opr), *args.src_layout,
*args.dst_layout, *args.filter_layout, bias_layout, *args.z_layout,
args.filter_meta);
return plan != nullptr;
}
size_t ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::cudnn_get_workspace_in_bytes(
const SizeArgs& args) const {
auto bias_layout =
canonical_bias_layout(*args.bias_layout, args.opr->param().format);
auto plan = get_heuristic_plan_from_opr(
static_cast<const ConvBiasForward*>(args.opr), *args.src_layout,
*args.dst_layout, *args.filter_layout, bias_layout, *args.z_layout,
args.filter_meta);
megdnn_assert(
plan != nullptr, "algo(%s) cannot find execution from heuristics", name());
return plan->getWorkspaceSize();
}
void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivationV8::cudnn_execute(
const ExecArgs& args, const Workspace& workspace, float alpha,
float beta) const {
auto&& bias_layout =
canonical_bias_layout(args.bias_tensor->layout, args.opr->param().format);
auto plan = get_heuristic_plan_from_opr(
static_cast<const ConvBiasForward*>(args.opr), args.src_tensor->layout,
args.dst_tensor->layout, args.filter_tensor->layout, bias_layout,
args.z_tensor->layout, args.filter_meta);
megdnn_assert(
plan != nullptr, "algo(%s) cannot find execution from heuristics", name());
auto&& handle = cudnn_handle(args.handle);
TensorND bias_tensor{args.bias_tensor->raw_ptr(), bias_layout};
run_conv_bias_act_with_plan(
handle, *plan, *args.src_tensor, *args.dst_tensor, *args.filter_tensor,
bias_tensor, *args.z_tensor, workspace);
}
#endif
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/conv_bias/cudnn_conv_v8.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/cudnn_wrapper_v8.h"
#include "src/cuda/utils.h"
#if CUDNN_VERSION >= 8004
using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
bool ConvBiasForwardImpl::AlgoCUDNNConvV8::is_available(const SizeArgs& args) const {
if (args.filter_meta.format != Param::Format::NCHW &&
args.filter_meta.format != Param::Format::NHWC) {
if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) {
return false;
}
}
if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4 ||
args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
return false;
}
// FIXME: cudnn cannot handle the case when the initial value of dst tensor
// contains nan and beta is zero, because the result of 0.f * nan is still
// nan
if (args.src_layout->dtype.enumv() == DTypeEnum::QuantizedS8 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.opr->param().format == param::ConvBias::Format::NCHW) {
return false;
}
auto dst_layout = *args.dst_layout;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(
args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
}
SizeArgs conv_args = args;
conv_args.dst_layout = &dst_layout;
if (!is_cudnn_supported(conv_args))
return false;
auto conv_opr = args.handle->create_operator<ConvolutionForward>();
conv_opr->param() = get_param_convolution(args);
ConvolutionForward::CanonizedFilterMeta fm;
fm.copy_from(args.filter_meta);
auto plan = get_heuristic_plan_from_opr(
conv_opr.get(), *conv_args.src_layout, *conv_args.dst_layout,
*conv_args.filter_layout, {}, {}, fm);
return plan != nullptr;
}
size_t ConvBiasForwardImpl::AlgoCUDNNConvV8::cudnn_get_workspace_in_bytes(
const SizeArgs& args) const {
auto conv_opr = args.handle->create_operator<ConvolutionForward>();
conv_opr->param() = get_param_convolution(args);
ConvolutionForward::CanonizedFilterMeta fm;
fm.copy_from(args.filter_meta);
auto plan = get_heuristic_plan_from_opr(
conv_opr.get(), *args.src_layout, *args.dst_layout, *args.filter_layout, {},
{}, fm);
megdnn_assert(
plan != nullptr, "algo(%s) cannot find execution from heuristics", name());
return plan->getWorkspaceSize();
}
void ConvBiasForwardImpl::AlgoCUDNNConvV8::cudnn_execute(
const ExecArgs& args, const Workspace& workspace) const {
auto conv_opr = args.handle->create_operator<ConvolutionForward>();
conv_opr->param() = get_param_convolution(args);
ConvolutionForward::CanonizedFilterMeta fm;
fm.copy_from(args.filter_meta);
auto plan = get_heuristic_plan_from_opr(
conv_opr.get(), args.src_tensor->layout, args.dst_tensor->layout,
args.filter_tensor->layout, {}, {}, fm);
megdnn_assert(
plan != nullptr, "algo(%s) cannot find execution from heuristics", name());
auto&& handle = cudnn_handle(args.handle);
run_single_conv_with_plan(
handle, *plan, *args.src_tensor, *args.dst_tensor, *args.filter_tensor,
workspace);
}
#endif
// vim: syntax=cpp.doxygen
......@@ -197,8 +197,60 @@ void flip_filter(
ref_ptr.reset(workspace.raw_ptr);
}
} // namespace conv_bias
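// Maps tensor dtypes to cudnn's alpha/beta scaling: for quantized inputs,
// alpha = scale(x) * scale(w) (divided by scale(y) when y is quantized) and, with a
// residual z tensor, beta = scale(z) / scale(y); otherwise alpha = 1 and beta is 0
// (or 1 when z is present).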
std::pair<float, float> cudnn_get_conv_bias_act_scale_param(
const TensorLayout& x, const TensorLayout& y, const TensorLayout& w,
const TensorLayout& b, const TensorLayout& z) {
float alpha = 1.f, beta = 0.f;
if (z.ndim > 0)
beta = 1.f;
auto get_scale = [](const DType& dtype) -> float {
megdnn_assert(dtype.category() == DTypeCategory::QUANTIZED);
switch (dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return dtype.param<_dt>().scale;
MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
#undef cb
default:
megdnn_assert_internal(0);
}
};
auto x_dtype = x.dtype, y_dtype = y.dtype, w_dtype = w.dtype;
megdnn_assert(
(x_dtype.category() == y_dtype.category()) ||
(x_dtype.enumv() == DTypeEnum::QuantizedS8 &&
y_dtype.enumv() == DTypeEnum::Float32));
megdnn_assert(x_dtype.category() == w_dtype.category());
if (x_dtype.category() == DTypeCategory::QUANTIZED) {
auto expected_bias_scale = get_scale(x_dtype) * get_scale(w_dtype);
alpha = expected_bias_scale;
if (y_dtype.category() == DTypeCategory::QUANTIZED)
alpha /= get_scale(y_dtype);
if (z.ndim > 0 && z.dtype.category() == DTypeCategory::QUANTIZED) {
beta = get_scale(z.dtype) / get_scale(y_dtype);
}
if (b.dtype.category() == DTypeCategory::QUANTIZED) {
megdnn_assert(fabs(expected_bias_scale - get_scale(b.dtype)) < 1e-4);
}
}
return {alpha, beta};
}
void cudnn_reorder_filer_and_bias_nchw32(
const cudnnHandle_t& handle, const void* filter_ptr,
const CanonizedFilterMeta& fm, const void* bias_ptr, void* reordered_filter_ptr,
void* reordered_bias_ptr) {
FilterDesc<param::ConvBias> filter_desc;
filter_desc.set(fm);
int reorder_bias = bias_ptr != nullptr;
cudnn_check(cudnnReorderFilterAndBias(
handle, filter_desc.desc, CUDNN_DEFAULT_REORDER, filter_ptr,
reordered_filter_ptr, reorder_bias, bias_ptr, reordered_bias_ptr));
}
} // namespace conv_bias
} // namespace cuda
} // namespace megdnn
......
......@@ -113,6 +113,15 @@ struct CUDNNForwardDescs {
}
};
std::pair<float, float> cudnn_get_conv_bias_act_scale_param(
const TensorLayout& x, const TensorLayout& y, const TensorLayout& w,
const TensorLayout& b, const TensorLayout& z);
void cudnn_reorder_filer_and_bias_nchw32(
const cudnnHandle_t& handle, const void* filter_ptr,
const CanonizedFilterMeta& fm, const void* bias_ptr, void* reordered_filter_ptr,
void* reordered_bias_ptr);
} // namespace conv_bias
} // namespace cuda
} // namespace megdnn
......
......@@ -47,6 +47,17 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) {
using namespace conv_bias;
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
#if CUDNN_VERSION >= 8004
if (sm_algo_pack.cudnn_conv_v8.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.cudnn_conv_v8;
}
if (sm_algo_pack.cudnn_conv_bias_activation_v8.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.cudnn_conv_bias_activation_v8;
}
#endif
auto dst_layout = *args.dst_layout;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
......
#pragma once
#include "../elemwise/opr_impl.h"
#include "megdnn/oprs.h"
#include "src/cuda/cudnn_with_check.h"
namespace megdnn {
namespace cuda {
......@@ -65,6 +66,12 @@ public:
// The following algorithms are suitable for channel wise convolution
class AlgoFloat32NCHWFMAImplicitBatchedGemm;
class AlgoFloat16NCHWHMMAImplicitBatchedGemm;
class AlgoCUDNNConvBase;
class AlgoCUDNNConvBiasActivationBase;
#if CUDNN_VERSION > 8004
class AlgoCUDNNConvV8;
class AlgoCUDNNConvBiasActivationV8;
#endif
class AlgoPack;
......
(This diff has been collapsed.)
/**
* \file dnn/src/cuda/cudnn_wrapper_v8.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/basic_types.h"
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wreorder"
#include "cudnn_frontend.h"
#pragma GCC diagnostic pop
namespace megdnn {
namespace cuda {
static inline std::pair<int64_t, int64_t> get_vector_count_and_dimension(
const param::Convolution::Format format) {
using Format = param::Convolution::Format;
int64_t vector_count = 1;
int64_t vector_dimension = 1;
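// e.g. NCHW4 packs 4 channels per element along dim 1 (vector_count = 4), while
// NHWC keeps vector_count = 1 with the channel dimension at index 3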
switch (format) {
case Format::NCHW:
break;
case Format::NHWC:
vector_dimension = 3;
break;
case Format::NCHW4:
vector_count = 4;
break;
case Format::NCHW32:
vector_count = 32;
break;
default:
megdnn_assert(
false, "unsupported format (got:%u) for cudnn",
static_cast<uint32_t>(format));
}
return {vector_count, vector_dimension};
}
template <typename Opr>
cudnn_frontend::ExecutionPlan* get_heuristic_plan_from_opr(
const Opr* opr, const TensorLayout& x, const TensorLayout& y,
const TensorLayout& w, const TensorLayout& b, const TensorLayout& z,
const typename Opr::CanonizedFilterMeta& fm);
void run_single_conv_with_plan(
const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
const TensorND& x, const TensorND& y, const TensorND& w,
const Workspace& workspace);
void run_conv_bias_act_with_plan(
const cudnnHandle_t& handle, const cudnn_frontend::ExecutionPlan& plan,
const TensorND& x, const TensorND& y, const TensorND& w, const TensorND& b,
const TensorND& z, const Workspace& workspace);
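// A nullptr return from get_heuristic_plan_from_opr means the heuristics found no
// usable engine config; the V8 algos treat this as "not available". The returned
// plan also supplies the workspace requirement via getWorkspaceSize().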
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -58,6 +58,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
}
#endif
size_t free, tot;
cudaMemGetInfo(&free, &tot);
printf("before cudnn create, free: %.2f MB, tot: %.2f MB, allocated: %.2f MB\n",
free / 1024.0 / 1024.0, tot / 1024.0 / 1024.0,
(tot - free) / 1024.0 / 1024.0);
cudnn_check(cudnnCreate(&m_cudnn_handle));
cublas_check(cublasCreate(&m_cublas_handle));
#if CUDA_VERSION >= 10010
......@@ -69,6 +74,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
cudnn_check(cudnnSetStream(m_cudnn_handle, stream()));
cublas_check(cublasSetStream(m_cublas_handle, stream()));
#if CUDNN_VERSION >= 8004
// cudnn_check(cudnnOpsInferVersionCheck());
// cudnn_check(cudnnCnnInferVersionCheck());
#endif
// Note that all cublas scalars (alpha, beta) and scalar results such as dot
// output reside at device side.
cublas_check(cublasSetPointerMode(m_cublas_handle, CUBLAS_POINTER_MODE_DEVICE));
......@@ -82,6 +92,11 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle)
cudaMemcpyHostToDevice, stream()));
cuda_check(cudaStreamSynchronize(stream()));
cudaMemGetInfo(&free, &tot);
printf("after cudnn create, free: %.2f MB, tot: %.2f MB, allocated: %.2f MB\n",
free / 1024.0 / 1024.0, tot / 1024.0 / 1024.0,
(tot - free) / 1024.0 / 1024.0);
// check tk1
m_is_tegra_k1 = (strcmp(m_device_prop->name, "GK20A") == 0);
m_cusolver_handle = nullptr;
......
/**
* \file dnn/test/cuda/conv_bias.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/dtype.h"
#include "test/cuda/fixture.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/cuda/handle.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/utils.h"
using namespace megdnn;
using namespace test;
using namespace conv_bias;
#if CUDNN_VERSION >= 8004
TEST_F(CUDA, CONV_V8_FLOAT) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvolutionV8", {})
.c_str()}));
UniformFloatRNG rng(0.f, 1.f);
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_dtype(3, dtype::Float32());
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// group
param.sparse = param::ConvBias::Sparse::GROUP;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// NHWC
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
}
TEST_F(CUDA, CONV_V8_HALF) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvolutionV8", {})
.c_str()}));
UniformFloatRNG rng(0.f, 1.f);
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_dtype(3, dtype::Float16())
.set_dtype(4, dtype::Float16())
.set_epsilon(5e-2);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
param.compute_mode = param::ConvBias::ComputeMode::FLOAT32;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// group
param.sparse = param::ConvBias::Sparse::GROUP;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// NHWC
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
}
TEST_F(CUDA, CONV_BIAS_V8_FLOAT) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvBiasActivationV8", {})
.c_str()}));
UniformFloatRNG rng(0.f, 1.f);
UniformFloatRNG crng(0.f, 0.f);
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_dtype(3, dtype::Float32());
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// group
param.sparse = param::ConvBias::Sparse::GROUP;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// NHWC
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
}
TEST_F(CUDA, CONV_BIAS_V8_HALF) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvBiasActivationV8", {})
.c_str()}));
UniformFloatRNG rng(0.f, 1.f);
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16())
.set_dtype(3, dtype::Float16())
.set_dtype(4, dtype::Float16())
.set_epsilon(5e-2);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
param.compute_mode = param::ConvBias::ComputeMode::FLOAT32;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {64, 64, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// group
param.sparse = param::ConvBias::Sparse::GROUP;
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.set_param(param).execs(
{{1, 64, 7, 7}, {8, 8, 8, 3, 3}, {1, 64, 1, 1}, {1, 64, 7, 7}, {}});
// NHWC
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
}
TEST_F(CUDA, CONV_BIAS_V8_DP4A) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvBiasActivationV8", {})
.c_str()}));
UniformIntRNG rng{-3, 3};
UniformIntRNG bias_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::QuantizedS8{1.2f})
.set_dtype(1, dtype::QuantizedS8{1.3f})
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
.set_dtype(3, dtype::QuantizedS8{1.1f})
.set_dtype(4, dtype::QuantizedS8{1.0f})
.set_epsilon(1 + 1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW4;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{1, 16, 7, 7, 4}, {64, 16, 3, 3, 4}, {1, 16, 1, 1, 4}, {}, {}});
checker.set_param(param).execs(
{{1, 16, 7, 7, 4},
{64, 16, 3, 3, 4},
{1, 16, 1, 1, 4},
{1, 16, 7, 7, 4},
{}});
param.nonlineMode = param::ConvBias::NonlineMode::IDENTITY;
checker.set_param(param).execs(
{{1, 16, 7, 7, 4}, {64, 16, 3, 3, 4}, {1, 16, 1, 1, 4}, {}, {}});
checker.set_param(param).execs(
{{1, 16, 7, 7, 4},
{64, 16, 3, 3, 4},
{1, 16, 1, 1, 4},
{1, 16, 7, 7, 4},
{}});
param.format = param::ConvBias::Format::NHWC;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {64, 3, 3, 64}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {64, 3, 3, 64}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
param.sparse = param::ConvBias::Sparse::GROUP;
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {}, {}});
checker.set_param(param).execs(
{{1, 7, 7, 64}, {8, 8, 3, 3, 8}, {1, 1, 1, 64}, {1, 7, 7, 64}, {}});
}
TEST_F(CUDA, CONV_BIAS_V8_IMMA) {
Checker<ConvBiasForward> checker(handle_cuda());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::DefaultParam>(
"CUDNN:ConvBiasActivationV8", {})
.c_str()}));
UniformIntRNG rng{-3, 3};
UniformIntRNG bias_rng{-50, 50};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::QuantizedS8{1.2f})
.set_dtype(1, dtype::QuantizedS8{1.3f})
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
.set_dtype(3, dtype::QuantizedS8{1.1f})
.set_dtype(4, dtype::QuantizedS8{1.0f})
.set_epsilon(1 + 1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::NCHW32;
param.nonlineMode = param::ConvBias::NonlineMode::RELU;
checker.set_param(param).execs(
{{1, 2, 7, 7, 32}, {64, 2, 3, 3, 32}, {1, 2, 1, 1, 32}, {}, {}});
checker.set_param(param).execs(
{{1, 2, 7, 7, 32},
{64, 2, 3, 3, 32},
{1, 2, 1, 1, 32},
{1, 2, 7, 7, 32},
{}});
param.nonlineMode = NonlineMode::RELU;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 0;
checker.set_param(param).execs(
{{2, 8, 12, 12, 32}, {512, 8, 1, 1, 32}, {1, 16, 1, 1, 32}, {}, {}});
}
#endif
// vim: syntax=cpp.doxygen
......@@ -94,6 +94,7 @@ function git_submodule_update() {
git submodule sync
git submodule update -f --init midout
git submodule update -f --init flatbuffers
git submodule update -f --init cudnn-frontend
git submodule update -f --init Json
git submodule update -f --init gflags
git submodule update -f --init cpuinfo
......