提交 38067472 编写于 作者: M Megvii Engine Team 提交者: 王彪

fix(dnn/cuda): fix ci

GitOrigin-RevId: 8267e5f9ddd5c6813fcfccf8df197f3c8112fa98
上级 1da58ae1
...@@ -245,9 +245,8 @@ std::pair<int, int> get_tensor_alignment( ...@@ -245,9 +245,8 @@ std::pair<int, int> get_tensor_alignment(
int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n * int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
algo_param.threadblock_k / algo_param.threadblock_k /
(algo_param.warp_m * algo_param.warp_n * algo_param.warp_k); (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
int threadblock_loads = filter.dtype.size( int threadblock_loads =
algo_param.threadblock_m * algo_param.threadblock_n * filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k);
algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads; int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16) if (load_per_thread >= 16)
alignment_filter = 16; alignment_filter = 16;
......
...@@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( ...@@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
using Format = Param::Format; using Format = Param::Format;
using Sparse = Param::Sparse; using Sparse = Param::Sparse;
using Mode = Param::Mode; using Mode = Param::Mode;
using NonlineMode = Param::NonlineMode;
auto&& param = args.opr->param(); auto&& param = args.opr->param();
auto&& fm = args.filter_meta; auto&& fm = args.filter_meta;
RETURN_IF_FALSE( RETURN_IF_FALSE(
...@@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( ...@@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
args.src_layout->dtype.enumv() == DTypeEnum::Float16 && args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float16); args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
RETURN_IF_FALSE( RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 || args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float16 && (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
......
...@@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( ...@@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
#define RETURN_IF_FALSE(stmt_) \ #define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \ if (!(stmt_)) \
return false; return false;
RETURN_IF_FALSE(is_compute_capability_required(6, 1));
RETURN_IF_FALSE( RETURN_IF_FALSE(
args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
using Param = param::ConvBias; using Param = param::ConvBias;
using Format = Param::Format; using Format = Param::Format;
using Sparse = Param::Sparse; using Sparse = Param::Sparse;
using Mode = Param::Mode; using Mode = Param::Mode;
using NonlineMode = Param::NonlineMode;
auto&& param = args.opr->param(); auto&& param = args.opr->param();
auto&& fm = args.filter_meta; auto&& fm = args.filter_meta;
RETURN_IF_FALSE( RETURN_IF_FALSE(
...@@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( ...@@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
args.src_layout->dtype.enumv() == DTypeEnum::Float32 && args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
args.dst_layout->dtype.enumv() == DTypeEnum::Float32); args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
RETURN_IF_FALSE( RETURN_IF_FALSE(
args.bias_layout->ndim <= 0 || args.bias_layout->ndim <= 0 ||
(args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
......
...@@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai ...@@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai
#define RETURN_IF_FALSE(stmt_) \ #define RETURN_IF_FALSE(stmt_) \
if (!(stmt_)) \ if (!(stmt_)) \
return false; return false;
RETURN_IF_FALSE(is_compute_capability_required(6, 1));
RETURN_IF_FALSE( RETURN_IF_FALSE(
args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous()); args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
using Param = param::Convolution; using Param = param::Convolution;
......
...@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: ...@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
(sh == 2 && sw == 2) (sh == 2 && sw == 2)
? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
: cutlass::conv::SpecialOptimizeDesc::NONE; : cutlass::conv::SpecialOptimizeDesc::NONE;
int alignment_filter = 4;
constexpr int warp_size = 32;
int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
m_algo_param.threadblock_k /
(m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
int threadblock_loads = args.filter_layout->dtype.size(
m_algo_param.threadblock_m * m_algo_param.threadblock_k);
int load_per_thread = threadblock_loads / threads;
if (load_per_thread >= 16)
alignment_filter = 16;
else if (load_per_thread >= 8)
alignment_filter = 8;
megdnn_assert(load_per_thread >= 4);
ConvolutionKey key{ ConvolutionKey key{
cutlass::conv::Operator::kDgrad, cutlass::conv::Operator::kDgrad,
NumericTypeID::kS8, NumericTypeID::kS8,
...@@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: ...@@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
m_algo_param.stage, m_algo_param.stage,
special_optimization, special_optimization,
4, 4,
4, alignment_filter,
false}; false};
return (void*)Singleton::get().operation_table.find_op(key); return (void*)Singleton::get().operation_table.find_op(key);
} }
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "test/common/workspace_wrapper.h" #include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h" #include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h" #include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
...@@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* ...@@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char*
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \
require_compute_capability(6, 1); \
check_chanwise<ConvolutionForward>( \ check_chanwise<ConvolutionForward>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \ dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
...@@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) ...@@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \
require_compute_capability(6, 1); \
check_chanwise<ConvolutionBackwardData>( \ check_chanwise<ConvolutionBackwardData>( \
dtype::Float32(), dtype::Float32(), handle_cuda(), \ dtype::Float32(), dtype::Float32(), handle_cuda(), \
"FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
...@@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) ...@@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
// check both ioc16 and io16xc32 // check both ioc16 and io16xc32
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \
require_compute_capability(7, 0); \
check_chanwise<ConvolutionForward>( \ check_chanwise<ConvolutionForward>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \ dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
...@@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) ...@@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
#define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \
TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \
require_compute_capability(7, 0); \
check_chanwise<ConvolutionBackwardData>( \ check_chanwise<ConvolutionBackwardData>( \
dtype::Float16(), dtype::Float16(), handle_cuda(), \ dtype::Float16(), dtype::Float16(), handle_cuda(), \
"FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
...@@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) { ...@@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
bencher.proxy()->target_execution_policy.algo.reset(); bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32; param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param); bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
"float16: %.2fms %.2fGB/s " "float16: %.2fms %.2fGB/s "
......
...@@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { ...@@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
"CUDA:GROUP_CONV", {}) "CUDA:GROUP_CONV", {})
.c_str(), .c_str(),
{{"CUDNN", {}}}})); {{"DEFAULT:CUDNN", {}}}}));
ConvBias::Param param; ConvBias::Param param;
param.sparse = ConvBias::Param::Sparse::GROUP; param.sparse = ConvBias::Param::Sparse::GROUP;
param.nonlineMode = mode; param.nonlineMode = mode;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册