diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp index 57df49027dfaad34f62ff900f8684eee340a3800..e100417c75c14fe31a2d4a2c2037c8399c41330b 100644 --- a/dnn/src/cuda/conv_bias/algo.cpp +++ b/dnn/src/cuda/conv_bias/algo.cpp @@ -104,20 +104,19 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack; -ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvBiasForwardImpl* o, - const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& bias, - const TensorLayout& z, - const TensorLayout& dst) +ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( + ConvBiasForwardImpl* o, const TensorLayout& src, + const TensorLayout& filter, const TensorLayout& bias, + const TensorLayout& z, const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter) : SizeArgs(o, src, filter, o->check_layout_fwd(src, filter, dst), bias, - z, dst) {} + z, dst, preprocessed_filter) {} ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( ConvBiasForwardImpl* o, const TensorLayout& src, const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, const TensorLayout& bias, const TensorLayout& z, - const TensorLayout& dst) + const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) : BiasForwardSizeArgs{concrete_handle(o->handle()), &src, &filter, @@ -126,14 +125,16 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs( filter_meta, &dst, o->param().nonlineMode}, - opr{o} {} + opr{o}, + preprocessed_filter{preprocessed_filter} {} ConvBiasForwardImpl::AlgoBase::ExecArgs::ExecArgs( ConvBiasForwardImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, - _megdnn_tensor_out dst, _megdnn_workspace workspace) + _megdnn_tensor_out dst, _megdnn_workspace workspace, + const PreprocessedFilter* preprocessed_filter) : SizeArgs(opr, src.layout, filter.layout, bias.layout, z.layout, - dst.layout), + dst.layout, preprocessed_filter), src_tensor{&src}, filter_tensor{&filter}, bias_tensor{&bias}, diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h index 32a214f17df593dd633f2c1c788905db44e03fa5..8325548f4f32d899e519640a3eeea5a613147cdd 100644 --- a/dnn/src/cuda/conv_bias/algo.h +++ b/dnn/src/cuda/conv_bias/algo.h @@ -41,16 +41,19 @@ public: AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } struct SizeArgs : public conv_bias::BiasForwardSizeArgs { ConvBiasForwardImpl* opr; - + const PreprocessedFilter* preprocessed_filter; + std::string to_string() const; SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, - const TensorLayout& z, const TensorLayout& dst); + const TensorLayout& z, const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter = nullptr); SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src, const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, const TensorLayout& bias, const TensorLayout& z, - const TensorLayout& dst); + const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter = nullptr); void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const { desc.set_conv_bias(*src_layout, filter_meta, *dst_layout, @@ -69,11 +72,21 @@ public: ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, - _megdnn_workspace workspace); + _megdnn_workspace workspace, + const PreprocessedFilter* preprocessed_filter = nullptr); }; virtual bool is_available(const SizeArgs& args) const = 0; virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; virtual void exec(const ExecArgs& args) const = 0; + virtual size_t get_preprocess_workspace_in_bytes( + const SizeArgs& args) const { + return 0; + } + virtual SmallVector deduce_preprocessed_filter_layout( + const SizeArgs& args) const { + return {}; + } + virtual void exec_preprocess(const ExecArgs& args) const {} bool is_available_wk(const SizeArgs& args, size_t limit) { return is_available(args) && get_workspace_in_bytes(args) <= limit; diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index d487c5a30890fa61e3eae39bd65612c2cb63c162..5205e40ad5643bf1d39cd2dc9f2793fb06b214a6 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -29,7 +29,8 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_workspace workspace) { check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, workspace.size, preprocessed_filter); - AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); + AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace, + preprocessed_filter); auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); algo->check_workspace(args, workspace).exec(args); @@ -205,17 +206,50 @@ const char* ConvBiasForwardImpl::get_algorithm_set_name() const { return "CONV_BIAS_CUDA"; } -size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& bias, - const TensorLayout& z, - const TensorLayout& dst, - const PreprocessedFilter*) { - AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; +size_t ConvBiasForwardImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter) { + AlgoBase::SizeArgs args{ + this, src, filter, bias, z, dst, preprocessed_filter}; return get_algorithm(this, src, filter, bias, z, dst) ->get_workspace_in_bytes(args); }; +size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) { + AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; + return get_algorithm(this, src, filter, bias, z, dst) + ->get_preprocess_workspace_in_bytes(args); +} + +SmallVector +ConvBiasForwardImpl::deduce_preprocessed_filter_layout( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) { + AlgoBase::SizeArgs args{this, src, filter, bias, z, dst}; + return get_algorithm(this, src, filter, bias, z, dst) + ->deduce_preprocessed_filter_layout(args); +} + +void ConvBiasForwardImpl::exec_preprocess( + const TensorLayout& src_layout, _megdnn_tensor_in filter, + const TensorLayout& bias_layout, const TensorLayout& z_layout, + const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) { + TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, + z{nullptr, z_layout}, bias{nullptr, bias_layout}; + AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace, + preprocessed_filter); + auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, + z.layout, dst.layout); + return algo->exec_preprocess(args); +} + } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index ca07b8f37452d5504b405d4a4703a47096d8625d..81f3f7710b15f1a25dcdaac7baaec2ee8a9ca5a8 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -44,21 +44,14 @@ public: const TensorLayout&, const TensorLayout&, const TensorLayout&, - const TensorLayout&) override { - return 0; - }; + const TensorLayout&) override; SmallVector deduce_preprocessed_filter_layout( const TensorLayout&, const TensorLayout&, const TensorLayout&, - const TensorLayout&, const TensorLayout&) override { - return {}; - } + const TensorLayout&, const TensorLayout&) override; void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, const TensorLayout&, const TensorLayout&, const TensorLayout&, PreprocessedFilter*, - _megdnn_workspace) override { - megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); - } - + _megdnn_workspace) override; const char* get_algorithm_set_name() const override; class AlgoBase; diff --git a/dnn/test/common/comparator.inl b/dnn/test/common/comparator.inl index e3caf319828ff8103c4cf1628ff5a881073d1b55..a83bff994bbbd3678a04e95d36a1c2c18a442d4b 100644 --- a/dnn/test/common/comparator.inl +++ b/dnn/test/common/comparator.inl @@ -32,6 +32,22 @@ class DefaultComparator { } }; +template <> +class DefaultComparator { +public: + bool is_same(dt_qint8 expected, dt_qint8 actual) const { + return expected.as_int8() == actual.as_int8(); + } +}; + +template <> +class DefaultComparator { +public: + bool is_same(dt_qint32 expected, dt_qint32 actual) const { + return expected.as_int32() == actual.as_int32(); + } +}; + } // namespace test } // namespace megdnn diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp index 14a1b26c7d87fbef46d378ca22b3c930571bbee4..298134a559f68c7e8e150dc9e872cb00cd3df6f3 100644 --- a/dnn/test/common/conv_bias.cpp +++ b/dnn/test/common/conv_bias.cpp @@ -741,10 +741,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, std::unique_ptr rng; std::unique_ptr bias_rng; std::unique_ptr const_rng; + std::unique_ptr zero_rng; // TODO: check range of rng if (src_dtype.enumv() == DTypeEnum::QuantizedS8) { rng = std::make_unique(-3, 3); const_rng = std::make_unique(1, 1); + zero_rng = std::make_unique(0, 0); megdnn_assert(bias_dtype.enumv() == DTypeEnum::QuantizedS32); bias_rng = std::make_unique(-50, 50); checker.set_epsilon(1 + 1e-3) @@ -775,6 +777,12 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, fh = arg.filter[2]; fw = arg.filter[3]; z[1] = arg.filter[0] / 4; + } else if (format == Format::NCHW32) { + hi = arg.src[2]; + wi = arg.src[3]; + fh = arg.filter[2]; + fw = arg.filter[3]; + z[1] = arg.filter[0] / 32; } else { megdnn_assert(format == Format::CHWN4); hi = arg.src[1]; @@ -798,7 +806,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype, megdnn_assert(rng != nullptr && bias_rng != nullptr); checker.set_rng(0, rng.get()) .set_rng(1, rng.get()) - .set_rng(2, rng.get()) + .set_rng(2, bias_rng.get()) .set_rng(3, rng.get()); if (args.empty()) { std::vector default_args; diff --git a/dnn/test/common/tensor.inl b/dnn/test/common/tensor.inl index 44021d2d468403efe04ee755f8a1de0a07825b30..a842f9f59dec0c0bfb49b9af0b1775fd2759b896 100644 --- a/dnn/test/common/tensor.inl +++ b/dnn/test/common/tensor.inl @@ -24,7 +24,8 @@ Tensor::Tensor(Handle *handle, TensorLayout layout): m_handle(handle), m_comparator(C()) { - layout.dtype = get_dtype_from_static_type(); + if (!layout.dtype.valid()) + layout.dtype = get_dtype_from_static_type(); m_tensornd.raw_ptr = megdnn_malloc(m_handle, layout.span().dist_byte()); m_tensornd.layout = layout; } @@ -67,10 +68,10 @@ void Tensor::check_with(const Tensor &rhs) const auto index = Index(m_tensornd.layout, linear_idx); auto offset = index.positive_offset(); ASSERT_TRUE(m_comparator.is_same(p0[offset], p1[offset])) - << "Index is " << index.to_string() - << "; layout is " << m_tensornd.layout.to_string() - << "; this->ptr()[offset] is " << this->ptr()[offset] - << "; rhs.ptr()[offset] is " << rhs.ptr()[offset]; + << "Index is " << index.to_string() << "; layout is " + << m_tensornd.layout.to_string() << "; this->ptr()[offset] is " + << this->ptr()[offset] << "; rhs.ptr()[offset] is " + << rhs.ptr()[offset]; } } diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 79a9a1859eeef1a0c35e1ec6b515ea5d7bea8257..3173be121ff909e0032bc31b4fbe714fe5ca7f7f 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -18,6 +18,8 @@ #include "test/cuda/benchmark.h" #include "test/cuda/fixture.h" #include "test/cuda/utils.h" +#include "test/common/tensor.h" +#include "test/common/workspace_wrapper.h" #define V1(x) #x #define V(x) V1(x) @@ -34,7 +36,6 @@ struct BenchArgs { std::vector get_resnet50_bench_args(size_t batch = 64) { std::vector args; args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1}); - args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2}); args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2}); @@ -44,7 +45,6 @@ std::vector get_resnet50_bench_args(size_t batch = 64) { args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1}); args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2}); - args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2}); args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2}); @@ -57,6 +57,7 @@ std::vector get_resnet50_bench_args(size_t batch = 64) { args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1}); args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1}); args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1}); + args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2}); args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2}); args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2}); @@ -331,6 +332,12 @@ void benchmark_target_algo_with_cudnn_tsc( if ((format == Format::CHWN4 || format == Format::NCHW4) && (arg.ci % 16 != 0)) continue; + Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 + ? Format::NCHW32 + : Format::NCHW4; + param.format = format_cudnn; + benchmarker_cudnn.set_param(param); + float time_in_ms = 0.f; if (algo) { time_in_ms = @@ -351,18 +358,14 @@ void benchmark_target_algo_with_cudnn_tsc( {}}) / RUNS; } - Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 - ? Format::NCHW32 - : Format::NCHW4; - param.format = format_cudnn; - benchmarker_cudnn.set_param(param); - auto time_in_ms_cudnn = + float time_in_ms_cudnn = benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn), get_tensor_shape(filter, format_cudnn), get_tensor_shape(bias, format_cudnn), {}, {}}) / RUNS; + float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12); printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, " @@ -1075,8 +1078,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) { #if CUDA_VERSION >= 10020 -/// \note: we only check several cases and block sizes in megdnn_test, the full -/// testcases are written in cutlass repository +/// \note: we only check several cases and block sizes in megdnn_test, the +/// full testcases are written in cutlass repository TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { require_compute_capability_eq(7, 5); Checker checker(handle_cuda());