diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp
index b3f6ae1847f972c3accb0d81436ac40ab5056551..390c2b718b5d77e02aa0dd3af36f75a626713799 100644
--- a/dnn/src/cuda/conv_bias/algo.cpp
+++ b/dnn/src/cuda/conv_bias/algo.cpp
@@ -63,12 +63,8 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     non_cudnn_algos.push_back(all_algos.rbegin()[1]);  // group batched_matmul
     non_cudnn_algos.push_back(all_algos.rbegin()[0]);  // group 1x1
 
-    algo_size = all_algos.size();
-    for (size_t i = 0; i < algo_size; ++i) {
-        bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i]));
-        all_algos.push_back(bfloat16_refhold.back().get());
-        bfloat16_algos.push_back(bfloat16_refhold.back().get());
-    }
+    all_algos.push_back(&bfloat16);
+    bfloat16_algos.push_back(&bfloat16);
 
     size_t all_algo_size = all_algos.size();
 #if CUDA_VERSION >= 10000
diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index 88cd1cac5230d3474ae09cf59d836b1fac971a67..0a49e957b4646cc93242ca1f37471d3c16885c2d 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -702,32 +702,20 @@ private:
 
 class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
 public:
-    AlgoBFloat16(AlgoBase* impl);
-
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     void exec(const ExecArgs& args) const override;
 
-    const char* name() const override { return m_name.c_str(); }
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;
 
-    bool is_reproducible() const override { return m_impl->is_reproducible(); }
+    const char* name() const override { return "CONVBIAS_BFLOAT16"; }
+    bool is_reproducible() const override { return true; }
 
     MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
-
-    std::string param() const override {
-        std::string ret;
-        serialize_write_pod(m_impl, ret);
-        return ret;
-    }
-
 private:
-    SizeArgs float_args(const SizeArgs& args, ConvBiasForwardImpl* opr,
-                        TensorLayout& fsrc, TensorLayout& ffilter,
-                        TensorLayout& fbias, TensorLayout& fz,
-                        TensorLayout& fdst) const;
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
-
-    AlgoBase* m_impl;
-    std::string m_name;
 };
 
@@ -766,7 +754,7 @@ public:
     std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
 #endif
     std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
-    std::vector<std::unique_ptr<AlgoBFloat16>> bfloat16_refhold;
+    AlgoBFloat16 bfloat16;
     std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
 
     AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
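Net effect of the two hunks above: rather than wrapping every concrete algorithm in its own heap-allocated "BFLOAT16:%s" shim held in bfloat16_refhold, AlgoPack now registers one statically owned CONVBIAS_BFLOAT16 algorithm, and the float32 implementation it delegates to is found through the generic sub-operator search that get_subopr_list() feeds. A minimal self-contained sketch of that contract follows; Layout, SubRequest and OprType are toy stand-ins for megdnn's TensorLayout, Algorithm::SearchItem and Algorithm::OprType, not real API.

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    enum class OprType { CONVBIAS_FORWARD };

    struct Layout {                   // stand-in for megdnn::TensorLayout
        std::string dtype;            // "BFloat16", "Float32", ...
    };

    struct SubRequest {               // stand-in for Algorithm::SearchItem
        OprType opr_type;             // operator the sub-task runs on
        std::string param;            // serialized operator param
        std::vector<Layout> layouts;  // layouts the sub-task should see
    };

    // The wrapper performs no convolution itself: it only promotes bf16
    // layouts to f32 and forwards a single CONVBIAS_FORWARD request.
    std::vector<SubRequest> get_subopr_list(std::vector<Layout> layouts) {
        for (auto& l : layouts)
            if (l.dtype == "BFloat16") l.dtype = "Float32";
        return {{OprType::CONVBIAS_FORWARD, "compute_mode=DEFAULT",
                 std::move(layouts)}};
    }

    int main() {
        auto reqs = get_subopr_list({{"BFloat16"}, {"BFloat16"}, {"BFloat16"}});
        for (const auto& r : reqs)
            for (const auto& l : r.layouts)
                std::cout << l.dtype << '\n';  // prints Float32 three times
    }

A dispatcher that receives such a request profiles the named operator type on the promoted layouts and records its pick as the wrapper's execution_policy().sub_policy[0]; the bfloat16.cpp changes below consume that entry.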
diff --git a/dnn/src/cuda/conv_bias/bfloat16.cpp b/dnn/src/cuda/conv_bias/bfloat16.cpp
index 7d4cf0390b78b3a7e41c245f50cb1e012526f215..10c45434dab2426e96b4f13e8347300e9ba0383b 100644
--- a/dnn/src/cuda/conv_bias/bfloat16.cpp
+++ b/dnn/src/cuda/conv_bias/bfloat16.cpp
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
 #include "src/cuda/conv_bias/algo.h"
@@ -18,58 +19,70 @@
 using namespace megdnn;
 using namespace cuda;
 using namespace conv_bias;
 
-ConvBiasForwardImpl::AlgoBFloat16::AlgoBFloat16(
-        ConvBiasForwardImpl::AlgoBase* algorithm)
-        : m_impl(algorithm) {
-    megdnn_assert_internal(algorithm);
-    m_name = ssprintf("BFLOAT16:%s", m_impl->name());
-}
-
-ConvBiasForwardImpl::AlgoBase::SizeArgs
-ConvBiasForwardImpl::AlgoBFloat16::float_args(
-        const SizeArgs& args, ConvBiasForwardImpl* opr, TensorLayout& fsrc,
-        TensorLayout& ffilter, TensorLayout& fbias, TensorLayout& fz,
-        TensorLayout& fdst) const {
-    fsrc = *args.src_layout;
-    ffilter = *args.filter_layout;
-    fbias = *args.bias_layout;
-    fz = *args.z_layout;
-    fdst = *args.dst_layout;
+namespace {
+std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
+        const TensorLayoutArray& layouts, const ConvBiasForwardImpl* opr) {
+    megdnn_assert(layouts.size() >= 3);
+    std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret;
+    ret.first = layouts;
     auto change_dtype = [](TensorLayout& layout) {
         if (layout.dtype == dtype::BFloat16()) {
             layout.dtype = dtype::Float32();
         }
     };
-    change_dtype(fsrc);
-    change_dtype(ffilter);
-    change_dtype(fbias);
-    change_dtype(fz);
-    change_dtype(fdst);
-    opr->param() = args.opr->param();
-    opr->param().compute_mode = Param::ComputeMode::DEFAULT;
-    opr->execution_policy() = {m_impl->desc(), {}};
-    return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst);
+    change_dtype(ret.first[0]);
+    change_dtype(ret.first[1]);
+    change_dtype(ret.first[2]);
+    change_dtype(ret.first[3]);
+    change_dtype(ret.first[4]);
+
+    ret.second = opr->param();
+    ret.second.compute_mode = ConvBiasForwardImpl::Param::ComputeMode::DEFAULT;
+    return ret;
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvBiasForwardImpl::AlgoBFloat16::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    auto&& config = sub_opr_config(
+            layouts, static_cast<const ConvBiasForwardImpl*>(opr));
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}};
 }
 
 bool ConvBiasForwardImpl::AlgoBFloat16::is_available(
         const SizeArgs& args) const {
-    TensorLayout fsrc, ffilter, fbias, fz, fdst;
     auto convbias_opr = args.handle->create_operator<ConvBiasForward>();
-    SizeArgs fargs = float_args(
-            args, static_cast<ConvBiasForwardImpl*>(convbias_opr.get()), fsrc,
-            ffilter, fbias, fz, fdst);
+    auto&& config = sub_opr_config(
+            {*args.src_layout, *args.filter_layout, *args.bias_layout,
+             *args.z_layout, *args.dst_layout},
+            args.opr);
+    convbias_opr->param() = config.second;
+
     return args.src_layout->dtype == args.filter_layout->dtype &&
            args.src_layout->dtype == dtype::BFloat16() &&
-           m_impl->is_available(fargs);
+           get_algorithm(static_cast<ConvBiasForwardImpl*>(convbias_opr.get()),
+                         config.first[0], config.first[1], config.first[2],
+                         config.first[3], config.first[4]);
 }
 
 WorkspaceBundle ConvBiasForwardImpl::AlgoBFloat16::get_workspace_bundle(
         void* ptr, const SizeArgs& args) const {
-    TensorLayout fsrc, ffilter, fbias, fz, fdst;
     auto convbias_opr = args.handle->create_operator<ConvBiasForward>();
-    SizeArgs fargs = float_args(
-            args, static_cast<ConvBiasForwardImpl*>(convbias_opr.get()), fsrc,
-            ffilter, fbias, fz, fdst);
+    if (args.opr->execution_policy().algo.valid()) {
+        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+        convbias_opr->execution_policy() =
+                args.opr->execution_policy().sub_policy[0];
+    }
+    auto&& config = sub_opr_config(
+            {*args.src_layout, *args.filter_layout, *args.bias_layout,
+             *args.z_layout, *args.dst_layout},
+            args.opr);
+    convbias_opr->param() = config.second;
+
     SmallVector<size_t> sizes;
     auto get_workspace = [&sizes](const TensorLayout& src,
                                   const TensorLayout& dst) {
@@ -77,12 +90,15 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoBFloat16::get_workspace_bundle(
             sizes.push_back(dst.span().dist_byte());
         }
     };
-    get_workspace(*args.src_layout, fsrc);
-    get_workspace(*args.filter_layout, ffilter);
-    get_workspace(*args.bias_layout, fbias);
-    get_workspace(*args.z_layout, fz);
-    get_workspace(*args.dst_layout, fdst);
-    sizes.push_back(m_impl->get_workspace_in_bytes(fargs));
+    get_workspace(*args.src_layout, config.first[0]);
+    get_workspace(*args.filter_layout, config.first[1]);
+    get_workspace(*args.bias_layout, config.first[2]);
+    get_workspace(*args.z_layout, config.first[3]);
+    get_workspace(*args.dst_layout, config.first[4]);
+    sizes.push_back(convbias_opr->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2], config.first[3],
+            config.first[4], nullptr));
+
     return {ptr, std::move(sizes)};
 }
@@ -110,7 +126,12 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
         auto convbias_opr = args.handle->create_operator<ConvBiasForward>();
         convbias_opr->param() = args.opr->param();
         convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
-        convbias_opr->execution_policy() = {m_impl->desc(), {}};
+        if (args.opr->execution_policy().algo.valid()) {
+            megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+            convbias_opr->execution_policy() =
+                    args.opr->execution_policy().sub_policy[0];
+        }
+
         convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor,
                            fdst_tensor, nullptr, cvter.workspace());
     }
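Two points in the rewritten bfloat16.cpp carry the logic: sub_opr_config() promotes every BFloat16 layout to Float32 and pins compute_mode to DEFAULT, and get_workspace_bundle() reserves one staging buffer per converted tensor plus whatever workspace the inner float32 operator reports. A rough sketch of that size arithmetic, with made-up element counts (nothing below is a real megdnn call):

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        // Element counts for src, filter, bias, z, dst (z is empty here);
        // all five tensors are bf16 and get an f32 staging copy.
        std::vector<size_t> elems = {1 * 3 * 32 * 32, 8 * 3 * 3 * 3,
                                     1 * 8 * 1 * 1, 0, 1 * 8 * 30 * 30};
        std::vector<size_t> sizes;
        for (size_t n : elems)
            if (n)                       // skip the empty z tensor
                sizes.push_back(n * 4);  // 4 bytes per converted f32 element
        sizes.push_back(4096);           // plus the inner f32 algo's workspace
        std::cout << "bundle total: "
                  << std::accumulate(sizes.begin(), sizes.end(), size_t{0})
                  << " bytes\n";
    }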
diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp
index 0eb52c4196c0ff4e08ea3028bec3f6dbe1e3b98e..6b925a4e76092329a56ef78847105b6f7a0be3e2 100644
--- a/dnn/src/naive/conv_bias/opr_impl.cpp
+++ b/dnn/src/naive/conv_bias/opr_impl.cpp
@@ -214,6 +214,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
         DISPATCH_RAW(Float16, Float16, Float16, FLOAT32,
                      (convolution::forward_bias<dt_float16, dt_float16,
                                                 dt_float16, dt_float32>))
+        DISPATCH_RAW(BFloat16, BFloat16, BFloat16, FLOAT32,
+                     (convolution::forward_bias<dt_bfloat16, dt_bfloat16,
+                                                dt_bfloat16, dt_float32>))
 #endif
         else {
             megdnn_throw(ssprintf(
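The naive backend now accepts BFloat16 src/filter/bias with a FLOAT32 compute type, mirroring the Float16 line above it: values are stored as bf16 while accumulation runs in f32. A self-contained illustration of that split; the truncation-based bf16_round() below is a deliberately crude stand-in, not megdnn's dt_bfloat16 implementation:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Keep only the top 16 bits of the f32 bit pattern: a crude bf16
    // round-trip that makes the precision story concrete.
    static float bf16_round(float x) {
        uint32_t u;
        std::memcpy(&u, &x, 4);
        u &= 0xFFFF0000u;
        std::memcpy(&x, &u, 4);
        return x;
    }

    int main() {
        const float w[4] = {0.1f, 0.2f, 0.3f, 0.4f};
        const float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float acc = 0.f;  // f32 accumulator (the FLOAT32 column)
        for (int i = 0; i < 4; ++i)
            acc += bf16_round(w[i]) * bf16_round(v[i]);  // bf16 operands
        std::cout << bf16_round(acc) << '\n';  // result stored back as bf16
    }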
*/ +#include "megdnn/dtype.h" #include "test/cuda/fixture.h" #include "megdnn/opr_param_defs.h" @@ -108,6 +109,32 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_F32) { } } +TEST_F(CUDA, CONV_BIAS_FORWARD_BF16) { + using namespace conv_bias; + std::vector args = get_args(); + Checker checker(handle_cuda()); + + checker.set_before_exec_callback( + AlgoChecker(ExecutionPolicyAlgoName{ + "CONVBIAS_BFLOAT16", {{"MATMUL", {}}}})); + NormalRNG default_rng; + for (auto&& arg : args) { + arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; + checker.set_dtype(0, dtype::BFloat16()) + .set_dtype(1, dtype::BFloat16()) + .set_dtype(2, dtype::BFloat16()) + .set_dtype(3, dtype::BFloat16()) + .set_dtype(4, dtype::BFloat16()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_rng(2, &default_rng) + .set_epsilon(2e-2) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } +} + + TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) { require_compute_capability(6, 1); diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 93f083c9bcdb8494625385a9bdd1393cacc1cc14..e2a82ba59ced7520311ef10b14337906546c8d95 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -80,7 +80,8 @@ TEST_F(CUDA, CONVOLUTION_FORWARD) Checker checker(handle_cuda()); NormalRNG default_rng; for (auto &&arg: args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); + float scale = + 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); checker. set_dtype(0, dtype::Float32()). @@ -115,7 +116,6 @@ TEST_F(CUDA, CONVOLUTION_FORWARD) .set_epsilon(1e-1) .set_param(arg.param) .execs({arg.src, arg.filter, {}}); - } }