Commit 1cfdbc56 authored by Megvii Engine Team, committed by huangxinda

feat(dnn): add deterministic max pooling

GitOrigin-RevId: 9ab4c7a74839b661fe5ed052a20a66b6732c09d0
Parent 20ab82d0
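Reviewer note: cuDNN 6 added CUDNN_POOLING_MAX_DETERMINISTIC, a max-pooling mode whose kernels resolve ties in a fixed order, so repeated runs produce bit-identical results (this matters mainly for the backward pass). This commit exposes that mode as a dedicated forward algorithm and as the REPRODUCIBLE flavour of the cuDNN backward algorithm. A minimal sketch of the underlying cuDNN calls, independent of MegDNN (window/stride values invented for illustration):

    #include <cudnn.h>

    // 2x2, stride-2 max pooling using the deterministic kernel
    // (requires cuDNN >= 6).
    cudnnPoolingDescriptor_t desc;
    cudnnCreatePoolingDescriptor(&desc);
    cudnnSetPooling2dDescriptor(desc, CUDNN_POOLING_MAX_DETERMINISTIC,
                                CUDNN_NOT_PROPAGATE_NAN,
                                /*windowH=*/2, /*windowW=*/2,
                                /*padH=*/0, /*padW=*/0,
                                /*strideH=*/2, /*strideW=*/2);
    // ... use desc with cudnnPoolingForward / cudnnPoolingBackward ...
    cudnnDestroyPoolingDescriptor(desc);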
......@@ -48,6 +48,8 @@ INST_ARITY(megdnn::ConvBias, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);
INST_ARITY(megdnn::MatrixMul, 2, 1);
INST_ARITY(megdnn::BatchedMatrixMul, 2, 1);
INST_ARITY(megdnn::PoolingForward, 1, 1);
INST_ARITY(megdnn::PoolingBackward, 3, 1);
#undef INST_ARITY
......
......@@ -259,6 +259,8 @@ public:
DEFORMABLE_CONV_BACKWARD_FILTER,
CONVBIAS_FORWARD,
BATCH_CONV_FORWARD,
POOLING_FORWARD,
POOLING_BACKWARD,
};
struct SearchItem {
......@@ -334,6 +336,63 @@ private:
ExecutionPolicy m_execution_policy;
};
//! specialize for nargs == 2
template <class Opr>
class MultiAlgoOpr<Opr, 2> : public MultiAlgoOpr<Opr, -1> {
public:
using Algorithm = detail::Algorithm;
using AlgorithmInfo = detail::Algorithm::Info;
using AlgoAttribute = detail::Algorithm::Attribute;
//! get all possible algorithm descriptions for the specified layouts
std::vector<AlgorithmInfo> get_all_algorithms_info(const TensorLayout& p0,
const TensorLayout& p1) {
std::vector<AlgorithmInfo> ret;
for (auto&& algo : get_all_algorithms(p0, p1)) {
ret.emplace_back(algo->info());
}
return ret;
}
/**
* \brief Return information about the best algorithm, as selected by the
* heuristic.
*
* The selected algorithm should not use more workspace than
* \p workspace_limit_in_bytes.
*/
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& p0, const TensorLayout& p1,
size_t workspace_limit_in_bytes =
std::numeric_limits<size_t>::max(),
const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return get_algorithm_heuristic(p0, p1, workspace_limit_in_bytes,
positive_attr, negative_attr)
->info();
}
protected:
~MultiAlgoOpr() = default;
//! get all possible algorithms for the specified layouts
virtual std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& p0, const TensorLayout& p1) = 0;
/**
* \brief Returns the best algorithm by heuristic.
*
* The selected algorithm should not use more workspace than
* \p workspace_limit_in_bytes.
*/
virtual Algorithm* get_algorithm_heuristic(
const TensorLayout& p0, const TensorLayout& p1,
size_t workspace_limit_in_bytes =
std::numeric_limits<size_t>::max(),
const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) = 0;
};
//! specialize for nargs == 3
template <class Opr>
class MultiAlgoOpr<Opr, 3> : public MultiAlgoOpr<Opr, -1> {
......
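Editor's note: with the new nargs == 2 specialization, PoolingForward (declared below as MultiAlgoOpr<PoolingForward, 2>) gains the standard algorithm-query interface. A hedged usage sketch, assuming an already-created megdnn handle and an invented shape:

    auto opr = handle->create_operator<megdnn::PoolingForward>();
    megdnn::TensorLayout src({16, 64, 32, 32}, megdnn::dtype::Float32());
    megdnn::TensorLayout dst;
    opr->deduce_layout(src, dst);
    // Ask the heuristic for an algorithm carrying the REPRODUCIBLE attribute:
    auto info = opr->get_algorithm_info_heuristic(
            src, dst, std::numeric_limits<size_t>::max(),
            megdnn::AlgoAttribute::REPRODUCIBLE,
            megdnn::AlgoAttribute::DEFAULT);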
......@@ -713,7 +713,8 @@ protected:
void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
};
class PoolingForward : public PoolingBase {
class PoolingForward : public PoolingBase,
public detail::MultiAlgoOpr<PoolingForward, 2> {
DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1);
public:
......@@ -734,7 +735,8 @@ protected:
using Pooling = PoolingForward;
class PoolingBackward : public PoolingBase {
class PoolingBackward : public PoolingBase,
public detail::MultiAlgoOpr<PoolingBackward, 4> {
DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1);
public:
......
......@@ -69,7 +69,7 @@ std::vector<typename Opr::Algorithm*> get_all_algorithms(
ret.push_back(i);
}
}
megdnn_assert(!ret.empty(), "no conv algorithm for %s",
megdnn_assert(!ret.empty(), "no algorithm for %s",
args.to_string().c_str());
return ret;
}
......
......@@ -294,32 +294,6 @@ void ConvDesc::set(DType data_type, const param::Convolution& param,
#endif
}
PoolingDesc::PoolingDesc() {
cudnn_check(cudnnCreatePoolingDescriptor(&desc));
}
PoolingDesc::~PoolingDesc() {
cudnn_check(cudnnDestroyPoolingDescriptor(desc));
}
void PoolingDesc::set(const param::Pooling& param) {
cudnnPoolingMode_t mode;
switch (param.mode) {
case param::Pooling::Mode::MAX:
mode = CUDNN_POOLING_MAX;
break;
case param::Pooling::Mode::AVERAGE:
mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
}
cudnn_check(cudnnSetPooling2dDescriptor(
desc, mode, CUDNN_NOT_PROPAGATE_NAN, param.window_h, param.window_w,
param.pad_h, param.pad_w, param.stride_h, param.stride_w));
}
LRNDesc::LRNDesc() {
cudnn_check(cudnnCreateLRNDescriptor(&desc));
}
......
......@@ -54,14 +54,6 @@ class ConvDesc {
cudnnConvolutionDescriptor_t desc;
};
class PoolingDesc {
public:
PoolingDesc();
void set(const param::Pooling &param);
~PoolingDesc();
cudnnPoolingDescriptor_t desc;
};
class LRNDesc {
public:
LRNDesc();
......
/**
* \file dnn/src/cuda/pooling/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./algo.h"
#include "./pooling2d_qint.cuh"
#include "src/cuda/utils.h"
using namespace megdnn;
using namespace cuda;
namespace {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_NAME(NAME) \
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
} // namespace
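// Editor's note: DEF_NAME bakes the cuDNN version into each algorithm name
// via adjacent string-literal pasting. For example, against cuDNN 7.6.5:
//   DEF_NAME(cudnnForward)
//     => "cudnnForward" "v" "7" "." "6" "." "5"
//     => "cudnnForwardv7.6.5"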
PoolingForwardImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&algo_chwn4);
all_algos.push_back(&algo_nchw4);
all_algos.push_back(&algo_nchw32);
all_algos.push_back(&algo_nhwc);
all_algos.push_back(&algo_nchw64);
all_algos.push_back(&algo_cudnn);
#if CUDNN_VERSION >= 6000
all_algos.push_back(&algo_cudnn_max_deterministic);
#endif
for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo);
}
}
PoolingForwardImpl::AlgoPack PoolingForwardImpl::sm_algo_pack;
MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingForwardImpl)
PoolingForwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingForwardImpl* o,
const TensorLayout& src,
const TensorLayout& dst)
: handle{concrete_handle(o->handle())},
opr{o},
layout_src{&src},
layout_dst{&dst} {}
PoolingForwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingForwardImpl* opr,
_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace)
: SizeArgs(opr, src.layout, dst.layout),
src_tensor{&src},
dst_tensor{&dst},
workspace{workspace} {}
std::string PoolingForwardImpl::AlgoBase::SizeArgs::to_string() const {
return ssprintf("src=%s, dst=%s", layout_src->to_string().c_str(),
layout_dst->to_string().c_str());
}
WorkspaceBundle PoolingForwardImpl::AlgoBase::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
SmallVector<size_t> sizes;
TensorLayout fsrc = *args.layout_src;
TensorLayout fdst = *args.layout_dst;
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
sizes.push_back(layout.span().dist_byte());
}
};
get_workspace(fsrc);
get_workspace(fdst);
return {ptr, std::move(sizes)};
}
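// Editor's note: the bundle above reserves scratch only for BFloat16
// tensors, sized for their Float32 staging copies. Worked example with an
// invented shape:
//   src = {16, 64, 32, 32}, dtype BFloat16
//   Float32 copy = 16 * 64 * 32 * 32 elements * 4 bytes
//                = 1048576 * 4 = 4194304 bytes (4 MiB);
// tensors of any other dtype contribute nothing.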
size_t PoolingForwardImpl::AlgoBase::get_workspace_in_bytes(
const SizeArgs& args) const {
return get_workspace_bundle(nullptr, args).total_size_in_bytes();
}
bool PoolingForwardImpl::AlgoCUDNN::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return (((args.opr->param().format == Format::NCHW ||
args.opr->param().format == Format::NHWC) &&
(args.layout_src->dtype.enumv() == DTypeEnum::Float16 ||
args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 ||
args.layout_src->dtype.enumv() == DTypeEnum::Float32 ||
args.layout_src->dtype.enumv() == DTypeEnum::Int8 ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 ||
args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) ||
((args.opr->param().format == Format::NCHW4 ||
args.opr->param().format == Format::NCHW32) &&
(args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 ||
args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)));
}
void PoolingForwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args,
cudnnPoolingMode_t& mode) const {
switch (args.opr->param().mode) {
case param::Pooling::Mode::MAX:
mode = CUDNN_POOLING_MAX;
break;
case param::Pooling::Mode::AVERAGE:
mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
default:
megdnn_throw(ssprintf("Unspport pooling mode : {%d}",
static_cast<int>(args.opr->param().mode)));
}
}
void PoolingForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const {
TensorND src = *args.src_tensor;
TensorND dst = *args.dst_tensor;
auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(args.handle), &wsb);
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(*args.src_tensor, src)
.src_to_comp_type(*args.dst_tensor, dst);
}
{
dt_float32 alpha = 1.0f, beta = 0.0f;
TensorDesc src_desc, dst_desc;
src_desc.set(src.layout, args.opr->param().format);
dst_desc.set(dst.layout, args.opr->param().format);
cudnnPoolingMode_t mode;
init_mode(args, mode);
cudnnPoolingDescriptor_t cudnn_desc;
cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc));
cudnn_check(cudnnSetPooling2dDescriptor(
cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN,
args.opr->param().window_h, args.opr->param().window_w,
args.opr->param().pad_h, args.opr->param().pad_w,
args.opr->param().stride_h, args.opr->param().stride_w));
cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc,
&alpha, src_desc.desc, src.raw_ptr,
&beta, dst_desc.desc, dst.raw_ptr));
cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc));
}
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(dst, *args.dst_tensor);
}
}
#if CUDNN_VERSION >= 6000
bool PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::is_available(
const SizeArgs& args) const {
using Format = param::Pooling::Format;
return (args.opr->param().mode == param::Pooling::Mode::MAX &&
(((args.opr->param().format == Format::NCHW ||
args.opr->param().format == Format::NHWC) &&
(args.layout_src->dtype.enumv() == DTypeEnum::Float16 ||
args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 ||
args.layout_src->dtype.enumv() == DTypeEnum::Float32 ||
args.layout_src->dtype.enumv() == DTypeEnum::Int8 ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 ||
args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) ||
((args.opr->param().format == Format::NCHW4 ||
args.opr->param().format == Format::NCHW32) &&
(args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 ||
args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm))));
}
void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::init_mode(
const ExecArgs& args, cudnnPoolingMode_t& mode) const {
switch (args.opr->param().mode) {
case param::Pooling::Mode::MAX:
mode = CUDNN_POOLING_MAX_DETERMINISTIC;
break;
default:
megdnn_throw(ssprintf("Unspport pooling mode : {%d}",
static_cast<int>(args.opr->param().mode)));
}
}
void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::exec(
const ExecArgs& args) const {
TensorND src = *args.src_tensor;
TensorND dst = *args.dst_tensor;
auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(args.handle), &wsb);
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(*args.src_tensor, src)
.src_to_comp_type(*args.dst_tensor, dst);
}
{
dt_float32 alpha = 1.0f, beta = 0.0f;
TensorDesc src_desc, dst_desc;
src_desc.set(src.layout, args.opr->param().format);
dst_desc.set(dst.layout, args.opr->param().format);
cudnnPoolingMode_t mode;
init_mode(args, mode);
cudnnPoolingDescriptor_t cudnn_desc;
cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc));
cudnn_check(cudnnSetPooling2dDescriptor(
cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN,
args.opr->param().window_h, args.opr->param().window_w,
args.opr->param().pad_h, args.opr->param().pad_w,
args.opr->param().stride_h, args.opr->param().stride_w));
cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc,
&alpha, src_desc.desc, src.raw_ptr,
&beta, dst_desc.desc, dst.raw_ptr));
cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc));
}
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(dst, *args.dst_tensor);
}
}
#endif
bool PoolingForwardImpl::AlgoCHWN4::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return (args.opr->param().format == Format::CHWN4 &&
(args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8));
}
void PoolingForwardImpl::AlgoCHWN4::exec(const ExecArgs& args) const {
pooling2d::Param kern_param;
size_t c = (*args.layout_src)[0], hi = (*args.layout_src)[1],
wi = (*args.layout_src)[2], n = (*args.layout_src)[3],
ho = (*args.layout_dst)[1], wo = (*args.layout_dst)[2];
c = c * 4;  // CHWN4: dim 0 of the layout stores C/4; restore the full C
size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w;
size_t window_h = args.opr->param().window_h,
window_w = args.opr->param().window_w;
size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi,
kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph,
kern_param.pw = pw, kern_param.window_h = window_h,
kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(args.handle);
pooling2d::do_pooling2d_int8_cdiv4hwn4(
args.src_tensor->compatible_ptr<int8_t>(),
args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream,
static_cast<uint32_t>(args.opr->param().mode));
}
bool PoolingForwardImpl::AlgoNCHW4::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return args.opr->param().format == Format::NCHW4 &&
(args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8);
}
void PoolingForwardImpl::AlgoNCHW4::exec(const ExecArgs& args) const {
pooling2d::Param kern_param;
size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2],
wi = (*args.layout_src)[3], c = (*args.layout_src)[1],
ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3];
c = c * 4;  // NCHW4: dim 1 stores C/4
size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w;
size_t window_h = args.opr->param().window_h,
window_w = args.opr->param().window_w;
size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi,
kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph,
kern_param.pw = pw, kern_param.window_h = window_h,
kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(args.handle);
pooling2d::do_pooling2d_int8_ncdiv4hw4(
args.src_tensor->compatible_ptr<int8_t>(),
args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream,
static_cast<uint32_t>(args.opr->param().mode));
}
bool PoolingForwardImpl::AlgoNCHW32::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return (args.opr->param().format == Format::NCHW32 &&
(args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8));
}
void PoolingForwardImpl::AlgoNCHW32::exec(const ExecArgs& args) const {
pooling2d::Param kern_param;
size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2],
wi = (*args.layout_src)[3], c = (*args.layout_src)[1],
ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3];
c = c * 32;  // NCHW32: dim 1 stores C/32
size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w;
size_t window_h = args.opr->param().window_h,
window_w = args.opr->param().window_w;
size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi,
kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph,
kern_param.pw = pw, kern_param.window_h = window_h,
kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(args.handle);
pooling2d::do_pooling2d_int8_ncdiv32hw32(
args.src_tensor->compatible_ptr<int8_t>(),
args.dst_tensor->compatible_ptr<int8_t>(), kern_param, stream,
static_cast<uint32_t>(args.opr->param().mode));
}
bool PoolingForwardImpl::AlgoNHWC::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return (args.opr->param().format == Format::NHWC &&
(args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm ||
args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4));
}
void PoolingForwardImpl::AlgoNHWC::exec(const ExecArgs& args) const {
TensorND src = *args.src_tensor;
TensorND dst = *args.dst_tensor;
{
megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(),
"src and dst dtype must equal");
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2],
c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2];
size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w;
size_t window_h = args.opr->param().window_h,
window_w = args.opr->param().window_w;
size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h,
kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point =
src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
}
auto&& stream = cuda_stream(args.handle);
pooling2d::do_pooling2d_int4_nhwc(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream,
static_cast<uint32_t>(args.opr->param().mode), uint_case,
zero_point);
}
}
inline void PoolingForwardImpl::AlgoNCHW64::deduce_reformat_layout(
std::unique_ptr<RelayoutFormat>& relayout,
const TensorLayout& src_layout, TensorLayout& dst_layout,
RelayoutFormat::Param::Mode mode, const int oc = 0,
const int group = 1) const {
if (src_layout.ndim > 0) {
RelayoutFormat::Param trans_param;
trans_param.mode = mode;
trans_param.oc = oc;
trans_param.group = group;
relayout->param() = trans_param;
relayout->deduce_layout(src_layout, dst_layout);
} else {
dst_layout = src_layout;
}
}
void PoolingForwardImpl::AlgoNCHW64::get_inner_layout(
const TensorLayout& src, const TensorLayout& dst,
TensorLayout& inner_src, TensorLayout& inner_dst, Handle* handle,
PoolingForwardImpl::Param::Format format) const {
auto relayout_opr = handle->create_operator<RelayoutFormat>();
deduce_reformat_layout(relayout_opr, src, inner_src,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
deduce_reformat_layout(relayout_opr, dst, inner_dst,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
}
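// Editor's note: a sketch of what NCHW_NCHW64 deduction produces, with an
// invented shape (the exact padded layout is RelayoutFormat's to decide;
// channels are rounded up to a multiple of 64 and packed into a trailing
// axis):
//   src NCHW {16,  3, 32, 32}
//   dst      {16,  1, 32, 32, 64}   // ceil(3 / 64) = 1 channel block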
WorkspaceBundle PoolingForwardImpl::AlgoNCHW64::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
using Format = param::Pooling::Format;
SmallVector<size_t> sizes;
TensorLayout fsrc = *args.layout_src;
TensorLayout fdst = *args.layout_dst;
if (args.opr->param().format == Format::NCHW) {
get_inner_layout(*args.layout_src, *args.layout_dst, fsrc, fdst,
args.handle, args.opr->param().format);
sizes.push_back(fsrc.span().dist_byte());
sizes.push_back(fdst.span().dist_byte());
}
return {ptr, std::move(sizes)};
}
bool PoolingForwardImpl::AlgoNCHW64::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
return ((args.opr->param().format == Format::NCHW ||
args.opr->param().format == Format::NCHW64) &&
(args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4 ||
args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
(args.layout_dst->dtype.enumv() == DTypeEnum::QuantizedS4 ||
args.layout_dst->dtype.enumv() == DTypeEnum::Quantized4Asymm));
}
void PoolingForwardImpl::AlgoNCHW64::exec(const ExecArgs& args) const {
using Format = param::Pooling::Format;
TensorND src = *args.src_tensor;
TensorND dst = *args.dst_tensor;
if (args.opr->param().format == Format::NCHW) {
auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args);
auto handle_ptr = args.handle;
get_inner_layout(*args.layout_src, *args.layout_dst, src.layout,
dst.layout, handle_ptr, args.opr->param().format);
src.raw_ptr = wsb.get(0);
dst.raw_ptr = wsb.get(1);
auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64;
relayout_opr->param() = trans_param;
relayout_opr->exec(*args.src_tensor, src, {});
}
{
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 64;  // NCHW64: dim 1 stores C/64
size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w;
size_t window_h = args.opr->param().window_h,
window_w = args.opr->param().window_w;
size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h,
kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point =
src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
}
auto&& stream = cuda_stream(args.handle);
pooling2d::do_pooling2d_int4_ncdiv64hw64(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream,
static_cast<uint32_t>(args.opr->param().mode), uint_case,
zero_point);
}
if (args.layout_dst->ndim == 4) {
auto relayout_opr = args.handle->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW;
relayout_opr->param() = trans_param;
relayout_opr->exec(dst, *args.dst_tensor, {});
}
}
PoolingBackwardImpl::AlgoPack::AlgoPack() {
algo_cudnn.push_back({DEF_NAME(cudnnUnreproducible), false});
algo_cudnn.push_back({DEF_NAME(cudnnReproducible), true});
for (auto&& i : algo_cudnn) {
all_algos.push_back(&i);
}
for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo);
}
}
PoolingBackwardImpl::AlgoPack PoolingBackwardImpl::sm_algo_pack;
MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingBackwardImpl)
PoolingBackwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingBackwardImpl* o,
const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad)
: handle{concrete_handle(o->handle())},
opr{o},
layout_src{&src},
layout_dst{&dst},
layout_diff{&diff},
layout_grad{&grad} {}
PoolingBackwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingBackwardImpl* opr,
_megdnn_tensor_in src,
_megdnn_tensor_in dst,
_megdnn_tensor_in diff,
_megdnn_tensor_out grad,
_megdnn_workspace workspace)
: SizeArgs(opr, src.layout, dst.layout, diff.layout, grad.layout),
src_tensor{&src},
dst_tensor{&dst},
diff_tensor{&diff},
grad_tensor{&grad},
workspace{workspace} {}
std::string PoolingBackwardImpl::AlgoBase::SizeArgs::to_string() const {
return ssprintf(
"src=%s, dst=%s, diff=%s, grad=%s", layout_src->to_string().c_str(),
layout_dst->to_string().c_str(), layout_diff->to_string().c_str(),
layout_grad->to_string().c_str());
}
bool PoolingBackwardImpl::AlgoCUDNN::is_available(const SizeArgs& args) const {
using Format = param::Pooling::Format;
#if CUDNN_VERSION < 6000
return ((args.opr->param().format == Format::NCHW ||
args.opr->param().format == Format::NHWC ||
args.opr->param().format == Format::NCHW4 ||
args.opr->param().format == Format::NCHW32) &&
(m_is_reproducible ^
(args.opr->param().mode == param::Pooling::Mode::MAX)));
#else
return ((args.opr->param().format == Format::NCHW ||
args.opr->param().format == Format::NHWC ||
args.opr->param().format == Format::NCHW4 ||
args.opr->param().format == Format::NCHW32) &&
(m_is_reproducible ||
args.opr->param().mode == param::Pooling::Mode::MAX));
#endif
}
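// Editor's note: before cuDNN 6 there is no deterministic MAX kernel, so
// the XOR above makes the two cuDNN backward flavours partition the modes;
// from cuDNN 6 on, the reproducible flavour accepts every mode (init_mode
// below maps MAX to CUDNN_POOLING_MAX_DETERMINISTIC) while the
// non-reproducible one only adds plain MAX.
//   CUDNN_VERSION < 6000: available iff m_is_reproducible ^ (mode == MAX)
//     reproducible, MAX      -> false  (no deterministic MAX kernel yet)
//     reproducible, AVERAGE* -> true   (average modes are deterministic)
//     plain,        MAX      -> true   (CUDNN_POOLING_MAX)
//     plain,        AVERAGE* -> false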
WorkspaceBundle PoolingBackwardImpl::AlgoBase::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
SmallVector<size_t> sizes;
TensorLayout fsrc = *args.layout_src;
TensorLayout fdst = *args.layout_dst;
TensorLayout fdiff = *args.layout_diff;
TensorLayout fgrad = *args.layout_grad;
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
sizes.push_back(layout.span().dist_byte());
}
};
get_workspace(fsrc);
get_workspace(fdst);
get_workspace(fdiff);
get_workspace(fgrad);
return {ptr, std::move(sizes)};
}
size_t PoolingBackwardImpl::AlgoBase::get_workspace_in_bytes(
const SizeArgs& args) const {
return get_workspace_bundle(nullptr, args).total_size_in_bytes();
}
void PoolingBackwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args,
cudnnPoolingMode_t& mode) const {
if (m_is_reproducible) {
switch (args.opr->param().mode) {
#if CUDNN_VERSION >= 6000
case param::Pooling::Mode::MAX:
mode = CUDNN_POOLING_MAX_DETERMINISTIC;
break;
#endif
case param::Pooling::Mode::AVERAGE:
mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
default:
megdnn_throw(
ssprintf("Unspport pooling mode : {%d}",
static_cast<int>(args.opr->param().mode)));
}
} else if (args.opr->param().mode == param::Pooling::Mode::MAX) {
mode = CUDNN_POOLING_MAX;
} else {
megdnn_throw("init_mode failed\n");
}
}
void PoolingBackwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const {
TensorND src = *args.src_tensor;
TensorND dst = *args.dst_tensor;
TensorND diff = *args.diff_tensor;
TensorND grad = *args.grad_tensor;
auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(args.handle), &wsb);
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(*args.src_tensor, src)
.src_to_comp_type(*args.dst_tensor, dst)
.src_to_comp_type(*args.diff_tensor, diff)
.src_to_comp_type(*args.grad_tensor, grad);
}
{
dt_float32 alpha = 1.0f, beta = 0.0f;
TensorDesc src_desc, dst_desc, diff_desc, grad_desc;
src_desc.set(src.layout, args.opr->param().format);
dst_desc.set(dst.layout, args.opr->param().format);
diff_desc.set(diff.layout, args.opr->param().format);
grad_desc.set(grad.layout, args.opr->param().format);
cudnnPoolingMode_t mode;
init_mode(args, mode);
cudnnPoolingDescriptor_t cudnn_desc;
cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc));
cudnn_check(cudnnSetPooling2dDescriptor(
cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN,
args.opr->param().window_h, args.opr->param().window_w,
args.opr->param().pad_h, args.opr->param().pad_w,
args.opr->param().stride_h, args.opr->param().stride_w));
cudnn_check(cudnnPoolingBackward(
args.handle->cudnn_handle(), cudnn_desc, &alpha, dst_desc.desc,
dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc,
src.raw_ptr, &beta, grad_desc.desc, grad.raw_ptr));
cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc));
}
if (args.layout_src->dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(grad, *args.grad_tensor);
}
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/pooling/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <unordered_map>
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/pooling/opr_impl.h"
namespace megdnn {
namespace cuda {
namespace {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_NAME(NAME) \
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
} // namespace
class PoolingForwardImpl::AlgoBase : public Algorithm {
public:
enum class AlgoType : uint32_t {
CUDA_CUDNN,
#if CUDNN_VERSION >= 6000
CUDA_CUDNN_MAXDETERMINISTIC,
#endif
CUDA_CHWN4,
CUDA_NCHW4,
CUDA_NCHW32,
CUDA_NHWC,
CUDA_NCHW64
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs {
HandleImpl* handle;
PoolingForwardImpl* opr;
const TensorLayout *layout_src, *layout_dst;
std::string to_string() const;
SizeArgs(PoolingForwardImpl* opr, const TensorLayout& src,
const TensorLayout& dst);
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *dst_tensor;
Workspace workspace;
ExecArgs(PoolingForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_out dst, _megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
size_t get_workspace_in_bytes(const SizeArgs& args) const;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_attribute(
const SizeArgs& args,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && is_available(args);
}
protected:
~AlgoBase() = default;
virtual WorkspaceBundle get_workspace_bundle(void* ptr,
const SizeArgs& args) const;
};
class PoolingForwardImpl::AlgoCUDNN final : public AlgoBase {
std::string m_algo_name;
public:
AlgoCUDNN(std::string name) : m_algo_name(name) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
std::string param() const override { return m_algo_name; }
};
#if CUDNN_VERSION >= 6000
class PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC final : public AlgoBase {
std::string m_algo_name;
public:
AlgoCUDNNMAXDETERMINISTIC(std::string name) : m_algo_name(name) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_MAXDETERMINISTIC)
std::string param() const override { return m_algo_name; }
};
#endif
#define ALGO_LAYOUT_POOLING_IMPL(_layout) \
class PoolingForwardImpl::Algo##_layout final : public AlgoBase { \
std::string m_algo_name; \
\
public: \
Algo##_layout( \
std::string name = std::string("CUDA_").append(#_layout)) \
: m_algo_name(name) {} \
bool is_available(const SizeArgs& args) const override; \
void exec(const ExecArgs& args) const override; \
const char* name() const override { return m_algo_name.c_str(); } \
AlgoAttribute attribute() const override { \
return AlgoAttribute::REPRODUCIBLE; \
} \
MEGDNN_DECL_ALGO_TYPE(CUDA_##_layout)
ALGO_LAYOUT_POOLING_IMPL(CHWN4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW32)};
ALGO_LAYOUT_POOLING_IMPL(NHWC)};
ALGO_LAYOUT_POOLING_IMPL(NCHW64) //{
protected:
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args)
const override;
private:
inline void deduce_reformat_layout(
std::unique_ptr<RelayoutFormat> & relayout,
const TensorLayout& src_layout, TensorLayout& dst_layout,
RelayoutFormat::Param::Mode mode, const int oc, const int group)
const;
void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
TensorLayout& inner_src, TensorLayout& inner_dst,
Handle* handle,
PoolingForwardImpl::Param::Format format) const;
};
#undef ALGO_LAYOUT_POOLING_IMPL
class PoolingForwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
public:
AlgoPack();
AlgoCUDNN algo_cudnn{DEF_NAME(cudnnForward)};
#if CUDNN_VERSION >= 6000
AlgoCUDNNMAXDETERMINISTIC algo_cudnn_max_deterministic{
DEF_NAME(cudnnForwardMaxDeterministic)};
#endif
AlgoCHWN4 algo_chwn4;
AlgoNCHW4 algo_nchw4;
AlgoNCHW32 algo_nchw32;
AlgoNHWC algo_nhwc;
AlgoNCHW64 algo_nchw64;
std::vector<AlgoBase*> all_algos;
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
class PoolingBackwardImpl::AlgoBase : public Algorithm {
public:
enum class AlgoType : uint32_t { CUDA_CUDNN };
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs {
HandleImpl* handle;
PoolingBackwardImpl* opr;
const TensorLayout *layout_src, *layout_dst, *layout_diff, *layout_grad;
std::string to_string() const;
SizeArgs(PoolingBackwardImpl* opr, const TensorLayout& src,
const TensorLayout& dst, const TensorLayout& diff,
const TensorLayout& grad);
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *dst_tensor, *diff_tensor, *grad_tensor;
Workspace workspace;
ExecArgs(PoolingBackwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in dst, _megdnn_tensor_in diff,
_megdnn_tensor_out grad, _megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
size_t get_workspace_in_bytes(const SizeArgs& args) const;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_attribute(
const SizeArgs& args,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && is_available(args);
}
protected:
~AlgoBase() = default;
virtual WorkspaceBundle get_workspace_bundle(void* ptr,
const SizeArgs& args) const;
};
class PoolingBackwardImpl::AlgoCUDNN final : public AlgoBase {
std::string m_algo_name;
bool m_is_reproducible;
public:
AlgoCUDNN(std::string name, bool is_reproducible)
: m_algo_name(name), m_is_reproducible(is_reproducible) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
auto ret = AlgoAttribute::DEFAULT;
if (m_is_reproducible) {
ret |= AlgoAttribute::REPRODUCIBLE;
}
return ret;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
std::string param() const override { return m_algo_name; }
};
class PoolingBackwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
public:
AlgoPack();
std::vector<AlgoCUDNN> algo_cudnn;
std::vector<AlgoBase*> all_algos;
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -6,275 +6,97 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/pooling/opr_impl.h"
#include "src/cuda/relayout_format/opr_impl.h"
#include "./algo.h"
#include "./pooling2d_qint.cuh"
#include "src/common/algo_chooser.h"
#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
namespace {
inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout,
const TensorLayout& src_layout,
TensorLayout& dst_layout,
RelayoutFormat::Param::Mode mode,
const int oc = 0, const int group = 1) {
if (src_layout.ndim > 0) {
RelayoutFormat::Param trans_param;
trans_param.mode = mode;
trans_param.oc = oc;
trans_param.group = group;
relayout->param() = trans_param;
relayout->deduce_layout(src_layout, dst_layout);
} else {
dst_layout = src_layout;
}
size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) {
AlgoBase::SizeArgs args(this, src, dst);
return get_algorithm(this, src, dst)->get_workspace_in_bytes(args);
}
void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
TensorLayout& inner_src, TensorLayout& inner_dst,
Handle* handle,
PoolingForwardImpl::Param::Format format) {
bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW;
if (is_nchw) {
auto relayout_opr = handle->create_operator<RelayoutFormat>();
deduce_reformat_layout(relayout_opr, src, inner_src,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
deduce_reformat_layout(relayout_opr, dst, inner_dst,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
} else {
megdnn_assert(0, "not support");
}
const char* PoolingForwardImpl::get_algorithm_set_name() const {
return "CUDA_POOLING_FORWARD";
}
} // namespace
void PoolingForwardImpl::setup_descs(const TensorLayout& src,
const TensorLayout& dst) {
src_desc.set(src, param().format);
dst_desc.set(dst, param().format);
pooling_desc.set(this->param());
std::vector<PoolingForwardImpl::Algorithm*>
PoolingForwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& dst) {
return megdnn::get_all_algorithms<PoolingForwardImpl>({this, src, dst});
}
WorkspaceBundle PoolingForwardImpl::get_workspace_bundle(
void* ptr, const TensorLayout& src, const TensorLayout& dst) const {
SmallVector<size_t> sizes;
TensorLayout fsrc = src;
TensorLayout fdst = dst;
bool is_nchw = param().format == Param::Format::NCHW;
if ((src.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
(dst.dtype.enumv() == DTypeEnum::QuantizedS4 ||
dst.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
is_nchw) {
get_inner_layout(src, dst, fsrc, fdst, handle(), param().format);
sizes.push_back(fsrc.span().dist_byte());
sizes.push_back(fdst.span().dist_byte());
} else {
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
sizes.push_back(layout.span().dist_byte());
}
};
get_workspace(fsrc);
get_workspace(fdst);
PoolingForwardImpl::Algorithm* PoolingForwardImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
AlgoBase::SizeArgs args(this, src, dst);
for (auto&& iter : sm_algo_pack.all_algos) {
if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
return iter;
}
}
return {ptr, std::move(sizes)};
megdnn_throw(
ssprintf("require algorithm with attribute(%s) and without "
"attribute(%s), but can't get suitable algo.\n",
Algorithm::attribute_str(positive_attr).c_str(),
Algorithm::attribute_str(negative_attr).c_str()));
return nullptr;
}
void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
_megdnn_workspace sworkspace) {
check_exec(ssrc.layout, sdst.layout, sworkspace.size);
TensorND src = ssrc;
TensorND dst = sdst;
Param::Format inner_format = param().format;
auto wsb =
get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(this->handle()), &wsb);
bool is_nchw = param().format == Param::Format::NCHW;
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst);
} else if ((ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
ssrc.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
(sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
sdst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
is_nchw) {
auto handle_ptr = handle();
get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout,
handle_ptr, param().format);
src.raw_ptr = wsb.get(0);
dst.raw_ptr = wsb.get(1);
auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64;
relayout_opr->param() = trans_param;
relayout_opr->exec(ssrc, src, {});
inner_format = Param::Format::NCHW64;
}
{
using Format = param::Pooling::Format;
if (param().format == Format::CHWN4) {
pooling2d::Param kern_param;
size_t c = src.layout[0], hi = src.layout[1], wi = src.layout[2],
n = src.layout[3], ho = dst.layout[1], wo = dst.layout[2];
c = c * 4;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_cdiv4hwn4(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW4) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 4;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_ncdiv4hw4(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW32) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 32;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_ncdiv32hw32(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW64 ||
inner_format == Format::NCHW64) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 64;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}
auto&& stream = cuda_stream(handle());
pooling2d::do_pooling2d_int4_ncdiv64hw64(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
stream, static_cast<uint32_t>(param().mode), uint_case,
zero_point);
if (sdst.layout.ndim == 4) {
auto relayout_opr = handle()->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW;
relayout_opr->param() = trans_param;
relayout_opr->exec(dst, sdst, {});
}
return;
} else if (param().format == Format::NHWC &&
(src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm ||
src.layout.dtype.enumv() == DTypeEnum::QuantizedS4)) {
megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(),
"src and dst dtype must equal");
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2],
c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2];
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}
auto&& stream = cuda_stream(handle());
pooling2d::do_pooling2d_int4_nhwc(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
stream, static_cast<uint32_t>(param().mode), uint_case,
zero_point);
return;
}
auto handle = cudnn_handle(this->handle());
setup_descs(src.layout, dst.layout);
dt_float32 alpha = 1.0f, beta = 0.0f;
cudnn_check(cudnnPoolingForward(handle, pooling_desc.desc, &alpha,
src_desc.desc, src.raw_ptr, &beta,
dst_desc.desc, dst.raw_ptr));
AlgoBase::ExecArgs args(this, ssrc, sdst, sworkspace);
auto algo = get_algorithm(this, ssrc.layout, sdst.layout);
algo->exec(args);
}
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(dst, sdst);
}
}
void PoolingBackwardImpl::setup_descs(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) {
src_desc.set(src);
dst_desc.set(dst);
diff_desc.set(diff);
grad_desc.set(grad);
pooling_desc.set(this->param());
const char* PoolingBackwardImpl::get_algorithm_set_name() const {
return "CUDA_POOLING_BACKWARD";
}
WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle(
void* ptr, const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) const {
SmallVector<size_t> sizes;
TensorLayout fsrc = src;
TensorLayout fdst = dst;
TensorLayout fdiff = diff;
TensorLayout fgrad = grad;
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
sizes.push_back(layout.span().dist_byte());
std::vector<PoolingBackwardImpl::Algorithm*>
PoolingBackwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) {
return megdnn::get_all_algorithms<PoolingBackwardImpl>(
{this, src, dst, diff, grad});
}
PoolingBackwardImpl::Algorithm* PoolingBackwardImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
AlgoBase::SizeArgs args(this, src, dst, diff, grad);
for (auto iter : sm_algo_pack.all_algos) {
if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
return iter;
}
};
get_workspace(fsrc);
get_workspace(fdst);
get_workspace(fdiff);
get_workspace(fgrad);
return {ptr, std::move(sizes)};
}
megdnn_throw(
ssprintf("require algorithm with attribute(%s) and without "
"attribute(%s), but can't get suitable algo.\n",
Algorithm::attribute_str(positive_attr).c_str(),
Algorithm::attribute_str(negative_attr).c_str()));
return nullptr;
}
void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst,
......@@ -283,34 +105,23 @@ void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst,
_megdnn_workspace sworkspace) {
check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout,
sworkspace.size);
auto handle = cudnn_handle(this->handle());
TensorND src = ssrc;
TensorND dst = sdst;
TensorND diff = sdiff;
TensorND grad = sgrad;
auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout,
sdst.layout, sdiff.layout, sgrad.layout);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(this->handle()), &wsb);
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(ssrc, src)
.src_to_comp_type(sdst, dst)
.src_to_comp_type(sdiff, diff)
.src_to_comp_type(sgrad, grad);
}
{
setup_descs(src.layout, dst.layout, diff.layout, grad.layout);
float alpha = 1.0f, beta = 0.0f;
cudnn_check(cudnnPoolingBackward(
handle, pooling_desc.desc, &alpha, dst_desc.desc, dst.raw_ptr,
diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, &beta,
grad_desc.desc, grad.raw_ptr));
}
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(grad, sgrad);
AlgoBase::ExecArgs args(this, ssrc, sdst, sdiff, sgrad, sworkspace);
auto algo = get_algorithm(this, ssrc.layout, sdst.layout, sdiff.layout,
sgrad.layout);
algo->exec(args);
}
}
size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) {
AlgoBase::SizeArgs args(this, src, dst, diff, grad);
return get_algorithm(this, src, dst, diff, grad)
->get_workspace_in_bytes(args);
}
} // namespace cuda
} // namespace megdnn
......
......@@ -23,16 +23,45 @@ public:
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) override {
return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes();
const TensorLayout& dst) override;
const char* get_algorithm_set_name() const override;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes,
positive_attr, negative_attr)
->info();
}
class AlgoBase;
class AlgoCUDNN;
#if CUDNN_VERSION >= 6000
class AlgoCUDNNMAXDETERMINISTIC;
#endif
class AlgoCHWN4;
class AlgoNCHW4;
class AlgoNCHW32;
class AlgoNHWC;
class AlgoNCHW64;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private:
TensorDesc src_desc, dst_desc;
PoolingDesc pooling_desc;
void setup_descs(const TensorLayout& src, const TensorLayout& dst);
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src,
const TensorLayout& dst) const;
static AlgoPack sm_algo_pack;
};
class PoolingBackwardImpl final : public PoolingBackward {
......@@ -44,23 +73,43 @@ public:
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) override {
return get_workspace_bundle(nullptr, src, dst, diff, grad)
.total_size_in_bytes();
const TensorLayout& grad) override;
const char* get_algorithm_set_name() const override;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return get_algorithm_heuristic(src, dst, diff, grad,
workspace_limit_in_bytes, positive_attr,
negative_attr)
->info();
}
class AlgoBase;
class AlgoCUDNN;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private:
TensorDesc src_desc, dst_desc, diff_desc, grad_desc;
PoolingDesc pooling_desc;
void setup_descs(const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad);
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) const;
static AlgoPack sm_algo_pack;
};
} // namespace cuda
} // namespace megdnn
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -57,6 +57,24 @@ class DefaultBatchConvBiasForwardAlgorithm final
const char* name() const override { return "DEFAULT"; }
};
class DefaultPoolingForwardAlgorithm final
: public megdnn::PoolingForward::Algorithm {
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
uint32_t type() const override { return 0; }
const char* name() const override { return "DEFAULT"; }
};
class DefaultPoolingBackwardAlgorithm final
: public megdnn::PoolingBackward::Algorithm {
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
uint32_t type() const override { return 0; }
const char* name() const override { return "DEFAULT"; }
};
} // namespace naive
} // namespace megdnn
......
......@@ -110,6 +110,9 @@ DefaultLocalShareBackwardFilterAlgorithm
DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo;
DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo;
DefaultPoolingForwardAlgorithm HandleImpl::m_default_pooling_fwd_algo;
DefaultPoolingBackwardAlgorithm HandleImpl::m_default_pooling_bwd_algo;
HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle,
HandleType type)
: HandleImplHelper(computing_handle, type),
......
......@@ -51,6 +51,9 @@ class HandleImpl : public HandleImplHelper {
static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo;
static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo;
static DefaultPoolingForwardAlgorithm m_default_pooling_fwd_algo;
static DefaultPoolingBackwardAlgorithm m_default_pooling_bwd_algo;
//! move KernFunc to alloc_kern()->func, destruct func, and call dispatch
template <typename T>
void move_kern_func_to_new_kern_and_dispatch(T& func) {
......@@ -122,6 +125,14 @@ public:
return &m_default_batched_matmul_fwd_algo;
}
PoolingForward::Algorithm* default_pooling_fwd_algo() {
return &m_default_pooling_fwd_algo;
}
PoolingBackward::Algorithm* default_pooling_bwd_algo() {
return &m_default_pooling_bwd_algo;
}
Relayout* relayout_opr() override {
return get_helper_opr<Relayout, 2>(this);
}
......
......@@ -582,6 +582,52 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
megdnn_assert_internal(0);
}
PoolingForward::Algorithm* PoolingForwardImpl::get_algorithm_from_desc(
const AlgorithmDesc& desc) {
Algorithm* ret =
static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo();
megdnn_assert(desc == ret->info().desc);
return ret;
}
std::vector<Algorithm*> PoolingForwardImpl::get_all_algorithms(
const TensorLayout&, const TensorLayout&) {
return {static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo()};
}
Algorithm* PoolingForwardImpl::get_algorithm_heuristic(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
auto algo = static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo();
algo->check_attribute(positive_attr, negative_attr);
return algo;
}
Algorithm* PoolingBackwardImpl::get_algorithm_from_desc(
const AlgorithmDesc& desc) {
Algorithm* ret =
static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo();
megdnn_assert(desc == ret->info().desc);
return ret;
}
std::vector<Algorithm*> PoolingBackwardImpl::get_all_algorithms(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
const TensorLayout& /*diff*/, const TensorLayout& /*grad*/) {
return {static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo()};
}
Algorithm* PoolingBackwardImpl::get_algorithm_heuristic(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
const TensorLayout& /*diff*/, const TensorLayout& /*grad*/,
size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
auto algo = static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo();
algo->check_attribute(positive_attr, negative_attr);
return algo;
}
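Because the naive handle registers a single algorithm for each pooling operator, the heuristic lookup reduces to an attribute check on that one candidate. A hypothetical caller-side sketch (handle and shapes are placeholders):

    // The naive backend always reports exactly one pooling algorithm.
    auto opr = handle->create_operator<megdnn::PoolingForward>();
    megdnn::TensorLayout src({1, 3, 8, 8}, megdnn::dtype::Float32());
    megdnn::TensorLayout dst({1, 3, 4, 4}, megdnn::dtype::Float32());
    auto infos = opr->get_all_algorithms_info(src, dst);
    // expected: infos.size() == 1 and the algorithm is named "DEFAULT"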
WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle(
void* ptr, const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) const {
......
......@@ -26,6 +26,21 @@ class PoolingForwardImpl: public PoolingForward {
private:
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
const TensorLayout&) const;
const char* get_algorithm_set_name() const override {
return "DEFALUT";
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
};
class PoolingBackwardImpl : public PoolingBackward {
......@@ -38,6 +53,20 @@ public:
const TensorLayout&,
const TensorLayout&) override;
const char* get_algorithm_set_name() const override { return "DEFAULT"; }
Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private:
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
const TensorLayout&,
......
......@@ -46,6 +46,12 @@ struct AlgoProxy;
} \
}
#define LAYOUTS layouts[0], layouts[1]
#define TENSORS tensors[0], tensors[1]
DEF_ALGO_PROXY(2);
#undef LAYOUTS
#undef TENSORS
#define LAYOUTS layouts[0], layouts[1], layouts[2]
#define TENSORS tensors[0], tensors[1], tensors[2]
DEF_ALGO_PROXY(3);
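The LAYOUTS/TENSORS pairs control how many array elements each DEF_ALGO_PROXY instantiation forwards to the operator; arity 2 covers the new PoolingForward (src, dst). Roughly, the generated proxy calls have this shape (a schematic illustration of the expansion, not the literal macro output):

    // arity-2 expansion, schematically:
    //   opr->get_all_algorithms_info(layouts[0], layouts[1]);
    //   opr->exec(tensors[0], tensors[1], workspace);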
......
......@@ -21,6 +21,13 @@
#include <cudnn.h>
#include "test/cuda/benchmark.h"
namespace {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_NAME(NAME) \
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
} // namespace
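V1/V are the usual two-step stringification trick: the outer V forces the cuDNN version macros to expand before the inner V1 stringizes them, and adjacent string literals then concatenate. A self-contained demonstration (the version numbers are stand-ins, not a statement about any real build):

    #include <cstdio>

    #define V1(v) #v
    #define V(v) V1(v)
    // Stand-in values, purely for illustration:
    #define CUDNN_MAJOR 7
    #define CUDNN_MINOR 6
    #define CUDNN_PATCHLEVEL 5
    #define DEF_NAME(NAME) \
        #NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

    int main() {
        std::puts(DEF_NAME(cudnnForward));  // prints: cudnnForwardv7.6.5
        return 0;
    }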
namespace megdnn {
namespace test {
......@@ -263,19 +270,24 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
checker.set_param(param).exec({{20, 24, 22, 33}, {}});
}
TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) {
require_compute_capability(7, 5);
using Param = param::Pooling;
Checker<Pooling> checker(handle_cuda());
Param param;
checker.set_dtype(0, dtype::QuantizedS8(0.1f));
param.format = Param::Format::NCHW4;
checker.set_epsilon(1 + 1e-3);
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
param.mode = Param::Mode::AVERAGE;
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
checker.set_before_exec_callback(
AlgoChecker<PoolingForward>(DEF_NAME(cudnnForward)));
for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) {
param.format = format;
param.mode = Param::Mode::MAX;
checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
param.mode = Param::Mode::AVERAGE;
checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}});
param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
}
}
#if CUDNN_VERSION >= 7500
......@@ -288,6 +300,8 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
auto i8_max = std::numeric_limits<int8_t>().max();
UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f));
checker.set_before_exec_callback(
AlgoChecker<PoolingForward>("CUDA_NCHW32"));
param.format = Param::Format::NCHW32;
checker.set_epsilon(1e-3).set_rng(0, &int_rng);
checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
......@@ -394,6 +408,7 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) {
UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f));
param.format = Param::Format::NCHW4;
checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW4"));
for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE,
Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
param.mode = mode;
......@@ -413,6 +428,8 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) {
auto i8_max = std::numeric_limits<int8_t>().max();
UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f));
checker.set_before_exec_callback(
AlgoChecker<PoolingForward>("CUDA_NCHW32"));
param.format = Param::Format::NCHW32;
for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE,
Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
......
......@@ -10,14 +10,52 @@
*/
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/graph/grad_impl.h"
#include "megbrain/opr/search_policy/algo_chooser.h"
#include "../internal/megdnn_opr_wrapper.inl"
#include "../search_policy/workspace_need_limit_getter.inl"
using namespace mgb;
using namespace opr;
MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward);
MEGDNN_OPR_INIT1(PoolingForward, "pooling")
PoolingForward::PoolingForward(VarNode* i0, const Param& param,
const ExecutionPolicy& policy,
const OperatorNodeConfig& config)
: Super(OperatorNodeBaseCtorParam{
i0->owner_graph(), config, "pooling", {i0}}) {
init_megdnn_opr(*this, param);
add_input({i0});
m_policy = policy;
intl::MegDNNOprInitPostCtor<PoolingForward>::apply(*this);
}
SymbolVar PoolingForward::make(SymbolVar i0, const Param& param,
const OperatorNodeConfig& config,
const ExecutionPolicy& policy) {
intl::MegDNNOprInitInputsModifier<PoolingForward>::apply(param, {&i0});
return i0.insert_single_output_opr<PoolingForward>(i0.node(), param, policy,
config);
}
void PoolingForward::init_output_static_infer_desc() {
Super::set_nr_managed_outputs(this->output().size() - 1);
Super::Super::init_output_static_infer_desc();
init_output_static_infer_desc_workspace(
intl::AutoAddWorkspaceNeedLimitGetter<megdnn::PoolingForward>::val);
}
size_t PoolingForward::get_workspace_size_bytes(
const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes) const {
return AlgoChooser<megdnn::PoolingForward>::setup_algo(
{TensorLayout{input_shapes[0], input(0)->dtype(),
input(0)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this, false);
}
#if MGB_ENABLE_GRAD
MGB_IMPL_OPR_GRAD(PoolingForward) {
......@@ -29,7 +67,41 @@ MGB_IMPL_OPR_GRAD(PoolingForward) {
#endif
MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward);
MEGDNN_OPR_INIT3(PoolingBackward, "pooling_bwd", 0, true);
PoolingBackward::PoolingBackward(VarNode* i0, VarNode* i1, VarNode* i2,
const Param& param,
const ExecutionPolicy& policy,
const OperatorNodeConfig& config)
: Super(
OperatorNodeBaseCtorParam{
i0->owner_graph(), config, "pooling_bwd", {i0}},
0, true) {
init_megdnn_opr(*this, param);
add_input({i0, i1, i2});
intl::MegDNNOprInitPostCtor<PoolingBackward>::apply(*this);
}
SymbolVar PoolingBackward::make(SymbolVar i0, SymbolVar i1, SymbolVar i2,
const Param& param,
const OperatorNodeConfig& config,
const ExecutionPolicy& policy) {
intl::MegDNNOprInitInputsModifier<PoolingBackward>::apply(param,
{&i0, &i1, &i2});
return i0.insert_single_output_opr<PoolingBackward>(
i0.node(), i1.node(), i2.node(), param, policy, config);
}
size_t PoolingBackward::get_workspace_size_bytes(
const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes) const {
return AlgoChooser<megdnn::PoolingBackward>::setup_algo(
{TensorLayout{input_shapes[0], input(0)->dtype(),
input(0)->format()},
{input_shapes[1], input(1)->dtype(), input(1)->format()},
{input_shapes[2], input(2)->dtype(), input(2)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this, false);
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -175,6 +175,8 @@ cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward);
cb(POOLING_FORWARD, PoolingForward);
cb(POOLING_BACKWARD, PoolingBackward);
#undef cb
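This is the X-macro pattern: one operator list drives several expansions (enum-to-class mappings, switch cases, template instantiations) and the expansions stay in sync by construction. A minimal self-contained sketch of the idea, with hypothetical names rather than the MegBrain macros:

    enum class OprType { POOLING_FORWARD, POOLING_BACKWARD };
    template <typename Opr> struct OprTypeTrait;

    struct PoolingFwd {};
    struct PoolingBwd {};

    #define FOREACH_POOLING_OPR(cb)     \
        cb(POOLING_FORWARD, PoolingFwd) \
        cb(POOLING_BACKWARD, PoolingBwd)

    // one expansion of the list: map each operator class to its enum value
    #define cb(_enum, _opr)                                 \
        template <> struct OprTypeTrait<_opr> {             \
            static constexpr OprType type = OprType::_enum; \
        };
    FOREACH_POOLING_OPR(cb)
    #undef cb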
......@@ -195,7 +197,9 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \
cb(BATCH_CONV_FORWARD, stmt) \
cb(CONVBIAS_FORWARD, stmt)
cb(CONVBIAS_FORWARD, stmt) \
cb(POOLING_FORWARD, stmt) \
cb(POOLING_BACKWARD, stmt)
// clang-format on
#define _OPR_TYPE_CASE(_opr_type, _stmt) \
......@@ -521,11 +525,14 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
mgb_assert(m_fastrun_layouts.size() == layouts.size());
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for "
"deformable conv)");
static_assert(
std::tuple_size<FixedTensorLayouts>::value == 2 ||
std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 4 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
"Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes "
"arity = 3 , 5 or 8 (for deformable conv)");
}
template <typename Opr>
......
......@@ -284,7 +284,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
mdn_workspace),
std::forward_as_tuple(layouts[0],
inp_val[1].as_megdnn()),
array_skip<2>(layouts));
array_skip<arity_in>(layouts));
});
}
});
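The hardcoded array_skip<2> assumed convolution-style operators with two leading input layouts; switching to arity_in lets the profiler separate inputs from outputs correctly for pooling as well (forward has a single input). A hypothetical minimal version of the helper, just to pin down its semantics:

    #include <algorithm>
    #include <array>
    #include <cstddef>

    // Drop the first N elements of a fixed-size array, keep the rest.
    template <size_t N, typename T, size_t S>
    std::array<T, S - N> array_skip(const std::array<T, S>& a) {
        static_assert(N <= S, "cannot skip more elements than present");
        std::array<T, S - N> ret{};
        std::copy(a.begin() + N, a.end(), ret.begin());
        return ret;
    }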
......
......@@ -12,34 +12,50 @@
#pragma once
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megdnn/oprs.h"
namespace mgb {
namespace opr {
MGB_DEFINE_OPR_CLASS(PoolingForward,
intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>) // {
public:
PoolingForward(VarNode *src, const Param &param,
const OperatorNodeConfig &config);
static SymbolVar make(SymbolVar src, const Param &param,
const OperatorNodeConfig &config = {});
intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>,
public mixin::AlgoChooserHelper) // {
public:
PoolingForward(VarNode * src, const Param& param,
const ExecutionPolicy& policy,
const OperatorNodeConfig& config);
static SymbolVar make(SymbolVar src, const Param& param,
const OperatorNodeConfig& config = {},
const ExecutionPolicy& policy = {});
void init_output_static_infer_desc() override;
size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes)
const override;
};
using Pooling = PoolingForward;
MGB_DEFINE_OPR_CLASS(PoolingBackward,
intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>) // {
public:
PoolingBackward(VarNode *src, VarNode *dst, VarNode *diff,
const Param &param, const OperatorNodeConfig &config);
static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff,
const Param &param,
const OperatorNodeConfig &config = {});
intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>,
public mixin::AlgoChooserHelper) // {
public:
PoolingBackward(VarNode * src, VarNode * dst, VarNode * diff,
const Param& param, const ExecutionPolicy& policy,
const OperatorNodeConfig& config);
static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff,
const Param& param,
const OperatorNodeConfig& config = {},
const ExecutionPolicy& policy = {});
size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes)
const override final;
};
} // namespace opr
} // namespace mgb
} // namespace opr
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -18,6 +18,7 @@
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/blas.h"
#include "megdnn/oprs/base.h"
......
......@@ -40,7 +40,9 @@ namespace opr {
cb(DeformableConvBackwardData) \
cb(BatchConvBiasForward) \
cb(MatrixMul) \
cb(BatchedMatrixMul)
cb(BatchedMatrixMul) \
cb(PoolingForward) \
cb(PoolingBackward)
// clang-format on
template <typename Opr>
......
......@@ -11,6 +11,9 @@
#include "./legacy_checker.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/utils/persistent_cache.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/basic_arith_wrapper.h"
using namespace std;
using namespace mgb;
......@@ -104,6 +107,58 @@ TEST(TestOprDNN, PoolingBackward)
}
}
TEST(TestOprDNN, PoolingExePolicy) {
using Param = opr::Pooling::Param;
Param param;
using Policy = opr::Pooling::ExecutionPolicy;
using S = Policy::Strategy;
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto orig_impl = PersistentCache::set_impl(
std::make_shared<InMemoryPersistentCache>());
HostTensorND host_y, host_y_copy;
S strategy = S::HEURISTIC | S::REPRODUCIBLE;
auto graph = ComputingGraph::make();
HostTensorGenerator<> gen;
TensorShape shape = {1, 20, 24, 24};
auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn));
param.mode = Param::Mode::MAX;
param.window_h = param.window_w = 2;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 0;
param.format = Param::Format::NCHW;
Policy policy;
policy.strategy = strategy;
auto pooling = opr::PoolingForward::make(input, param, {}, policy);
auto loss0 = opr::reduce_sum_sqr(pooling, pooling.make_scalar(1));
auto grad = cg::grad(loss0, input, true, false);
opr::PoolingBackward* found = nullptr;
auto cb = [&found](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::PoolingBackward>()) {
found = &opr->cast_final_safe<opr::PoolingBackward>();
}
};
cg::DepOprIter{cb}.add(grad.node()->owner_opr());
found->set_execution_policy(strategy);
auto func = graph->compile({make_callback_copy(grad, host_y)});
func->execute().wait();
mgb_assert(found->megdnn_opr()->execution_policy().algo.name.find(
"cudnnReproducible") != std::string::npos);
}
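The user-facing recipe this test exercises: OR the REPRODUCIBLE bit into the execution policy's strategy, and the algorithm chooser is restricted to deterministic candidates (on CUDA, the cuDNN-backed algorithm whose name contains "cudnnReproducible"). A minimal sketch, reusing input and param as set up in the test above:

    using S = opr::Pooling::ExecutionPolicy::Strategy;
    opr::Pooling::ExecutionPolicy policy;
    policy.strategy = S::HEURISTIC | S::REPRODUCIBLE;
    // deterministic max pooling
    auto y = opr::PoolingForward::make(input, param, {}, policy);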
} // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}