提交 1cfdbc56 编写于 作者: M Megvii Engine Team 提交者: huangxinda

feat(dnn): add deterministic max pooling

GitOrigin-RevId: 9ab4c7a74839b661fe5ed052a20a66b6732c09d0
上级 20ab82d0
...@@ -48,6 +48,8 @@ INST_ARITY(megdnn::ConvBias, 4, 1); ...@@ -48,6 +48,8 @@ INST_ARITY(megdnn::ConvBias, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);
INST_ARITY(megdnn::MatrixMul, 2, 1); INST_ARITY(megdnn::MatrixMul, 2, 1);
INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); INST_ARITY(megdnn::BatchedMatrixMul, 2, 1);
INST_ARITY(megdnn::PoolingForward, 1, 1);
INST_ARITY(megdnn::PoolingBackward, 3, 1);
#undef INST_ARITY #undef INST_ARITY
......
...@@ -259,6 +259,8 @@ public: ...@@ -259,6 +259,8 @@ public:
DEFORMABLE_CONV_BACKWARD_FILTER, DEFORMABLE_CONV_BACKWARD_FILTER,
CONVBIAS_FORWARD, CONVBIAS_FORWARD,
BATCH_CONV_FORWARD, BATCH_CONV_FORWARD,
POOLING_FORWARD,
POOLING_BACKWARD,
}; };
struct SearchItem { struct SearchItem {
...@@ -334,6 +336,63 @@ private: ...@@ -334,6 +336,63 @@ private:
ExecutionPolicy m_execution_policy; ExecutionPolicy m_execution_policy;
}; };
//! specialize for nargs == 2
template <class Opr>
class MultiAlgoOpr<Opr, 2> : public MultiAlgoOpr<Opr, -1> {
public:
using Algorithm = detail::Algorithm;
using AlgorithmInfo = detail::Algorithm::Info;
using AlgoAttribute = detail::Algorithm::Attribute;
//! get all possible algorithm decriptions for the specified layouts
std::vector<AlgorithmInfo> get_all_algorithms_info(const TensorLayout& p0,
const TensorLayout& p1) {
std::vector<AlgorithmInfo> ret;
for (auto&& algo : get_all_algorithms(p0, p1)) {
ret.emplace_back(algo->info());
}
return ret;
}
/**
* \brief Returns the best algorithm information which indicate the
* algorithm by heuristic.
*
* The selected algorithm should not use workspace more than
* \p workspace_limit_in_bytes.
*/
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& p0, const TensorLayout& p1,
size_t workspace_limit_in_bytes =
std::numeric_limits<size_t>::max(),
const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return get_algorithm_heuristic(p0, p1, workspace_limit_in_bytes,
positive_attr, negative_attr)
->info();
}
protected:
~MultiAlgoOpr() = default;
//! get all possible algorithms for the specified layouts
virtual std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& p0, const TensorLayout& p1) = 0;
/**
* \brief Returns the best algorithm by heuristic.
*
* The selected algorithm should not use workspace more than
* \p workspace_limit_in_bytes.
*/
virtual Algorithm* get_algorithm_heuristic(
const TensorLayout& p0, const TensorLayout& p1,
size_t workspace_limit_in_bytes =
std::numeric_limits<size_t>::max(),
const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) = 0;
};
//! specialize for nargs == 3 //! specialize for nargs == 3
template <class Opr> template <class Opr>
class MultiAlgoOpr<Opr, 3> : public MultiAlgoOpr<Opr, -1> { class MultiAlgoOpr<Opr, 3> : public MultiAlgoOpr<Opr, -1> {
......
...@@ -713,7 +713,8 @@ protected: ...@@ -713,7 +713,8 @@ protected:
void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
}; };
class PoolingForward : public PoolingBase { class PoolingForward : public PoolingBase,
public detail::MultiAlgoOpr<PoolingForward, 2> {
DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1); DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1);
public: public:
...@@ -734,7 +735,8 @@ protected: ...@@ -734,7 +735,8 @@ protected:
using Pooling = PoolingForward; using Pooling = PoolingForward;
class PoolingBackward : public PoolingBase { class PoolingBackward : public PoolingBase,
public detail::MultiAlgoOpr<PoolingBackward, 4> {
DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1); DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1);
public: public:
......
...@@ -69,7 +69,7 @@ std::vector<typename Opr::Algorithm*> get_all_algorithms( ...@@ -69,7 +69,7 @@ std::vector<typename Opr::Algorithm*> get_all_algorithms(
ret.push_back(i); ret.push_back(i);
} }
} }
megdnn_assert(!ret.empty(), "no conv algorithm for %s", megdnn_assert(!ret.empty(), "no algorithm for %s",
args.to_string().c_str()); args.to_string().c_str());
return ret; return ret;
} }
......
...@@ -294,32 +294,6 @@ void ConvDesc::set(DType data_type, const param::Convolution& param, ...@@ -294,32 +294,6 @@ void ConvDesc::set(DType data_type, const param::Convolution& param,
#endif #endif
} }
PoolingDesc::PoolingDesc() {
cudnn_check(cudnnCreatePoolingDescriptor(&desc));
}
PoolingDesc::~PoolingDesc() {
cudnn_check(cudnnDestroyPoolingDescriptor(desc));
}
void PoolingDesc::set(const param::Pooling& param) {
cudnnPoolingMode_t mode;
switch (param.mode) {
case param::Pooling::Mode::MAX:
mode = CUDNN_POOLING_MAX;
break;
case param::Pooling::Mode::AVERAGE:
mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
}
cudnn_check(cudnnSetPooling2dDescriptor(
desc, mode, CUDNN_NOT_PROPAGATE_NAN, param.window_h, param.window_w,
param.pad_h, param.pad_w, param.stride_h, param.stride_w));
}
LRNDesc::LRNDesc() { LRNDesc::LRNDesc() {
cudnn_check(cudnnCreateLRNDescriptor(&desc)); cudnn_check(cudnnCreateLRNDescriptor(&desc));
} }
......
...@@ -54,14 +54,6 @@ class ConvDesc { ...@@ -54,14 +54,6 @@ class ConvDesc {
cudnnConvolutionDescriptor_t desc; cudnnConvolutionDescriptor_t desc;
}; };
class PoolingDesc {
public:
PoolingDesc();
void set(const param::Pooling &param);
~PoolingDesc();
cudnnPoolingDescriptor_t desc;
};
class LRNDesc { class LRNDesc {
public: public:
LRNDesc(); LRNDesc();
......
此差异已折叠。
/**
* \file dnn/src/cuda/pooling/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <unordered_map>
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/pooling/opr_impl.h"
namespace megdnn {
namespace cuda {
namespace {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_NAME(NAME) \
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
} // namespace
class PoolingForwardImpl::AlgoBase : public Algorithm {
public:
enum class AlgoType : uint32_t {
CUDA_CUDNN,
#if CUDNN_VERSION >= 6000
CUDA_CUDNN_MAXDETERMINISTIC,
#endif
CUDA_CHWN4,
CUDA_NCHW4,
CUDA_NCHW32,
CUDA_NHWC,
CUDA_NCHW64
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs {
HandleImpl* handle;
PoolingForwardImpl* opr;
const TensorLayout *layout_src, *layout_dst;
std::string to_string() const;
SizeArgs(PoolingForwardImpl* opr, const TensorLayout& src,
const TensorLayout& dst);
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *dst_tensor;
Workspace workspace;
ExecArgs(PoolingForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_out dst, _megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
size_t get_workspace_in_bytes(const SizeArgs& args) const;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_attribute(
const SizeArgs& args,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && is_available(args);
}
protected:
~AlgoBase() = default;
virtual WorkspaceBundle get_workspace_bundle(void* ptr,
const SizeArgs& args) const;
};
class PoolingForwardImpl::AlgoCUDNN final : public AlgoBase {
std::string m_algo_name;
public:
AlgoCUDNN(std::string name) : m_algo_name(name) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
std::string param() const override { return m_algo_name; }
};
#if CUDNN_VERSION >= 6000
class PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC final : public AlgoBase {
std::string m_algo_name;
public:
AlgoCUDNNMAXDETERMINISTIC(std::string name) : m_algo_name(name) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_MAXDETERMINISTIC)
std::string param() const override { return m_algo_name; }
};
#endif
#define ALGO_LAYOUT_POOLING_IMPL(_layout) \
class PoolingForwardImpl::Algo##_layout final : public AlgoBase { \
std::string m_algo_name; \
\
public: \
Algo##_layout( \
std::string name = std::string("CUDA_").append(#_layout)) \
: m_algo_name(name) {} \
bool is_available(const SizeArgs& args) const override; \
void exec(const ExecArgs& args) const override; \
const char* name() const override { return m_algo_name.c_str(); } \
AlgoAttribute attribute() const override { \
return AlgoAttribute::REPRODUCIBLE; \
} \
MEGDNN_DECL_ALGO_TYPE(CUDA_##_layout)
ALGO_LAYOUT_POOLING_IMPL(CHWN4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW4)};
ALGO_LAYOUT_POOLING_IMPL(NCHW32)};
ALGO_LAYOUT_POOLING_IMPL(NHWC)};
ALGO_LAYOUT_POOLING_IMPL(NCHW64) //{
protected:
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args)
const override;
private:
inline void deduce_reformat_layout(
std::unique_ptr<RelayoutFormat> & relayout,
const TensorLayout& src_layout, TensorLayout& dst_layout,
RelayoutFormat::Param::Mode mode, const int oc, const int group)
const;
void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
TensorLayout& inner_src, TensorLayout& inner_dst,
Handle* handle,
PoolingForwardImpl::Param::Format format) const;
};
#undef ALGO_LAYOUT_POOLING_IMPL
class PoolingForwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
public:
AlgoPack();
AlgoCUDNN algo_cudnn{DEF_NAME(cudnnForward)};
#if CUDNN_VERSION >= 6000
AlgoCUDNNMAXDETERMINISTIC algo_cudnn_max_deterministic{
DEF_NAME(cudnnForwardMaxDeterministic)};
#endif
AlgoCHWN4 algo_chwn4;
AlgoNCHW4 algo_nchw4;
AlgoNCHW32 algo_nchw32;
AlgoNHWC algo_nhwc;
AlgoNCHW64 algo_nchw64;
std::vector<AlgoBase*> all_algos;
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
class PoolingBackwardImpl::AlgoBase : public Algorithm {
public:
enum class AlgoType : uint32_t { CUDA_CUDNN };
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
struct SizeArgs {
HandleImpl* handle;
PoolingBackwardImpl* opr;
const TensorLayout *layout_src, *layout_dst, *layout_diff, *layout_grad;
std::string to_string() const;
SizeArgs(PoolingBackwardImpl* opr, const TensorLayout& src,
const TensorLayout& dst, const TensorLayout& diff,
const TensorLayout& grad);
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *dst_tensor, *diff_tensor, *grad_tensor;
Workspace workspace;
ExecArgs(PoolingBackwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in dst, _megdnn_tensor_in diff,
_megdnn_tensor_out grad, _megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
size_t get_workspace_in_bytes(const SizeArgs& args) const;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_attribute(
const SizeArgs& args,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && is_available(args);
}
protected:
~AlgoBase() = default;
virtual WorkspaceBundle get_workspace_bundle(void* ptr,
const SizeArgs& args) const;
};
class PoolingBackwardImpl::AlgoCUDNN final : public AlgoBase {
std::string m_algo_name;
bool m_is_reproducible;
public:
AlgoCUDNN(std::string name, bool is_reproducible)
: m_algo_name(name), m_is_reproducible(is_reproducible) {}
bool is_available(const SizeArgs& args) const override;
void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_algo_name.c_str(); }
AlgoAttribute attribute() const override {
auto ret = AlgoAttribute::DEFAULT;
if (m_is_reproducible) {
ret |= AlgoAttribute::REPRODUCIBLE;
}
return ret;
}
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
std::string param() const override { return m_algo_name; }
};
class PoolingBackwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
public:
AlgoPack();
std::vector<AlgoCUDNN> algo_cudnn;
std::vector<AlgoBase*> all_algos;
const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -6,275 +6,97 @@ ...@@ -6,275 +6,97 @@
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/ */
#include "src/cuda/pooling/opr_impl.h" #include "src/cuda/pooling/opr_impl.h"
#include "src/cuda/relayout_format/opr_impl.h" #include "./algo.h"
#include "./pooling2d_qint.cuh" #include "./pooling2d_qint.cuh"
#include "src/common/algo_chooser.h"
#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/utils.h" #include "src/cuda/utils.h"
namespace megdnn { namespace megdnn {
namespace cuda { namespace cuda {
namespace { size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout, const TensorLayout& dst) {
const TensorLayout& src_layout, AlgoBase::SizeArgs args(this, src, dst);
TensorLayout& dst_layout, return get_algorithm(this, src, dst)->get_workspace_in_bytes(args);
RelayoutFormat::Param::Mode mode,
const int oc = 0, const int group = 1) {
if (src_layout.ndim > 0) {
RelayoutFormat::Param trans_param;
trans_param.mode = mode;
trans_param.oc = oc;
trans_param.group = group;
relayout->param() = trans_param;
relayout->deduce_layout(src_layout, dst_layout);
} else {
dst_layout = src_layout;
}
} }
void get_inner_layout(const TensorLayout& src, const TensorLayout& dst, const char* PoolingForwardImpl::get_algorithm_set_name() const {
TensorLayout& inner_src, TensorLayout& inner_dst, return "CUDA_POOLING_FORWARD";
Handle* handle,
PoolingForwardImpl::Param::Format format) {
bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW;
if (is_nchw) {
auto relayout_opr = handle->create_operator<RelayoutFormat>();
deduce_reformat_layout(relayout_opr, src, inner_src,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
deduce_reformat_layout(relayout_opr, dst, inner_dst,
RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
} else {
megdnn_assert(0, "not support");
}
} }
} // namespace std::vector<PoolingForwardImpl::Algorithm*>
void PoolingForwardImpl::setup_descs(const TensorLayout& src, PoolingForwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& dst) { const TensorLayout& dst) {
src_desc.set(src, param().format); return megdnn::get_all_algorithms<PoolingForwardImpl>({this, src, dst});
dst_desc.set(dst, param().format);
pooling_desc.set(this->param());
} }
WorkspaceBundle PoolingForwardImpl::get_workspace_bundle( PoolingForwardImpl::Algorithm* PoolingForwardImpl::get_algorithm_heuristic(
void* ptr, const TensorLayout& src, const TensorLayout& dst) const { const TensorLayout& src, const TensorLayout& dst,
SmallVector<size_t> sizes; size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
TensorLayout fsrc = src; const AlgoAttribute& negative_attr) {
TensorLayout fdst = dst; MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
bool is_nchw = param().format == Param::Format::NCHW;
if ((src.dtype.enumv() == DTypeEnum::QuantizedS4 || AlgoBase::SizeArgs args(this, src, dst);
src.dtype.enumv() == DTypeEnum::Quantized4Asymm) && for (auto&& iter : sm_algo_pack.all_algos) {
(dst.dtype.enumv() == DTypeEnum::QuantizedS4 || if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
dst.dtype.enumv() == DTypeEnum::Quantized4Asymm) && return iter;
is_nchw) { }
get_inner_layout(src, dst, fsrc, fdst, handle(), param().format);
sizes.push_back(fsrc.span().dist_byte());
sizes.push_back(fdst.span().dist_byte());
} else {
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
sizes.push_back(layout.span().dist_byte());
}
};
get_workspace(fsrc);
get_workspace(fdst);
} }
return {ptr, std::move(sizes)}; megdnn_throw(
ssprintf("require algorithm with attribute(%s) and without "
"attribute(%s), but can't get suitable algo.\n",
Algorithm::attribute_str(positive_attr).c_str(),
Algorithm::attribute_str(negative_attr).c_str()));
return nullptr;
} }
void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst, void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
_megdnn_workspace sworkspace) { _megdnn_workspace sworkspace) {
check_exec(ssrc.layout, sdst.layout, sworkspace.size); check_exec(ssrc.layout, sdst.layout, sworkspace.size);
TensorND src = ssrc;
TensorND dst = sdst;
Param::Format inner_format = param().format;
auto wsb =
get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(this->handle()), &wsb);
bool is_nchw = param().format == Param::Format::NCHW;
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst);
} else if ((ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
ssrc.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
(sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
sdst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) &&
is_nchw) {
auto handle_ptr = handle();
get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout,
handle_ptr, param().format);
src.raw_ptr = wsb.get(0);
dst.raw_ptr = wsb.get(1);
auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64;
relayout_opr->param() = trans_param;
relayout_opr->exec(ssrc, src, {});
inner_format = Param::Format::NCHW64;
}
{ {
using Format = param::Pooling::Format; AlgoBase::ExecArgs args(this, ssrc, sdst, sworkspace);
if (param().format == Format::CHWN4) { auto algo = get_algorithm(this, ssrc.layout, sdst.layout);
pooling2d::Param kern_param; algo->exec(args);
size_t c = src.layout[0], hi = src.layout[1], wi = src.layout[2],
n = src.layout[3], ho = dst.layout[1], wo = dst.layout[2];
c = c * 4;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_cdiv4hwn4(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW4) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 4;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_ncdiv4hw4(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW32) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 32;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
auto&& stream = cuda_stream(handle());
return pooling2d::do_pooling2d_int8_ncdiv32hw32(
src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
kern_param, stream, static_cast<uint32_t>(param().mode));
} else if (param().format == Format::NCHW64 ||
inner_format == Format::NCHW64) {
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
c = c * 64;
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}
auto&& stream = cuda_stream(handle());
pooling2d::do_pooling2d_int4_ncdiv64hw64(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
stream, static_cast<uint32_t>(param().mode), uint_case,
zero_point);
if (sdst.layout.ndim == 4) {
auto relayout_opr = handle()->create_operator<RelayoutFormat>();
RelayoutFormat::Param trans_param;
trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW;
relayout_opr->param() = trans_param;
relayout_opr->exec(dst, sdst, {});
}
return;
} else if (param().format == Format::NHWC &&
(src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm ||
src.layout.dtype.enumv() == DTypeEnum::QuantizedS4)) {
megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(),
"src and dst dtype must equal");
pooling2d::Param kern_param;
size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2],
c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2];
size_t ph = param().pad_h, pw = param().pad_w;
size_t window_h = param().window_h, window_w = param().window_w;
size_t sh = param().stride_h, sw = param().stride_w;
kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
kern_param.ph = ph, kern_param.pw = pw,
kern_param.window_h = window_h, kern_param.window_w = window_w,
kern_param.sh = sh, kern_param.sw = sw;
bool uint_case = false;
int zero_point = 0;
if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint_case = true;
zero_point = src.layout.dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}
auto&& stream = cuda_stream(handle());
pooling2d::do_pooling2d_int4_nhwc(
(int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
stream, static_cast<uint32_t>(param().mode), uint_case,
zero_point);
return;
}
auto handle = cudnn_handle(this->handle());
setup_descs(src.layout, dst.layout);
dt_float32 alpha = 1.0f, beta = 0.0f;
cudnn_check(cudnnPoolingForward(handle, pooling_desc.desc, &alpha,
src_desc.desc, src.raw_ptr, &beta,
dst_desc.desc, dst.raw_ptr));
} }
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(dst, sdst);
}
} }
void PoolingBackwardImpl::setup_descs(const TensorLayout& src, const char* PoolingBackwardImpl::get_algorithm_set_name() const {
const TensorLayout& dst, return "CUDA_POOLING_BACKWARD";
const TensorLayout& diff,
const TensorLayout& grad) {
src_desc.set(src);
dst_desc.set(dst);
diff_desc.set(diff);
grad_desc.set(grad);
pooling_desc.set(this->param());
} }
WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( std::vector<PoolingBackwardImpl::Algorithm*>
void* ptr, const TensorLayout& src, const TensorLayout& dst, PoolingBackwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& diff, const TensorLayout& grad) const { const TensorLayout& dst,
SmallVector<size_t> sizes; const TensorLayout& diff,
TensorLayout fsrc = src; const TensorLayout& grad) {
TensorLayout fdst = dst; return megdnn::get_all_algorithms<PoolingBackwardImpl>(
TensorLayout fdiff = diff; {this, src, dst, diff, grad});
TensorLayout fgrad = grad; }
auto get_workspace = [&sizes](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) { PoolingBackwardImpl::Algorithm* PoolingBackwardImpl::get_algorithm_heuristic(
layout.dtype = dtype::Float32(); const TensorLayout& src, const TensorLayout& dst,
sizes.push_back(layout.span().dist_byte()); const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
AlgoBase::SizeArgs args(this, src, dst, diff, grad);
for (auto iter : sm_algo_pack.all_algos) {
if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
return iter;
} }
}; }
get_workspace(fsrc); megdnn_throw(
get_workspace(fdst); ssprintf("require algorithm with attribute(%s) and without "
get_workspace(fdiff); "attribute(%s), but can't get suitable algo.\n",
get_workspace(fgrad); Algorithm::attribute_str(positive_attr).c_str(),
return {ptr, std::move(sizes)}; Algorithm::attribute_str(negative_attr).c_str()));
return nullptr;
} }
void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst,
...@@ -283,34 +105,23 @@ void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, ...@@ -283,34 +105,23 @@ void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst,
_megdnn_workspace sworkspace) { _megdnn_workspace sworkspace) {
check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout, check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout,
sworkspace.size); sworkspace.size);
auto handle = cudnn_handle(this->handle());
TensorND src = ssrc;
TensorND dst = sdst;
TensorND diff = sdiff;
TensorND grad = sgrad;
auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout,
sdst.layout, sdiff.layout, sgrad.layout);
auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
concrete_handle(this->handle()), &wsb);
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.src_to_comp_type(ssrc, src)
.src_to_comp_type(sdst, dst)
.src_to_comp_type(sdiff, diff)
.src_to_comp_type(sgrad, grad);
}
{ {
setup_descs(src.layout, dst.layout, diff.layout, grad.layout); AlgoBase::ExecArgs args(this, ssrc, sdst, sdiff, sgrad, sworkspace);
float alpha = 1.0f, beta = 0.0f; auto algo = get_algorithm(this, ssrc.layout, sdst.layout, sdiff.layout,
cudnn_check(cudnnPoolingBackward( sgrad.layout);
handle, pooling_desc.desc, &alpha, dst_desc.desc, dst.raw_ptr, algo->exec(args);
diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, &beta,
grad_desc.desc, grad.raw_ptr));
}
if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
ctypecvt.comp_to_dst_type(grad, sgrad);
} }
} }
size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) {
AlgoBase::SizeArgs args(this, src, dst, diff, grad);
return get_algorithm(this, src, dst, diff, grad)
->get_workspace_in_bytes(args);
}
} // namespace cuda } // namespace cuda
} // namespace megdnn } // namespace megdnn
......
...@@ -23,16 +23,45 @@ public: ...@@ -23,16 +23,45 @@ public:
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override; _megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& src, size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) override { const TensorLayout& dst) override;
return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes();
const char* get_algorithm_set_name() const override;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes,
positive_attr, negative_attr)
->info();
} }
class AlgoBase;
class AlgoCUDNN;
#if CUDNN_VERSION >= 6000
class AlgoCUDNNMAXDETERMINISTIC;
#endif
class AlgoCHWN4;
class AlgoNCHW4;
class AlgoNCHW32;
class AlgoNHWC;
class AlgoNCHW64;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private: private:
TensorDesc src_desc, dst_desc; static AlgoPack sm_algo_pack;
PoolingDesc pooling_desc;
void setup_descs(const TensorLayout& src, const TensorLayout& dst);
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src,
const TensorLayout& dst) const;
}; };
class PoolingBackwardImpl final : public PoolingBackward { class PoolingBackwardImpl final : public PoolingBackward {
...@@ -44,23 +73,43 @@ public: ...@@ -44,23 +73,43 @@ public:
size_t get_workspace_in_bytes(const TensorLayout& src, size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& diff,
const TensorLayout& grad) override { const TensorLayout& grad) override;
return get_workspace_bundle(nullptr, src, dst, diff, grad)
.total_size_in_bytes(); const char* get_algorithm_set_name() const override;
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return get_algorithm_heuristic(src, dst, diff, grad,
workspace_limit_in_bytes, positive_attr,
negative_attr)
->info();
} }
class AlgoBase;
class AlgoCUDNN;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private: private:
TensorDesc src_desc, dst_desc, diff_desc, grad_desc; static AlgoPack sm_algo_pack;
PoolingDesc pooling_desc;
void setup_descs(const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad);
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) const;
}; };
} // namespace cuda } // namespace cuda
} // namespace megdnn } // namespace megdnn
// vim: syntax=cpp.doxygen // vim: syntax=cpp.doxygen
...@@ -57,6 +57,24 @@ class DefaultBatchConvBiasForwardAlgorithm final ...@@ -57,6 +57,24 @@ class DefaultBatchConvBiasForwardAlgorithm final
const char* name() const override { return "DEFAULT"; } const char* name() const override { return "DEFAULT"; }
}; };
class DefaultPoolingForwardAlgorithm final
: public megdnn::PoolingForward::Algorithm {
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
uint32_t type() const override { return 0; }
const char* name() const override { return "DEFAULT"; }
};
class DefaultPoolingBackwardAlgorithm final
: public megdnn::PoolingBackward::Algorithm {
AlgoAttribute attribute() const override {
return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
}
uint32_t type() const override { return 0; }
const char* name() const override { return "DEFAULT"; }
};
} // namespace naive } // namespace naive
} // namespace megdnn } // namespace megdnn
......
...@@ -110,6 +110,9 @@ DefaultLocalShareBackwardFilterAlgorithm ...@@ -110,6 +110,9 @@ DefaultLocalShareBackwardFilterAlgorithm
DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo; DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo;
DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo; DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo;
DefaultPoolingForwardAlgorithm HandleImpl::m_default_pooling_fwd_algo;
DefaultPoolingBackwardAlgorithm HandleImpl::m_default_pooling_bwd_algo;
HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle, HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle,
HandleType type) HandleType type)
: HandleImplHelper(computing_handle, type), : HandleImplHelper(computing_handle, type),
......
...@@ -51,6 +51,9 @@ class HandleImpl : public HandleImplHelper { ...@@ -51,6 +51,9 @@ class HandleImpl : public HandleImplHelper {
static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo; static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo;
static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo; static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo;
static DefaultPoolingForwardAlgorithm m_default_pooling_fwd_algo;
static DefaultPoolingBackwardAlgorithm m_default_pooling_bwd_algo;
//! move KernFunc to alloc_kern()->func, destruct func, and call dispatch //! move KernFunc to alloc_kern()->func, destruct func, and call dispatch
template <typename T> template <typename T>
void move_kern_func_to_new_kern_and_dispatch(T& func) { void move_kern_func_to_new_kern_and_dispatch(T& func) {
...@@ -122,6 +125,14 @@ public: ...@@ -122,6 +125,14 @@ public:
return &m_default_batched_matmul_fwd_algo; return &m_default_batched_matmul_fwd_algo;
} }
PoolingForward::Algorithm* default_pooling_fwd_algo() {
return &m_default_pooling_fwd_algo;
}
PoolingBackward::Algorithm* default_pooling_bwd_algo() {
return &m_default_pooling_bwd_algo;
}
Relayout* relayout_opr() override { Relayout* relayout_opr() override {
return get_helper_opr<Relayout, 2>(this); return get_helper_opr<Relayout, 2>(this);
} }
......
...@@ -582,6 +582,52 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, ...@@ -582,6 +582,52 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
megdnn_assert_internal(0); megdnn_assert_internal(0);
} }
PoolingForward::Algorithm* PoolingForwardImpl::get_algorithm_from_desc(
const AlgorithmDesc& desc) {
Algorithm* ret =
static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo();
megdnn_assert(desc == ret->info().desc);
return ret;
}
std::vector<Algorithm*> PoolingForwardImpl::get_all_algorithms(
const TensorLayout&, const TensorLayout&) {
return {static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo()};
}
Algorithm* PoolingForwardImpl::get_algorithm_heuristic(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
auto algo = static_cast<HandleImpl*>(handle())->default_pooling_fwd_algo();
algo->check_attribute(positive_attr, negative_attr);
return algo;
}
Algorithm* PoolingBackwardImpl::get_algorithm_from_desc(
const AlgorithmDesc& desc) {
Algorithm* ret =
static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo();
megdnn_assert(desc == ret->info().desc);
return ret;
}
std::vector<Algorithm*> PoolingBackwardImpl::get_all_algorithms(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
const TensorLayout& /*diff*/, const TensorLayout& /*grad*/) {
return {static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo()};
}
Algorithm* PoolingBackwardImpl::get_algorithm_heuristic(
const TensorLayout& /*src*/, const TensorLayout& /*dst*/,
const TensorLayout& /*diff*/, const TensorLayout& /*grad*/,
size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
auto algo = static_cast<HandleImpl*>(handle())->default_pooling_bwd_algo();
algo->check_attribute(positive_attr, negative_attr);
return algo;
}
WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle(
void* ptr, const TensorLayout& src, const TensorLayout& dst, void* ptr, const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) const { const TensorLayout& diff, const TensorLayout& grad) const {
......
...@@ -26,6 +26,21 @@ class PoolingForwardImpl: public PoolingForward { ...@@ -26,6 +26,21 @@ class PoolingForwardImpl: public PoolingForward {
private: private:
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
const TensorLayout&) const; const TensorLayout&) const;
const char* get_algorithm_set_name() const override {
return "DEFALUT";
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes,
const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
}; };
class PoolingBackwardImpl : public PoolingBackward { class PoolingBackwardImpl : public PoolingBackward {
...@@ -38,6 +53,20 @@ public: ...@@ -38,6 +53,20 @@ public:
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override; const TensorLayout&) override;
const char* get_algorithm_set_name() const override { return "DEFALUT"; }
Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
private: private:
WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
const TensorLayout&, const TensorLayout&,
......
...@@ -46,6 +46,12 @@ struct AlgoProxy; ...@@ -46,6 +46,12 @@ struct AlgoProxy;
} \ } \
} }
#define LAYOUTS layouts[0], layouts[1]
#define TENSORS tensors[0], tensors[1]
DEF_ALGO_PROXY(2);
#undef LAYOUTS
#undef TENSORS
#define LAYOUTS layouts[0], layouts[1], layouts[2] #define LAYOUTS layouts[0], layouts[1], layouts[2]
#define TENSORS tensors[0], tensors[1], tensors[2] #define TENSORS tensors[0], tensors[1], tensors[2]
DEF_ALGO_PROXY(3); DEF_ALGO_PROXY(3);
......
...@@ -21,6 +21,13 @@ ...@@ -21,6 +21,13 @@
#include <cudnn.h> #include <cudnn.h>
#include "test/cuda/benchmark.h" #include "test/cuda/benchmark.h"
namespace {
#define V1(v) #v
#define V(v) V1(v)
#define DEF_NAME(NAME) \
#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
} // namespace
namespace megdnn { namespace megdnn {
namespace test { namespace test {
...@@ -263,19 +270,24 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { ...@@ -263,19 +270,24 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
checker.set_param(param).exec({{20, 24, 22, 33}, {}}); checker.set_param(param).exec({{20, 24, 22, 33}, {}});
} }
TEST_F(CUDA, POOLING_FORWARD_NCHW4) { TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) {
require_compute_capability(7, 5); require_compute_capability(7, 5);
using Param = param::Pooling; using Param = param::Pooling;
Checker<Pooling> checker(handle_cuda()); Checker<Pooling> checker(handle_cuda());
Param param; Param param;
checker.set_dtype(0, dtype::QuantizedS8(0.1f)); checker.set_dtype(0, dtype::QuantizedS8(0.1f));
param.format = Param::Format::NCHW4;
checker.set_epsilon(1 + 1e-3); checker.set_epsilon(1 + 1e-3);
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); checker.set_before_exec_callback(
param.mode = Param::Mode::AVERAGE; AlgoChecker<PoolingForward>(DEF_NAME(cudnnForward)));
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) {
param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; param.format = format;
checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); param.mode = Param::Mode::MAX;
checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
param.mode = Param::Mode::AVERAGE;
checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}});
param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}});
}
} }
#if CUDNN_VERSION >= 7500 #if CUDNN_VERSION >= 7500
...@@ -288,6 +300,8 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) { ...@@ -288,6 +300,8 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
auto i8_max = std::numeric_limits<int8_t>().max(); auto i8_max = std::numeric_limits<int8_t>().max();
UniformIntRNG int_rng{i8_min, i8_max}; UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f)); checker.set_dtype(0, dtype::QuantizedS8(0.1f));
checker.set_before_exec_callback(
AlgoChecker<PoolingForward>("CUDA_NCHW32"));
param.format = Param::Format::NCHW32; param.format = Param::Format::NCHW32;
checker.set_epsilon(1e-3).set_rng(0, &int_rng); checker.set_epsilon(1e-3).set_rng(0, &int_rng);
checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}}); checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
...@@ -394,6 +408,7 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) { ...@@ -394,6 +408,7 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) {
UniformIntRNG int_rng{i8_min, i8_max}; UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f)); checker.set_dtype(0, dtype::QuantizedS8(0.1f));
param.format = Param::Format::NCHW4; param.format = Param::Format::NCHW4;
checker.set_before_exec_callback(AlgoChecker<PoolingForward>("CUDA_NCHW4"));
for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE,
Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
param.mode = mode; param.mode = mode;
...@@ -413,6 +428,8 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) { ...@@ -413,6 +428,8 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) {
auto i8_max = std::numeric_limits<int8_t>().max(); auto i8_max = std::numeric_limits<int8_t>().max();
UniformIntRNG int_rng{i8_min, i8_max}; UniformIntRNG int_rng{i8_min, i8_max};
checker.set_dtype(0, dtype::QuantizedS8(0.1f)); checker.set_dtype(0, dtype::QuantizedS8(0.1f));
checker.set_before_exec_callback(
AlgoChecker<PoolingForward>("CUDA_NCHW32"));
param.format = Param::Format::NCHW32; param.format = Param::Format::NCHW32;
for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE,
Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) {
......
...@@ -10,14 +10,52 @@ ...@@ -10,14 +10,52 @@
*/ */
#include "megbrain/opr/dnn/pooling.h" #include "megbrain/opr/dnn/pooling.h"
#include "megbrain/graph/grad_impl.h" #include "megbrain/graph/grad_impl.h"
#include "megbrain/opr/search_policy/algo_chooser.h"
#include "../internal/megdnn_opr_wrapper.inl" #include "../internal/megdnn_opr_wrapper.inl"
#include "../search_policy/workspace_need_limit_getter.inl"
using namespace mgb; using namespace mgb;
using namespace opr; using namespace opr;
MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward); MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward);
MEGDNN_OPR_INIT1(PoolingForward, "pooling")
PoolingForward::PoolingForward(VarNode* i0, const Param& param,
const ExecutionPolicy& policy,
const OperatorNodeConfig& config)
: Super(OperatorNodeBaseCtorParam{
i0->owner_graph(), config, "pooling", {i0}}) {
init_megdnn_opr(*this, param);
add_input({i0});
m_policy = policy;
intl::MegDNNOprInitPostCtor<PoolingForward>::apply(*this);
}
SymbolVar PoolingForward::make(SymbolVar i0, const Param& param,
const OperatorNodeConfig& config,
const ExecutionPolicy& policy) {
intl::MegDNNOprInitInputsModifier<PoolingForward>::apply(param, {&i0});
return i0.insert_single_output_opr<PoolingForward>(i0.node(), param, policy,
config);
}
void PoolingForward::init_output_static_infer_desc() {
Super::set_nr_managed_outputs(this->output().size() - 1);
Super::Super::init_output_static_infer_desc();
init_output_static_infer_desc_workspace(
intl::AutoAddWorkspaceNeedLimitGetter<megdnn::PoolingForward>::val);
}
size_t PoolingForward::get_workspace_size_bytes(
const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes) const {
return AlgoChooser<megdnn::PoolingForward>::setup_algo(
{TensorLayout{input_shapes[0], input(0)->dtype(),
input(0)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this, false);
}
#if MGB_ENABLE_GRAD #if MGB_ENABLE_GRAD
MGB_IMPL_OPR_GRAD(PoolingForward) { MGB_IMPL_OPR_GRAD(PoolingForward) {
...@@ -29,7 +67,41 @@ MGB_IMPL_OPR_GRAD(PoolingForward) { ...@@ -29,7 +67,41 @@ MGB_IMPL_OPR_GRAD(PoolingForward) {
#endif #endif
MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward); MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward);
MEGDNN_OPR_INIT3(PoolingBackward, "pooling_bwd", 0, true);
PoolingBackward::PoolingBackward(VarNode* i0, VarNode* i1, VarNode* i2,
const Param& param,
const ExecutionPolicy& policy,
const OperatorNodeConfig& config)
: Super(
OperatorNodeBaseCtorParam{
i0->owner_graph(), config, "pooling_bwd", {i0}},
0, true) {
init_megdnn_opr(*this, param);
add_input({i0, i1, i2});
intl::MegDNNOprInitPostCtor<PoolingBackward>::apply(*this);
}
SymbolVar PoolingBackward::make(SymbolVar i0, SymbolVar i1, SymbolVar i2,
const Param& param,
const OperatorNodeConfig& config,
const ExecutionPolicy& policy) {
intl::MegDNNOprInitInputsModifier<PoolingBackward>::apply(param,
{&i0, &i1, &i2});
return i0.insert_single_output_opr<PoolingBackward>(
i0.node(), i1.node(), i2.node(), param, policy, config);
}
size_t PoolingBackward::get_workspace_size_bytes(
const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes) const {
return AlgoChooser<megdnn::PoolingBackward>::setup_algo(
{TensorLayout{input_shapes[0], input(0)->dtype(),
input(0)->format()},
{input_shapes[1], input(1)->dtype(), input(1)->format()},
{input_shapes[2], input(2)->dtype(), input(2)->format()},
{output_shapes[0], output(0)->dtype(), output(0)->format()}},
megdnn_opr(), this, false);
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -175,6 +175,8 @@ cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); ...@@ -175,6 +175,8 @@ cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward); cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward); cb(CONVBIAS_FORWARD, ConvBiasForward);
cb(POOLING_FORWARD, PoolingForward);
cb(POOLING_BACKWARD, PoolingBackward);
#undef cb #undef cb
...@@ -195,7 +197,9 @@ cb(CONVBIAS_FORWARD, ConvBiasForward); ...@@ -195,7 +197,9 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \
cb(BATCH_CONV_FORWARD, stmt) \ cb(BATCH_CONV_FORWARD, stmt) \
cb(CONVBIAS_FORWARD, stmt) cb(CONVBIAS_FORWARD, stmt) \
cb(POOLING_FORWARD, stmt) \
cb(POOLING_BACKWARD, stmt)
// clang-format on // clang-format on
#define _OPR_TYPE_CASE(_opr_type, _stmt) \ #define _OPR_TYPE_CASE(_opr_type, _stmt) \
...@@ -521,11 +525,14 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( ...@@ -521,11 +525,14 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
mgb_assert(m_fastrun_layouts.size() == layouts.size()); mgb_assert(m_fastrun_layouts.size() == layouts.size());
static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || static_assert(
std::tuple_size<FixedTensorLayouts>::value == 5 || std::tuple_size<FixedTensorLayouts>::value == 2 ||
std::tuple_size<FixedTensorLayouts>::value == 8, std::tuple_size<FixedTensorLayouts>::value == 3 ||
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " std::tuple_size<FixedTensorLayouts>::value == 4 ||
"deformable conv)"); std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
"Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes "
"arity = 3 , 5 or 8 (for deformable conv)");
} }
template <typename Opr> template <typename Opr>
......
...@@ -284,7 +284,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -284,7 +284,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
mdn_workspace), mdn_workspace),
std::forward_as_tuple(layouts[0], std::forward_as_tuple(layouts[0],
inp_val[1].as_megdnn()), inp_val[1].as_megdnn()),
array_skip<2>(layouts)); array_skip<arity_in>(layouts));
}); });
} }
}); });
......
...@@ -12,34 +12,50 @@ ...@@ -12,34 +12,50 @@
#pragma once #pragma once
#include "megbrain/opr/internal/megdnn_opr_wrapper.h" #include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megdnn/oprs.h" #include "megdnn/oprs.h"
namespace mgb { namespace mgb {
namespace opr { namespace opr {
MGB_DEFINE_OPR_CLASS(PoolingForward, MGB_DEFINE_OPR_CLASS(PoolingForward,
intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>) // { intl::MegDNNOprWrapperFwd<megdnn::PoolingForward>,
public mixin::AlgoChooserHelper) //{
public: public:
PoolingForward(VarNode *src, const Param &param, PoolingForward(VarNode * src, const Param& param,
const OperatorNodeConfig &config); const ExecutionPolicy& policy,
static SymbolVar make(SymbolVar src, const Param &param, const OperatorNodeConfig& config);
const OperatorNodeConfig &config = {}); static SymbolVar make(SymbolVar src, const Param& param,
const OperatorNodeConfig& config = {},
const ExecutionPolicy& policy = {});
void init_output_static_infer_desc() override;
size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes)
const override;
}; };
using Pooling = PoolingForward; using Pooling = PoolingForward;
MGB_DEFINE_OPR_CLASS(PoolingBackward, MGB_DEFINE_OPR_CLASS(PoolingBackward,
intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>) // { intl::MegDNNOprWrapperBwd<megdnn::PoolingBackward>,
public mixin::AlgoChooserHelper) //{
public: public:
PoolingBackward(VarNode *src, VarNode *dst, VarNode *diff, PoolingBackward(VarNode * src, VarNode * dst, VarNode * diff,
const Param &param, const OperatorNodeConfig &config); const Param& param, const ExecutionPolicy& policy,
static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff, const OperatorNodeConfig& config);
const Param &param,
const OperatorNodeConfig &config = {}); static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff,
const Param& param,
const OperatorNodeConfig& config = {},
const ExecutionPolicy& policy = {});
size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes,
const TensorShapeArray& output_shapes)
const override final;
}; };
} // namespace opr } // namespace opr
} // namespace mgb } // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/blas.h" #include "megbrain/opr/blas.h"
#include "megdnn/oprs/base.h" #include "megdnn/oprs/base.h"
......
...@@ -40,7 +40,9 @@ namespace opr { ...@@ -40,7 +40,9 @@ namespace opr {
cb(DeformableConvBackwardData) \ cb(DeformableConvBackwardData) \
cb(BatchConvBiasForward) \ cb(BatchConvBiasForward) \
cb(MatrixMul) \ cb(MatrixMul) \
cb(BatchedMatrixMul) cb(BatchedMatrixMul) \
cb(PoolingForward) \
cb(PoolingBackward)
// clang-format on // clang-format on
template <typename Opr> template <typename Opr>
......
...@@ -11,6 +11,9 @@ ...@@ -11,6 +11,9 @@
#include "./legacy_checker.h" #include "./legacy_checker.h"
#include "megbrain/opr/dnn/pooling.h" #include "megbrain/opr/dnn/pooling.h"
#include "megbrain/utils/persistent_cache.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/basic_arith_wrapper.h"
using namespace std; using namespace std;
using namespace mgb; using namespace mgb;
...@@ -104,6 +107,58 @@ TEST(TestOprDNN, PoolingBackward) ...@@ -104,6 +107,58 @@ TEST(TestOprDNN, PoolingBackward)
} }
} }
TEST(TestOprDNN, PoolingExePolicy) {
using Param = opr::Pooling::Param;
Param param;
using Policy = opr::Pooling::ExecutionPolicy;
using S = Policy::Strategy;
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto orig_impl = PersistentCache::set_impl(
std::make_shared<InMemoryPersistentCache>());
HostTensorND host_y, host_y_copy;
S strategy = S::HEURISTIC | S::REPRODUCIBLE;
auto graph = ComputingGraph::make();
HostTensorGenerator<> gen;
TensorShape shape = {1, 20, 24, 24};
auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn));
param.mode = Param::Mode::MAX;
param.window_h = param.window_w = 2;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 0;
param.format = Param::Format::NCHW;
Policy policy;
policy.strategy = strategy;
auto pooling = opr::PoolingForward::make(input, param, {}, policy);
auto loss0 = opr::reduce_sum_sqr(pooling, pooling.make_scalar(1));
auto grad = cg::grad(loss0, input, true, false);
opr::PoolingBackward* found = nullptr;
auto cb = [&found](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::PoolingBackward>()) {
found = &opr->cast_final_safe<opr::PoolingBackward>();
}
};
cg::DepOprIter{cb}.add(grad.node()->owner_opr());
found->set_execution_policy(strategy);
auto func = graph->compile({make_callback_copy(grad, host_y)});
func->execute().wait();
mgb_assert(found->megdnn_opr()->execution_policy().algo.name.find(
"cudnnReproducible") != std::string::npos);
}
} // anonymous namespace } // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册