Commit b8b000db authored by Megvii Engine Team

feat(dnn/fallback): fix fallback interface of weight preprocess

GitOrigin-RevId: ca860f487e2c1d1264ab2b3ce0a5515c383b8dcb
Parent cf3a55ce
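This commit threads the new PreprocessedFilter argument through check_exec and the fallback NCB kernel parameters, so that workspace queries and kernel dispatch can see the preprocessed weights. For orientation, a minimal caller-side sketch of the weight-preprocess flow under the changed interface; operator/handle/layout setup is assumed, and alloc_preprocessed_tensors / make_workspace are hypothetical helpers, not part of this patch:

    // hypothetical helpers: alloc_preprocessed_tensors(), make_workspace()
    // 1. Ask the selected algorithm for the transformed-weight layouts.
    SmallVector<TensorLayout> pre_layouts =
            opr->deduce_preprocessed_filter_layout(src_layout, filter_layout,
                                                   dst_layout);
    TensorNDArray pre_tensors = alloc_preprocessed_tensors(handle, pre_layouts);
    ConvolutionForward::PreprocessedFilter preprocessed_filter{
            opr->execution_policy().algorithm, pre_tensors};

    // 2. Run the one-time weight transform; src/dst are passed as layouts
    //    only, matching the fallback implementation in this commit.
    size_t pre_ws = opr->get_preprocess_workspace_in_bytes(
            src_layout, filter_layout, dst_layout);
    opr->exec_preprocess(src_layout, filter, dst_layout, &preprocessed_filter,
                         make_workspace(pre_ws));

    // 3. Later exec() calls reuse the transformed weights; note that the
    //    workspace query now also receives the preprocessed filter.
    size_t ws = opr->get_workspace_in_bytes(src_layout, filter_layout,
                                            dst_layout, &preprocessed_filter);
    opr->exec(src, filter, dst, &preprocessed_filter, make_workspace(ws));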
......@@ -234,10 +234,10 @@ public:
const TensorLayout& dst) = 0;
protected:
CanonizedFilterMeta check_exec(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_in_bytes);
CanonizedFilterMeta check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_in_bytes,
const PreprocessedFilter* preprocessed_filter);
};
using Convolution = ConvolutionForward;
......@@ -408,12 +408,11 @@ public:
static WinogradParam parse_winograd_name(const std::string& algo_name);
protected:
CanonizedFilterMeta check_exec(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst,
size_t workspace_in_bytes);
CanonizedFilterMeta check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst, size_t workspace_in_bytes,
const PreprocessedFilter* preprocessed_filter);
};
using ConvBias = ConvBiasForward;
......
......@@ -32,7 +32,8 @@ void ConvBiasForward::deduce_layout(const TensorLayout& src,
ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst, size_t workspace_in_bytes) {
const TensorLayout& dst, size_t workspace_in_bytes,
const PreprocessedFilter* preprocessed_filter) {
if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW88_WINOGRAD ||
param().format == param::ConvBias::Format::NCHW44_WINOGRAD) &&
......@@ -82,9 +83,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
auto ret = check_layout_fwd(src, filter, dst);
megdnn_assert_contiguous(bias);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, bias, z, dst, nullptr);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
auto required_workspace_in_bytes = get_workspace_in_bytes(
src, filter, bias, z, dst, preprocessed_filter);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes,
"worksapce have size of %zu, but need %zu",
workspace_in_bytes, required_workspace_in_bytes);
if (bias.ndim != 0) {
//! a plain assert on bias.layout == dst.layout gives no failure information
auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) {
......
......@@ -1028,10 +1028,11 @@ void ConvolutionForward::deduce_layout(const TensorLayout& src,
ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_in_bytes) {
const TensorLayout& dst, size_t workspace_in_bytes,
const PreprocessedFilter* preprocessed_filter) {
auto ret = check_layout_fwd(src, filter, dst);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, dst, nullptr);
get_workspace_in_bytes(src, filter, dst, preprocessed_filter);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
return ret;
}
......
......@@ -25,10 +25,10 @@ namespace cuda {
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
workspace.size, preprocessed_filter);
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace);
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
z.layout, dst.layout);
......
......@@ -52,13 +52,10 @@ public:
const TensorLayout&, const TensorLayout&) override {
return {};
}
void exec_preprocess(const TensorLayout& ,
_megdnn_tensor_in ,
const TensorLayout& ,
const TensorLayout& ,
const TensorLayout& ,
PreprocessedFilter* ,
_megdnn_workspace ) override {
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, const TensorLayout&,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet");
}
......
......@@ -119,17 +119,22 @@ SmallVector<ConvBiasImpl::AlgoBase*> ConvBiasImpl::algo_pack() {
bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) {
return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0;
}
#define NCB_ALGO_FUNC(name, algo, param) \
static_cast<AlgoBase*>(algo)->name(this, param)
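//! e.g. NCB_ALGO_FUNC(get_workspace, algo, fparam) expands to
//! static_cast<AlgoBase*>(algo)->get_workspace(this, fparam)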
void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace);
workspace.size, preprocessed_filter);
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace,
preprocessed_filter);
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
if (!is_naive_algo(algo) &&
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst,
......@@ -137,18 +142,71 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
}
}
void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& bias_layout,
const TensorLayout& z_layout,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
//! exec_preprocess currently only supports preprocessing weights before
//! exec; src/dst/bias/z are ignored and just set to nullptr
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout},
bias{nullptr, bias_layout};
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace,
preprocessed_filter);
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
fparam) <= workspace.size) {
exec_preprocess_with_ncb_kern(fparam, algo);
} else {
naive::ConvBiasForwardImpl::exec_preprocess(
src_layout, filter, bias_layout, z_layout, dst_layout,
preprocessed_filter, workspace);
}
}
size_t ConvBiasImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst);
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst,
preprocessed_filter);
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(
src, filter, bias, z, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
return NCB_ALGO_FUNC(get_workspace, algo, fparam);
}
}
size_t ConvBiasImpl::get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
src, filter, bias, z, dst);
} else {
return NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam);
}
}
SmallVector<TensorLayout> ConvBiasImpl::deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
src, filter, bias, z, dst);
} else {
return NCB_ALGO_FUNC(deduce_preprocessed_filter_layout, algo, fparam);
}
}
......@@ -156,7 +214,7 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst);
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr);
auto ret = get_all_algorithms_with_ncb(fparam);
if (ret.empty()) {
return naive::ConvBiasForwardImpl::get_all_algorithms(src, filter, bias,
......@@ -170,7 +228,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic(
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst, size_t workspace_limit_in_bytes,
bool reproducible) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst);
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr);
auto result = get_algorithm_heuristic_with_ncb(
fparam, workspace_limit_in_bytes, reproducible);
if (result == nullptr) {
......@@ -181,9 +239,25 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic(
return result;
}
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible) {
for (auto i : get_all_algorithms_with_ncb(param)) {
size_t need_workspace = NCB_ALGO_FUNC(get_workspace, i, param);
if (static_cast<AlgoBase*>(i)->usable_reproducible(
this, param, AlgoSelectionStrategy::HEURISTIC,
reproducible) &&
need_workspace <= workspace_limit_in_bytes) {
return i;
}
}
return nullptr;
}
ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& dst) {
const TensorLayout& bias, const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto safe_u32 = [](size_t v) -> uint32_t {
megdnn_assert(v <= std::numeric_limits<uint32_t>::max(),
"value too large: %zu", v);
......@@ -258,7 +332,9 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
{src.stride[0], src.stride[1], src.stride[2], src.stride[3]},
{dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]},
param().compute_mode,
nr_threads},
nr_threads,
reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>(
preprocessed_filter)},
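// Note: this cast assumes ConvBiasForward::PreprocessedFilter and
// ConvolutionForward::PreprocessedFilter share the same layout, so the
// pointer can be reused as-is.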
param().output_block_size,
format,
bias.dtype,
......@@ -269,10 +345,12 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
_megdnn_tensor_out dst, _megdnn_workspace workspace) {
_megdnn_tensor_out dst, _megdnn_workspace workspace,
const PreprocessedFilter* preprocessed_filter) {
NCBKernParam ret;
static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param(
src.layout, filter.layout, bias.layout, dst.layout);
static_cast<NCBKernSizeParam&>(ret) =
make_ncb_kern_size_param(src.layout, filter.layout, bias.layout,
dst.layout, preprocessed_filter);
ret.src_ptr = src.raw_ptr;
ret.filter_ptr = filter.raw_ptr;
ret.bias_ptr = bias.raw_ptr;
......@@ -284,7 +362,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param(
void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param,
ConvBiasImpl::Algorithm* algo) {
auto ncb_kerns = ncb_algo_dispatch_kerns(algo, param);
auto ncb_kerns = NCB_ALGO_FUNC(dispatch_kerns, algo, param);
for (auto&& kernel : ncb_kerns) {
auto run = [kernel, param](size_t index, size_t thread_id) {
CpuNDRange ndrange_id(kernel.global_size, index);
......@@ -295,21 +373,17 @@ void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param,
}
}
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible) {
return ncb_algo_get_algorithm_heuristic(param, workspace_limit_in_bytes,
reproducible);
}
size_t ConvBiasImpl::ncb_algo_get_workspace(Algorithm* algo,
const NCBKernSizeParam& param) {
return static_cast<AlgoBase*>(algo)->get_workspace(this, param);
}
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::ncb_algo_dispatch_kerns(
Algorithm* algo, const NCBKernSizeParam& param) {
return static_cast<AlgoBase*>(algo)->dispatch_kerns(this, param);
void ConvBiasImpl::exec_preprocess_with_ncb_kern(
const NCBKernParam& param, ConvBiasImpl::Algorithm* algo) {
auto ncb_kerns = NCB_ALGO_FUNC(dispatch_preprocess_kerns, algo, param);
for (auto&& kernel : ncb_kerns) {
auto run = [kernel, param](size_t index, size_t thread_id) {
CpuNDRange ndrange_id(kernel.global_size, index);
kernel.kern(param, {thread_id, ndrange_id});
};
static_cast<naive::HandleImpl*>(handle())->dispatch_kern(
run, kernel.global_size.total_size());
}
}
std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb(
......@@ -332,20 +406,6 @@ std::vector<ConvBiasImpl::Algorithm*> ConvBiasImpl::get_all_algorithms_with_ncb(
return algos;
}
ConvBiasImpl::Algorithm* ConvBiasImpl::ncb_algo_get_algorithm_heuristic(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible) {
for (auto i : get_all_algorithms_with_ncb(param)) {
if (static_cast<AlgoBase*>(i)->usable_reproducible(
this, param, AlgoSelectionStrategy::HEURISTIC,
reproducible) &&
ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) {
return i;
}
}
return nullptr;
}
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm(
const NCBKernSizeParam& param, size_t workspace_size) {
if (auto set = execution_policy().algorithm) {
......
......@@ -51,6 +51,25 @@ public:
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;
void exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& bias_layout,
const TensorLayout& z_layout,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst) override;
size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) override;
//! implemented via the selected algorithm's get_workspace()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
......@@ -198,6 +217,23 @@ public:
virtual SmallVector<NCBKern> dispatch_kerns(
ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0;
virtual SmallVector<NCBKern> dispatch_preprocess_kerns(
ConvBiasImpl*, const NCBKernSizeParam&) const {
return {};
};
//! get the layouts of the weight preprocess dst
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
ConvBiasImpl*, const NCBKernSizeParam&) const {
return {};
};
//! get the workspace needed by weight preprocess
virtual size_t get_preprocess_workspace(ConvBiasImpl*,
const NCBKernSizeParam&) const {
return 0_z;
};
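//! Algorithms that do not support weight preprocess can simply keep these
//! defaults: no preprocess kernels, no extra filter layouts and zero
//! preprocess workspace.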
//! Temporarily used to identify whether the matmul algorithm is
//! is_preferred.
virtual bool is_preferred(ConvBiasImpl*,
......@@ -219,40 +255,19 @@ public:
virtual SmallVector<AlgoBase*> algo_pack();
protected:
//! default impl calls ncb_algo_dispatch_kern()
virtual void exec_with_ncb_kern(const NCBKernParam& param,
ConvBiasImpl::Algorithm* algo);
//! default impl calls ncb_algo_get_all_algorithms()
virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
Algorithm* algo);
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
const NCBKernSizeParam& param);
//! default impl calls ncb_algo_get_algorithm_heuristic()
virtual Algorithm* get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible = false);
/**
* \brief get kernel pointer for non-contiguous batch kernel or
* simply conv bias kernel.
*
* whether the kernel processing batch 1-group is decided by the
* algo.
*/
virtual SmallVector<NCBKern> ncb_algo_dispatch_kerns(
Algorithm* algo, const NCBKernSizeParam& param);
virtual size_t ncb_algo_get_workspace(Algorithm* algo,
const NCBKernSizeParam& param);
/*!
* the default impl iterates over all ncb_algo_get_all_algorithms()
* and return the first one whose workspace does not exceed the limit.
*/
virtual Algorithm* ncb_algo_get_algorithm_heuristic(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible = false);
const char* get_algorithm_set_name() const override;
private:
......@@ -276,16 +291,16 @@ private:
const NCBKernSizeParam& param,
size_t workspace_size = std::numeric_limits<size_t>::max());
NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& dst);
NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_in bias,
_megdnn_tensor_out dst,
_megdnn_workspace workspace);
NCBKernSizeParam make_ncb_kern_size_param(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter);
NCBKernParam make_ncb_kern_param(
_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_out dst,
_megdnn_workspace workspace,
const PreprocessedFilter* preprocessed_filter);
};
} // namespace fallback
......
......@@ -376,7 +376,67 @@ size_t ConvolutionImpl::AlgoDefault::get_workspace(
return get_bundle(param).total_size_in_bytes();
}
//! Return the implementation kernel
size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace(
ConvolutionImpl*, const NCBKernSizeParam& param) const {
::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
init_convbias_opr_and_param(m_conv_bias_opr, param);
m_conv_bias_opr->execution_policy() = {m_algorithm};
return m_algorithm->get_preprocess_workspace(m_conv_bias_opr,
conv_bias_param);
}
SmallVector<TensorLayout>
ConvolutionImpl::AlgoDefault::deduce_preprocessed_filter_layout(
ConvolutionImpl*, const NCBKernSizeParam& param) const {
::ConvBiasImpl::NCBKernSizeParam conv_bias_param =
init_convbias_opr_and_param(m_conv_bias_opr, param);
m_conv_bias_opr->execution_policy() = {m_algorithm};
return m_algorithm->deduce_preprocessed_filter_layout(m_conv_bias_opr,
conv_bias_param);
}
//! Return the implementation preprocess kernel
SmallVector<ConvolutionImpl::NCBKern>
ConvolutionImpl::AlgoDefault::get_preprocess_kimpl(
::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo,
const NCBKernSizeParam& param) {
MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) {
// construct the conv_bias kern param
::ConvBiasImpl::NCBKernParam conv_bias_param;
::ConvBiasImpl::NCBKernSizeParam conv_bias_size_param =
init_convbias_opr_and_param(conv_bias_opr, param);
static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) =
conv_bias_size_param;
auto conv_bias_preprocess_kerns =
algo->dispatch_preprocess_kerns(conv_bias_opr, conv_bias_param);
SmallVector<ConvolutionImpl::NCBKern> convolution_preprocess_kerns;
//! Set the conv_bias param using convolution param
auto set_copy_param_filter_workspace_ptr =
[](const NCBKernParam& conv_param,
::ConvBiasImpl::NCBKernParam& copied_param) {
copied_param.filter_ptr = conv_param.filter_ptr;
copied_param.workspace_ptr = conv_param.workspace_ptr;
copied_param.workspace_size = conv_param.workspace_size;
};
for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) {
auto kernel = conv_bias_preprocess_kerns[i];
//! If the kernel is batch parallel
auto run = [=](const NCBKernParam& p,
const NCBKernIndex& ncb_index) {
auto copy_param = conv_bias_param;
set_copy_param_filter_workspace_ptr(p, copy_param);
kernel.kern(copy_param,
{ncb_index.thread_id, ncb_index.ndrange_id});
};
convolution_preprocess_kerns.push_back({run, kernel.global_size});
}
return convolution_preprocess_kerns;
}
MIDOUT_END();
}
//! Return the implementation kernel
SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl(
::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo,
const NCBKernSizeParam& param) {
......@@ -392,7 +452,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl(
SmallVector<ConvolutionImpl::NCBKern> convolution_kerns;
//! Set the conv_bias param using convolution param
auto set_copy_param_run_time_address =
auto set_copy_param_compute_address =
[](const NCBKernParam& conv_param,
::ConvBiasImpl::NCBKernParam& copied_param) {
copied_param.src_ptr = conv_param.src_ptr;
......@@ -407,7 +467,7 @@ SmallVector<ConvolutionImpl::NCBKern> ConvolutionImpl::AlgoDefault::get_kimpl(
auto run = [=](const NCBKernParam& p,
const NCBKernIndex& ncb_index) {
auto copy_param = conv_bias_param;
set_copy_param_run_time_address(p, copy_param);
set_copy_param_compute_address(p, copy_param);
kernel.kern(copy_param,
{ncb_index.thread_id, ncb_index.ndrange_id});
};
......
......@@ -110,6 +110,9 @@ class ConvolutionImpl::AlgoDefault final : public AlgoBase {
static SmallVector<NCBKern> get_kimpl(ConvBiasImpl* conv_bias_opr,
ConvBiasImpl::AlgoBase* algo,
const NCBKernSizeParam& param);
static SmallVector<NCBKern> get_preprocess_kimpl(
ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo,
const NCBKernSizeParam& param);
public:
AlgoDefault(fallback::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase*);
......@@ -121,6 +124,17 @@ public:
size_t get_workspace(ConvolutionImpl* opr,
const NCBKernSizeParam& param) const override;
size_t get_preprocess_workspace(ConvolutionImpl*,
const NCBKernSizeParam&) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
ConvolutionImpl*, const NCBKernSizeParam&) const override;
SmallVector<NCBKern> dispatch_preprocess_kern(
ConvolutionImpl*, const NCBKernSizeParam& param) const override {
return get_preprocess_kimpl(m_conv_bias_opr, m_algorithm, param);
}
SmallVector<NCBKern> dispatch_kern(
ConvolutionImpl* /*opr*/,
const NCBKernSizeParam& param) const override {
......
......@@ -80,14 +80,19 @@ SmallVector<ConvolutionImpl::AlgoBase*> ConvolutionImpl::algo_pack() {
bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) {
return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0;
}
#define NCB_ALGO_FUNC(name, algo, param) \
static_cast<AlgoBase*>(algo)->name(this, param)
void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto fparam = make_ncb_kern_param(src, filter, dst, workspace);
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter,
workspace);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
if (!is_naive_algo(algo) &&
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvolutionForwardImpl::exec(src, filter, dst,
......@@ -95,24 +100,73 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
}
}
void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
//! exec_preprocess currently only supports preprocessing weights before
//! exec; src/dst are ignored and just set to nullptr
TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout};
auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter,
workspace);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo,
fparam) <= workspace.size) {
exec_preprocess_with_ncb_kern(fparam, algo);
} else {
naive::ConvolutionForwardImpl::exec_preprocess(
src_layout, filter, dst_layout, preprocessed_filter, workspace);
}
}
size_t ConvolutionImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, dst);
auto fparam =
make_ncb_kern_size_param(src, filter, dst, preprocessed_filter);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvolutionForwardImpl::get_workspace_in_bytes(
src, filter, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
return static_cast<AlgoBase*>(algo)->get_workspace(this, fparam);
}
}
size_t ConvolutionImpl::get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvolutionForwardImpl::get_preprocess_workspace_in_bytes(
src, filter, dst);
} else {
return static_cast<AlgoBase*>(algo)->get_preprocess_workspace(this,
fparam);
}
}
SmallVector<TensorLayout> ConvolutionImpl::deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvolutionForwardImpl::deduce_preprocessed_filter_layout(
src, filter, dst);
} else {
return static_cast<AlgoBase*>(algo)->deduce_preprocessed_filter_layout(
this, fparam);
}
}
std::vector<ConvolutionImpl::Algorithm*> ConvolutionImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
auto fparam = make_ncb_kern_size_param(src, filter, dst);
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr);
auto ret = get_all_algorithms_with_ncb(fparam);
if (ret.empty()) {
return naive::ConvolutionForwardImpl::get_all_algorithms(src, filter,
......@@ -125,7 +179,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_limit_in_bytes,
bool reproducible) {
auto fparam = make_ncb_kern_size_param(src, filter, dst);
auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr);
auto result = get_algorithm_heuristic_with_ncb(
fparam, workspace_limit_in_bytes, reproducible);
if (result == nullptr) {
......@@ -137,7 +191,8 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic(
ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto safe_u32 = [](size_t v) -> uint32_t {
megdnn_assert(v <= std::numeric_limits<uint32_t>::max(),
"value too large: %zu", v);
......@@ -175,15 +230,17 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param(
{src.stride[0], src.stride[1], src.stride[2], src.stride[3]},
{dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]},
param().compute_mode,
nr_threads};
nr_threads,
preprocessed_filter};
}
ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
NCBKernParam ret;
static_cast<NCBKernSizeParam&>(ret) =
make_ncb_kern_size_param(src.layout, filter.layout, dst.layout);
static_cast<NCBKernSizeParam&>(ret) = make_ncb_kern_size_param(
src.layout, filter.layout, dst.layout, preprocessed_filter);
ret.src_ptr = src.raw_ptr;
ret.filter_ptr = filter.raw_ptr;
ret.dst_ptr = dst.raw_ptr;
......@@ -192,9 +249,30 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
return ret;
}
void ConvolutionImpl::exec_preprocess_with_ncb_kern(const NCBKernParam& param,
Algorithm* algo) {
auto kerns =
static_cast<AlgoBase*>(algo)->dispatch_preprocess_kern(this, param);
auto fallback_handle = handle();
for (auto kernel : kerns) {
megdnn_assert(
param.filter_meta.format == Param::Format::NCHW ||
param.filter_meta.format == Param::Format::NHWC ||
param.filter_meta.format == Param::Format::NCHW88 ||
param.filter_meta.format == Param::Format::NCHW44,
"invalid conv format");
auto run = [param, kernel](size_t index, size_t thread_id) {
CpuNDRange ndrange_id(kernel.global_size, index);
kernel.kern(param, {thread_id, ndrange_id});
};
static_cast<naive::HandleImpl*>(fallback_handle)
->dispatch_kern(run, kernel.global_size.total_size());
}
}
void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param,
Algorithm* algo) {
auto kerns = ncb_algo_dispatch_kern(algo, param);
auto kerns = static_cast<AlgoBase*>(algo)->dispatch_kern(this, param);
auto fallback_handle = handle();
for (auto kernel : kerns) {
megdnn_assert(param.filter_meta.format == Param::Format::NCHW ||
......@@ -215,10 +293,13 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic_with_ncb(
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible) {
for (auto i : get_all_algorithms_with_ncb(param)) {
if (static_cast<AlgoBase*>(i)->usable_reproducible(
this, param, AlgoSelectionStrategy::HEURISTIC,
reproducible) &&
ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) {
size_t need_workspace =
static_cast<AlgoBase*>(i)->get_workspace(this, param);
bool usable_reproducible =
static_cast<AlgoBase*>(i)->usable_reproducible(
this, param, AlgoSelectionStrategy::HEURISTIC,
reproducible);
if (usable_reproducible && need_workspace <= workspace_limit_in_bytes) {
return i;
}
}
......
......@@ -39,12 +39,26 @@ public:
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;
void exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
//! implemented via the selected algorithm's get_workspace()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter*) override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) override;
//! implemented by get_all_algorithms_with_ncb()
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
......@@ -70,6 +84,8 @@ public:
ptrdiff_t inp_s[4], out_s[4];
Param::ComputeMode compute_mode;
size_t nr_threads;
//! weight_preprocess info
const PreprocessedFilter* preprocessed_filter;
};
//! memory param for kernels with non-contiguous batch
......@@ -169,6 +185,23 @@ public:
virtual SmallVector<NCBKern> dispatch_kern(
ConvolutionImpl* opr, const NCBKernSizeParam& param) const = 0;
virtual SmallVector<NCBKern> dispatch_preprocess_kern(
ConvolutionImpl*, const NCBKernSizeParam&) const {
return {};
};
//! get the layouts of the weight preprocess dst
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
ConvolutionImpl*, const NCBKernSizeParam&) const {
return {};
};
//! get the workspace needed by weight preprocess
virtual size_t get_preprocess_workspace(ConvolutionImpl*,
const NCBKernSizeParam&) const {
return 0_z;
};
//! Temporarily used to identify whether the matmul algorithm is
//! is_preferred.
virtual bool is_preferred(ConvolutionImpl*,
......@@ -192,6 +225,9 @@ public:
protected:
virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo);
virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
Algorithm* algo);
virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
const NCBKernSizeParam& param);
......@@ -199,21 +235,6 @@ protected:
const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
bool reproducible = false);
//! get kernel pointer
virtual SmallVector<NCBKern> ncb_algo_dispatch_kern(
Algorithm* algo, const NCBKernSizeParam& param) {
return static_cast<AlgoBase*>(algo)->dispatch_kern(this, param);
}
//! get algo workspace
virtual size_t ncb_algo_get_workspace(Algorithm* algo,
const NCBKernSizeParam& param) {
return static_cast<AlgoBase*>(algo)->get_workspace(this, param);
}
/*!
* the default impl iterates over all ncb_1g_get_all_algorithms()
* and return the first one whose workspace does not exceed the limit.
*/
const char* get_algorithm_set_name() const override;
class AlgoFallback;
......@@ -231,14 +252,16 @@ private:
const NCBKernSizeParam& param,
size_t workspace_size = std::numeric_limits<size_t>::max());
NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst);
NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
_megdnn_workspace workspace);
NCBKernSizeParam make_ncb_kern_size_param(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter);
NCBKernParam make_ncb_kern_param(
_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace);
};
class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
......
......@@ -80,14 +80,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) {
dt_byte *workspace_ptr = workspace.raw_ptr;
// ============================w * f + b================================
auto filter_meta = check_exec(src.layout, filter.layout, bias.layout,
z.layout, dst.layout, workspace.size);
auto filter_meta =
check_exec(src.layout, filter.layout, bias.layout, z.layout,
dst.layout, workspace.size, preprocessed_filter);
auto sfb = dst;
if (bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) {
// intermediate result
......
......@@ -61,9 +61,7 @@ public:
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, const TensorLayout&,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override{
megdnn_throw("conv_bias exec_preprocess is not impl yet");
}
_megdnn_workspace) override {}
const char* get_algorithm_set_name() const override;
};
......
......@@ -28,11 +28,11 @@ using namespace naive;
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_fwd) {
auto filter_meta = check_exec(src.layout, filter.layout, dst.layout,
workspace.size);
workspace.size, preprocessed_filter);
using ComputeMode = Param::ComputeMode;
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \
do { \
......
......@@ -44,9 +44,7 @@ class ConvolutionForwardImpl: public ConvolutionForward {
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("convolution exec_preprocess in not impl yet");
}
_megdnn_workspace) override {}
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& , const TensorLayout& ,
......
......@@ -18,6 +18,9 @@
#include "test/common/workspace_wrapper.h"
#include <algorithm>
#include <memory>
namespace megdnn {
namespace test {
......@@ -32,6 +35,9 @@ struct OprProxyDefaultImpl
template <typename Opr>
struct OprProxy : public OprProxyDefaultImpl<Opr> {};
template <typename Opr>
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {};
template <typename Opr>
struct OprProxyVectorToSingle {};
......@@ -139,6 +145,28 @@ struct OprProxyProfilingBase
typename Opr::Algorithm* target_algo = nullptr;
OprProxyProfilingBase(bool profile = false) { m_profiling = profile; }
//! used to allocate tensors for weight preprocess
static std::shared_ptr<TensorNDArray> alloc_tensors(
Handle* handle, const TensorLayoutArray& layouts) {
auto deleter = [handle](TensorNDArray* ptr) {
for (auto&& i : *ptr) {
auto pdata = static_cast<dt_byte*>(i.raw_ptr) +
i.layout.span().low_byte;
megdnn_free(handle, pdata);
}
delete ptr;
};
std::shared_ptr<TensorNDArray> ret{new TensorNDArray, deleter};
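// Shift each base pointer by -span.low_byte so that a layout with
// negative strides (low_byte < 0) still indexes into the allocated
// block; the deleter above undoes the same shift before freeing.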
for (size_t i = 0; i < layouts.size(); ++i) {
auto span = layouts[i].span();
ret->emplace_back(static_cast<dt_byte*>(
megdnn_malloc(handle, span.dist_byte())) -
span.low_byte,
layouts[i]);
}
return ret;
}
};
template <class Opr>
......@@ -207,7 +235,6 @@ DEF_PROF3(LocalShareBackwardData);
DEF_PROF3(LocalShareBackwardFilter);
#undef DEF_PROF3
//! TODO: it should adapt weight preprocess later
template <>
struct OprProxy<ConvolutionForward>
: public OprProxyProfilingTernary<ConvolutionForward> {
......@@ -263,6 +290,100 @@ struct OprProxy<ConvolutionForward>
}
};
template <>
struct OprWeightPreprocessProxy<ConvolutionForward>
: public OprProxyProfilingTernary<ConvolutionForward> {
using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary;
void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == 3);
if (!Base::W.valid()) {
Base::W = WorkspaceWrapper(opr->handle(), 0);
}
if (Base::m_profiling && !Base::target_algo) {
size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo :
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
tensors[2].layout)) {
opr->execution_policy().algorithm = algo;
auto preprocess_tensors = weight_preprocess(opr, tensors, algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvolutionForward::PreprocessedFilter preprocessed_filter{
algo, *preprocess_tensors};
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
&preprocessed_filter);
Base::W.update(workspace_size);
for (size_t times = 0; times < Base::warmup_times; ++times)
opr->exec(tensors[0], tensors[1], tensors[2],
&preprocessed_filter, Base::W.workspace());
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < Base::exec_times; ++times) {
opr->exec(tensors[0], tensors[1], tensors[2],
&preprocessed_filter, Base::W.workspace());
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
algo->name());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
Base::target_algo = algo;
}
}
opr->execution_policy().algorithm = Base::target_algo;
auto preprocess_tensors =
weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvolutionForward::PreprocessedFilter preprocessed_filter{
Base::target_algo, *preprocess_tensors};
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
&preprocessed_filter);
Base::W.update(workspace_size);
}
auto preprocess_tensors =
weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvolutionForward::PreprocessedFilter preprocessed_filter{
Base::target_algo, *preprocess_tensors};
if (!Base::target_algo) {
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
&preprocessed_filter);
Base::W.update(workspace_size);
}
opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter,
Base::W.workspace());
}
//! handle weight preprocess
std::shared_ptr<TensorNDArray> weight_preprocess(
ConvolutionForward* opr, const TensorNDArray& tensors,
ConvolutionForward::Algorithm* algo) {
auto weight_preprocess_layouts = opr->deduce_preprocessed_filter_layout(
tensors[0].layout, tensors[1].layout, tensors[2].layout);
auto preprocessed_filter_tensors_ptr =
alloc_tensors(opr->handle(), weight_preprocess_layouts);
ConvolutionForward::PreprocessedFilter preprocessed_filter{
algo, *preprocessed_filter_tensors_ptr};
size_t preprocess_workspace_size =
opr->get_preprocess_workspace_in_bytes(tensors[0].layout,
tensors[1].layout,
tensors[2].layout);
WorkspaceWrapper preprocess_workspace(opr->handle(),
preprocess_workspace_size);
opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout,
&preprocessed_filter,
preprocess_workspace.workspace());
return preprocessed_filter_tensors_ptr;
}
};
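A hedged sketch of how a test might drive this proxy, assuming the Checker template accepts a proxy type as its second parameter (as the other proxies in this header are used); the handle and shapes are illustrative, and ConvBiasForward goes through its own specialization further below:

    Checker<ConvolutionForward, OprWeightPreprocessProxy<ConvolutionForward>>
            checker(handle);
    checker.set_param(param).execs({{1, 8, 24, 24}, {8, 8, 3, 3}, {}});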
template <class Opr>
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
......@@ -329,11 +450,9 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
DEF_PROF5(DeformableConvForward);
DEF_PROF5(DeformableConvBackwardFilter);
//DEF_PROF5(ConvBiasForward);
DEF_PROF5(BatchConvBiasForward);
#undef DEF_PROF5
//! TODO: it should adapt weight preprocess later
template <>
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
......@@ -390,6 +509,106 @@ struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
}
};
template <>
struct OprWeightPreprocessProxy<ConvBiasForward>
: public OprProxyProfiling5<ConvBiasForward> {
using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
megdnn_assert(tensors.size() == 5);
if (!Base::W.valid()) {
Base::W = WorkspaceWrapper(opr->handle(), 0);
}
if (Base::m_profiling && !Base::target_algo) {
size_t min_time = std::numeric_limits<size_t>::max();
for (auto algo :
opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout)) {
opr->execution_policy().algorithm = algo;
auto preprocess_tensors = weight_preprocess(opr, tensors, algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvBiasForward::PreprocessedFilter preprocessed_filter{
algo, *preprocess_tensors};
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout,
&preprocessed_filter);
Base::W.update(workspace_size);
for (size_t times = 0; times < Base::warmup_times; ++times)
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
tensors[4], &preprocessed_filter,
Base::W.workspace());
megcoreSynchronize(opr->handle()->megcore_computing_handle());
Timer timer;
timer.start();
for (size_t times = 0; times < Base::exec_times; ++times) {
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
tensors[4], &preprocessed_filter,
Base::W.workspace());
}
megcoreSynchronize(opr->handle()->megcore_computing_handle());
timer.stop();
printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
algo->name());
if (min_time > timer.get_time_in_us()) {
min_time = timer.get_time_in_us();
Base::target_algo = algo;
}
}
opr->execution_policy().algorithm = Base::target_algo;
auto preprocess_tensors =
weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvBiasForward::PreprocessedFilter preprocessed_filter{
Base::target_algo, *preprocess_tensors};
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout, &preprocessed_filter);
Base::W.update(workspace_size);
}
auto preprocess_tensors =
weight_preprocess(opr, tensors, Base::target_algo);
megcoreSynchronize(opr->handle()->megcore_computing_handle());
ConvBiasForward::PreprocessedFilter preprocessed_filter{
Base::target_algo, *preprocess_tensors};
if (!Base::target_algo) {
auto workspace_size = opr->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout, &preprocessed_filter);
Base::W.update(workspace_size);
}
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
&preprocessed_filter, Base::W.workspace());
}
//! handle weight preprocess
std::shared_ptr<TensorNDArray> weight_preprocess(
ConvBiasForward* opr, const TensorNDArray& tensors,
ConvBiasForward::Algorithm* algo) {
auto weight_preprocess_layouts = opr->deduce_preprocessed_filter_layout(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout);
auto preprocessed_filter_tensors_ptr =
alloc_tensors(opr->handle(), weight_preprocess_layouts);
ConvBiasForward::PreprocessedFilter preprocessed_filter{
algo, *preprocessed_filter_tensors_ptr};
size_t preprocess_workspace_size =
opr->get_preprocess_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout);
WorkspaceWrapper preprocess_workspace(opr->handle(),
preprocess_workspace_size);
opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout,
tensors[3].layout, tensors[4].layout,
&preprocessed_filter,
preprocess_workspace.workspace());
return preprocessed_filter_tensors_ptr;
}
};
template <class Opr>
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> {
using Base = OprProxyProfilingBase<Opr, 8>;
......