Commit 0b320568 authored by Megvii Engine Team

feat(dnn/naive): support weight preprocess interface in dnn

GitOrigin-RevId: 84791aacf9ed53214fccaf074e67fd1460383380
Parent: b2f0ceb2
......@@ -210,21 +210,25 @@ public:
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;
virtual void exec_preprocess(const TensorLayout& src_layout,
_megdnn_tensor_in filter,
const TensorLayout& dst_layout,
PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) = 0;
void deduce_dtype(DType src, DType filter, DType& dst);
void deduce_layout(const TensorLayout& src, const TensorLayout& filter,
TensorLayout& dst);
virtual size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
PreprocessedFilter* preprocessed_filter) = 0;
const PreprocessedFilter* preprocessed_filter) = 0;
virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) = 0;
virtual size_t get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) = 0;
......@@ -337,7 +341,7 @@ public:
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
PreprocessedFilter* preprocessed_filter) = 0;
const PreprocessedFilter* preprocessed_filter) = 0;
virtual size_t get_preprocess_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
......
......@@ -76,7 +76,7 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec(
auto ret = check_layout_fwd(src, filter, dst);
megdnn_assert_contiguous(bias);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, bias, z, dst);
get_workspace_in_bytes(src, filter, bias, z, dst, nullptr);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
if (bias.ndim != 0) {
//! bias.layout == dst.layout failed, no assert information
......
......@@ -981,7 +981,8 @@ ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, size_t workspace_in_bytes) {
auto ret = check_layout_fwd(src, filter, dst);
auto required_workspace_in_bytes = get_workspace_in_bytes(src, filter, dst);
auto required_workspace_in_bytes =
get_workspace_in_bytes(src, filter, dst, nullptr);
megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
return ret;
}
......
......@@ -112,7 +112,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
convbias_opr->execution_policy() = {m_impl};
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor,
fdst_tensor, cvter.workspace());
fdst_tensor, nullptr, cvter.workspace());
}
{ cvter.comp_to_dst_type(fdst_tensor, *args.dst_tensor); }
}
......
......@@ -25,6 +25,7 @@ namespace cuda {
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
......@@ -208,7 +209,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter*) {
AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
return get_algorithm(this, src, filter, bias, z, dst)
->get_workspace_in_bytes(args);
......
......@@ -20,7 +20,9 @@ public:
using ConvBiasForward::ConvBiasForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
......@@ -34,7 +36,30 @@ public:
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override;
const TensorLayout&,
const PreprocessedFilter*) override;
//! Workspace required by the preprocess pass. The cuda conv_bias backend
//! performs no weight preprocessing (deduce_preprocessed_filter_layout()
//! returns an empty list), so no workspace is needed.
size_t get_preprocess_workspace_in_bytes(const TensorLayout& /* src */,
                                         const TensorLayout& /* filter */,
                                         const TensorLayout& /* bias */,
                                         const TensorLayout& /* z */,
                                         const TensorLayout& /* dst */) override {
    return 0;
}
//! Layouts of the preprocessed filter tensors. An empty list signals to
//! callers that the cuda conv_bias backend does not support weight
//! preprocessing, so the preprocess pass should be skipped entirely.
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
        const TensorLayout&, const TensorLayout&, const TensorLayout&,
        const TensorLayout&, const TensorLayout&) override {
    return {};
}
//! Run the weight-preprocess pass. Not supported on cuda conv_bias:
//! deduce_preprocessed_filter_layout() returns an empty list, so this
//! entry point should never be reached; throw if it is.
void exec_preprocess(const TensorLayout& /* src_layout */,
                     _megdnn_tensor_in /* filter */,
                     const TensorLayout& /* bias_layout */,
                     const TensorLayout& /* z_layout */,
                     const TensorLayout& /* dst_layout */,
                     PreprocessedFilter* /* preprocessed_filter */,
                     _megdnn_workspace /* workspace */) override {
    // fixed typo in the original message ("has not implemeted yet")
    megdnn_throw("cuda conv_bias exec_preprocess has not been implemented yet");
}
const char* get_algorithm_set_name() const override;
......
......@@ -73,22 +73,32 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
size_t ConvolutionForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto extra_data = conv_bias_extra_data(dst);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->get_workspace_in_bytes(src, filter, extra_data.bias_layout,
extra_data.z_layout, dst);
->get_workspace_in_bytes(
src, filter, extra_data.bias_layout, extra_data.z_layout,
dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter));
}
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto extra_data = conv_bias_extra_data(dst.layout);
TensorND bias(nullptr, extra_data.bias_layout);
TensorND z(nullptr, extra_data.z_layout);
return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
->exec(src, filter, bias, z, dst, workspace);
->exec(src, filter, bias, z, dst,
reinterpret_cast<const ConvolutionBase<
param::ConvBias>::PreprocessedFilter*>(
preprocessed_filter),
workspace);
}
const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
......
......@@ -11,6 +11,7 @@
#pragma once
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
namespace megdnn {
namespace cuda {
......@@ -18,10 +19,11 @@ namespace cuda {
class ConvolutionForwardImpl: public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src,
const TensorLayout &filter,
const TensorLayout &dst) override;
......@@ -30,11 +32,28 @@ class ConvolutionForwardImpl: public ConvolutionForward {
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;
const char* get_algorithm_set_name() const override;
//! Layouts of the preprocessed filter tensors. The empty list tells
//! callers that cuda convolution offers no weight preprocessing.
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
        const TensorLayout&, const TensorLayout&,
        const TensorLayout&) override {
    return {};
}
//! Workspace required by the preprocess pass: none, since cuda
//! convolution performs no weight preprocessing.
size_t get_preprocess_workspace_in_bytes(const TensorLayout& /* src */,
                                         const TensorLayout& /* filter */,
                                         const TensorLayout& /* dst */) override {
    return 0;
}
//! Run the weight-preprocess pass. Not supported on cuda convolution;
//! deduce_preprocessed_filter_layout() returns an empty list, so this
//! should never be called.
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                     const TensorLayout&, PreprocessedFilter*,
                     _megdnn_workspace) override {
    // fixed typo in the original message ("has not implemeted yet")
    megdnn_throw("cuda exec_preprocess has not been implemented yet");
}
protected:
struct ConvBiasExtraData{
std::unique_ptr<ConvBiasForward> convbias_opr;
......
......@@ -27,7 +27,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_workspace workspace) {
megdnn_assert(dst.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv,
"Mask conv only support Float32 dtype.");
m_conv_opr->exec(src, filter, dst, workspace);
m_conv_opr->exec(src, filter, dst, nullptr, workspace);
auto stream = cuda_stream(handle());
#define cb(DType) \
if (mask.layout.dtype == DType()) { \
......
......@@ -30,7 +30,7 @@ public:
const TensorLayout& dst) override {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = param();
return m_conv_opr->get_workspace_in_bytes(src, filter, dst);
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr);
}
private:
......
......@@ -95,7 +95,9 @@ bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) {
}
void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) {
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
workspace.size);
auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace);
......@@ -104,20 +106,21 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, workspace);
naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst,
preprocessed_filter, workspace);
}
}
size_t ConvBiasImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
size_t ConvBiasImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst);
ConvBiasImpl::Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(src, filter,
bias, z, dst);
return naive::ConvBiasForwardImpl::get_workspace_in_bytes(
src, filter, bias, z, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
}
......
......@@ -41,14 +41,16 @@ public:
//! implemented by exec_with_ncb_kern()
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;
//! implemented by get_workspace_with_ncb()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) override;
const TensorLayout& dst,
const PreprocessedFilter*) override;
//! implemented by get_all_algorithms_with_ncb()
std::vector<Algorithm*> get_all_algorithms(
......
......@@ -82,6 +82,7 @@ bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) {
}
void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) {
auto fparam = make_ncb_kern_param(src, filter, dst, workspace);
ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size);
......@@ -89,18 +90,20 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
ncb_algo_get_workspace(algo, fparam) <= workspace.size) {
exec_with_ncb_kern(fparam, algo);
} else {
naive::ConvolutionForwardImpl::exec(src, filter, dst, workspace);
naive::ConvolutionForwardImpl::exec(src, filter, dst,
preprocessed_filter, workspace);
}
}
size_t ConvolutionImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
size_t ConvolutionImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) {
auto fparam = make_ncb_kern_size_param(src, filter, dst);
Algorithm* algo = get_algorithm(fparam);
if (is_naive_algo(algo)) {
return naive::ConvolutionForwardImpl::get_workspace_in_bytes(
src, filter, dst);
src, filter, dst, preprocessed_filter);
} else {
return ncb_algo_get_workspace(algo, fparam);
}
......
......@@ -36,12 +36,14 @@ public:
//! implemented by exec_with_ncb_kern()
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst, const PreprocessedFilter*,
_megdnn_workspace workspace) override;
//! implemented by get_workspace_with_ncb()
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) override;
const TensorLayout& dst,
const PreprocessedFilter*) override;
//! implemented by get_all_algorithms_with_ncb()
std::vector<Algorithm*> get_all_algorithms(
......
......@@ -54,7 +54,8 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& flt,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) {
const TensorLayout& dst,
const PreprocessedFilter*) {
size_t float_workspace_size = 0;
if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) {
......@@ -79,6 +80,7 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) {
dt_byte *workspace_ptr = workspace.raw_ptr;
......
......@@ -22,7 +22,9 @@ public:
using ConvBiasForward::ConvBiasForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_in bias, _megdnn_tensor_in z,
_megdnn_tensor_out dst, _megdnn_workspace workspace) override;
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
......@@ -37,11 +39,32 @@ public:
size_t workspace_limit_in_bytes,
bool reproducible) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& bias,
const TensorLayout& z,
const TensorLayout& dst) override;
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& bias, const TensorLayout& z,
const TensorLayout& dst,
const PreprocessedFilter* preprocessed_filter) override;
//! Workspace required by the preprocess pass: none, since the naive
//! conv_bias backend performs no weight preprocessing.
size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
                                         const TensorLayout&,
                                         const TensorLayout&,
                                         const TensorLayout&,
                                         const TensorLayout&) override {
    return 0;
}
//! Layouts of the preprocessed filter tensors. The empty list tells
//! callers that the naive conv_bias backend has no weight preprocessing.
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
        const TensorLayout&, const TensorLayout&, const TensorLayout&,
        const TensorLayout&, const TensorLayout&) override {
    return {};
}
//! Run the weight-preprocess pass. Not implemented for the naive
//! conv_bias backend; deduce_preprocessed_filter_layout() returns an
//! empty list, so this should never be reached.
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                     const TensorLayout&, const TensorLayout&,
                     const TensorLayout&, PreprocessedFilter*,
                     _megdnn_workspace) override {
    megdnn_throw("conv_bias exec_preprocess is not impl yet");
}
const char* get_algorithm_set_name() const override;
};
......
......@@ -28,12 +28,11 @@ using namespace naive;
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
_megdnn_workspace workspace)
{
const PreprocessedFilter*,
_megdnn_workspace workspace) {
MIDOUT_BEGIN(megdnn_naive_conv_fwd) {
auto filter_meta = check_exec(
src.layout, filter.layout, dst.layout, workspace.size);
auto filter_meta = check_exec(src.layout, filter.layout, dst.layout,
workspace.size);
using ComputeMode = Param::ComputeMode;
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \
do { \
......@@ -57,19 +56,23 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
DISPATCH(Int8, Int16, dt_int8, dt_int16, dt_int16);
DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32);
DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32);
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, dt_float16,
dt_float32, ComputeMode::FLOAT32));
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16,
dt_float16, dt_float32,
ComputeMode::FLOAT32));
MEGDNN_INC_FLOAT16(DISPATCH_CMODE(BFloat16, BFloat16, dt_bfloat16,
dt_bfloat16, dt_float32,
ComputeMode::FLOAT32));
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32, dt_qint32);
DISPATCH(Quantized8Asymm, QuantizedS32, dt_quint8, dt_qint32,
dt_qint32);
DISPATCH(QuantizedS8, QuantizedS8, dt_int8, dt_int8, dt_int32);
#undef DISPATCH
megdnn_throw(ssprintf("unsupported Conv(%s, %s) -> %s with cmode = %d",
src.layout.dtype.name(), filter.layout.dtype.name(),
src.layout.dtype.name(),
filter.layout.dtype.name(),
dst.layout.dtype.name(),
static_cast<int>(param().compute_mode)));
} MIDOUT_END();
}
MIDOUT_END();
}
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter,
......
......@@ -10,6 +10,7 @@
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
namespace naive {
......@@ -17,9 +18,9 @@ namespace naive {
class ConvolutionForwardImpl: public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm *> get_all_algorithms(const TensorLayout &src,
const TensorLayout &filter,
......@@ -30,10 +31,29 @@ class ConvolutionForwardImpl: public ConvolutionForward {
size_t workspace_limit_in_bytes,
bool reproducible) override;
//! Workspace required by exec(): the naive implementation needs no
//! scratch memory; the preprocessed filter argument is ignored.
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
                              const TensorLayout&,
                              const PreprocessedFilter*) override {
    return 0;
}
//! Workspace required by the preprocess pass: none, since the naive
//! convolution backend performs no weight preprocessing.
size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
                                         const TensorLayout&,
                                         const TensorLayout&) override {
    return 0;
}
//! Run the weight-preprocess pass. Not implemented for naive
//! convolution; deduce_preprocessed_filter_layout() returns an empty
//! list, so this should never be reached.
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
                     const TensorLayout&, PreprocessedFilter*,
                     _megdnn_workspace) override {
    // fixed grammar in the original message ("in not impl yet")
    megdnn_throw("convolution exec_preprocess is not impl yet");
}
//! Layouts of the preprocessed filter tensors. The empty list tells
//! callers that naive convolution has no weight preprocessing.
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
        const TensorLayout& , const TensorLayout& ,
        const TensorLayout& )override{
    return {};
}
const char* get_algorithm_set_name() const override;
};
......
......@@ -97,7 +97,7 @@ void ConvPoolingForwardImpl::exec(const _megdnn_in TensorND src,
TensorND conv_dst((float*)(workspace.raw_ptr), conv_dst_layout);
//convFwd->check_layout(src.layout, filter.layout, workspace.layout, empty_wsp.layout);
check_layout(src.layout, filter.layout, bias.layout, dst.layout, workspace.size);
convFwd->exec(src, filter, conv_dst, empty_wsp);
convFwd->exec(src, filter, conv_dst, nullptr, empty_wsp);
// calculate bias
int conv_dst_batch = conv_dst.layout.shape[0];
......
......@@ -80,7 +80,7 @@ void MaskConvForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_workspace workspace) {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = this->param();
m_conv_opr->exec(src, filter, dst, workspace);
m_conv_opr->exec(src, filter, dst, nullptr, workspace);
#define cb(DType) \
if (mask.layout.dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \
......@@ -99,7 +99,7 @@ size_t MaskConvForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(mask);
m_conv_opr->param() = this->param();
return m_conv_opr->get_workspace_in_bytes(src, filter, dst);
return m_conv_opr->get_workspace_in_bytes(src, filter, dst, nullptr);
}
void MaskPropagateImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
......
......@@ -103,7 +103,7 @@ void SeparableConvForwardImpl::exec(_megdnn_tensor_in src,
ConvolutionForwardImpl* convOptr = new ConvolutionForwardImpl(this->handle());
Workspace empty_wsp;
convOptr->exec(src, filter2d, dst, empty_wsp);
convOptr->exec(src, filter2d, dst, nullptr, empty_wsp);
delete(convOptr);
free(filter2d_buf);
......
......@@ -664,7 +664,7 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout);
tensors[4].layout, nullptr);
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
......@@ -676,7 +676,8 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) {
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));
free(wb.ptr());
};
......
......@@ -1008,7 +1008,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
conv_bias_opr->param().output_block_size = m;
size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout, tensors[2].layout,
tensors[3].layout, tensors[4].layout);
tensors[3].layout, tensors[4].layout, nullptr);
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
......@@ -1020,7 +1020,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr, wb.get_workspace(1));
free(wb.ptr());
};
......
......@@ -200,15 +200,70 @@ struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> {
using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \
}
DEF_PROF3(ConvolutionForward);
DEF_PROF3(ConvolutionBackwardData);
DEF_PROF3(ConvolutionBackwardFilter);
DEF_PROF3(LocalShareForward);
DEF_PROF3(LocalShareBackwardData);
DEF_PROF3(LocalShareBackwardFilter);
#undef DEF_PROF3
//! TODO: it should adapt weight preprocess later
//! Profiling proxy for ConvolutionForward. Specialized (instead of using
//! DEF_PROF3) because exec/get_workspace_in_bytes now take an extra
//! PreprocessedFilter* argument; this proxy always passes nullptr, i.e.
//! no weight preprocessing is exercised during profiling.
template <>
struct OprProxy<ConvolutionForward>
        : public OprProxyProfilingTernary<ConvolutionForward> {
    using OprProxyProfilingTernary<ConvolutionForward>::OprProxyProfilingTernary;

    //! Execute the operator on (src, filter, dst). When profiling is
    //! enabled and no algorithm has been chosen yet, time every available
    //! algorithm and cache the fastest one in Base::target_algo.
    void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 3);
        // lazily create the shared workspace wrapper
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo :
                 opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
                                         tensors[2].layout)) {
                opr->execution_policy().algorithm = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        nullptr);
                Base::W.update(workspace_size);
                // warm-up runs, excluded from the measurement below
                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                              Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                              Base::W.workspace());
                }
                // synchronize before stopping the timer so all queued work
                // is included in the measured interval
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo->name());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo = algo;
                }
            }
            // pin the winning algorithm and re-size the workspace for it
            opr->execution_policy().algorithm = Base::target_algo;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout, nullptr);
            Base::W.update(workspace_size);
        }
        // profiling disabled (or nothing selected): just size the workspace
        // for whatever algorithm the execution policy currently picks
        if (!Base::target_algo) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    nullptr);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                  Base::W.workspace());
    }
};
template <class Opr>
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
using Base = OprProxyProfilingBase<Opr, 5>;
......@@ -274,10 +329,67 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
DEF_PROF5(DeformableConvForward);
DEF_PROF5(DeformableConvBackwardFilter);
DEF_PROF5(ConvBiasForward);
//DEF_PROF5(ConvBiasForward);
DEF_PROF5(BatchConvBiasForward);
#undef DEF_PROF5
//! TODO: it should adapt weight preprocess later
//! Profiling proxy for ConvBiasForward. Specialized (instead of using
//! DEF_PROF5) because exec/get_workspace_in_bytes now take an extra
//! PreprocessedFilter* argument; this proxy always passes nullptr, i.e.
//! no weight preprocessing is exercised during profiling.
template <>
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
    using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;

    //! Execute the operator on (src, filter, bias, z, dst). When profiling
    //! is enabled and no algorithm has been chosen yet, time every
    //! available algorithm and cache the fastest one in Base::target_algo.
    void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 5);
        // lazily create the shared workspace wrapper
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo :
                 opr->get_all_algorithms(tensors[0].layout, tensors[1].layout,
                                         tensors[2].layout, tensors[3].layout,
                                         tensors[4].layout)) {
                opr->execution_policy().algorithm = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout, nullptr);
                Base::W.update(workspace_size);
                // warm-up runs, excluded from the measurement below
                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], nullptr, Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], nullptr, Base::W.workspace());
                }
                // synchronize before stopping the timer so all queued work
                // is included in the measured interval
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo->name());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo = algo;
                }
            }
            // pin the winning algorithm and re-size the workspace for it
            opr->execution_policy().algorithm = Base::target_algo;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, nullptr);
            Base::W.update(workspace_size);
        }
        // profiling disabled (or nothing selected): just size the workspace
        // for whatever algorithm the execution policy currently picks
        if (!Base::target_algo) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, nullptr);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
                  nullptr, Base::W.workspace());
    }
};
template <class Opr>
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> {
using Base = OprProxyProfilingBase<Opr, 8>;
......
......@@ -75,8 +75,8 @@ TEST_F(CPU, MASK_PROPAGATE) {
auto dst = TensorND{dst_ptr, dst_layout};
WorkspaceWrapper workspace{
handle(), opr->get_workspace_in_bytes(src.layout, filter.layout,
dst.layout)};
opr->exec(src, filter, dst, workspace.workspace());
dst.layout, nullptr)};
opr->exec(src, filter, dst, nullptr, workspace.workspace());
for (size_t i = 0; i < dst.layout.total_nr_elems(); ++i) {
mask_dst.ptr<int>()[i] = dst_ptr[i] > 0;
}
......
......@@ -176,6 +176,46 @@ public:
}
}
//! special for weight preprocess
//! Benchmark helper: run the same channel-wise convolution through opr0
//! (dense param — presumably the cudnn path, per the printf below; TODO
//! confirm against caller) and opr1 (same param but GROUP sparse), timing
//! each with cuda events. Convolutions are executed with a nullptr
//! PreprocessedFilter, i.e. without weight preprocessing.
void exec_convolution(ConvolutionForward* opr0, ConvolutionForward* opr1) {
    opr0->param().pad_h = pad_h;
    opr0->param().pad_w = pad_w;
    // opr1 shares opr0's param except it runs as a grouped convolution
    opr1->param() = opr0->param();
    opr1->param().sparse = param::Convolution::Sparse::GROUP;
    TensorND a0, b0, c0, a1, b1, c1;
    std::tie(a0, b0, c0) = shuffle(std::make_tuple(
            src0->tensornd(), flt0->tensornd(), dst0->tensornd()));
    std::tie(a1, b1, c1) = shuffle(std::make_tuple(
            src1->tensornd(), flt1->tensornd(), dst1->tensornd()));
    // one workspace sized for the larger of the two operators
    WorkspaceWrapper wk(
            handle,
            std::max(opr0->get_workspace_in_bytes(a0.layout, b0.layout,
                                                  c0.layout, nullptr),
                     opr1->get_workspace_in_bytes(a1.layout, b1.layout,
                                                  c1.layout, nullptr)));
    cudaProfilerStart();
    // record events around each exec so the elapsed times can be compared
    cudaEventRecord(cuda_ev[0], cuda_stream);
    opr0->exec(a0, b0, c0, nullptr, wk.workspace());
    cudaEventRecord(cuda_ev[1], cuda_stream);
    opr1->exec(a1, b1, c1, nullptr, wk.workspace());
    cudaEventRecord(cuda_ev[2], cuda_stream);
    cudaProfilerStop();
    if (getenv("MEGDNN_CHANWISE_CONV_VERBOSE") ||
        getenv("MEGDNN_CHANWISE_CONV_FULLBENCH")) {
        // events are only valid after the stream has drained
        cudaStreamSynchronize(cuda_stream);
        float t0 = -1, t1 = -1;
        cudaEventElapsedTime(&t0, cuda_ev[0], cuda_ev[1]);
        cudaEventElapsedTime(&t1, cuda_ev[1], cuda_ev[2]);
        printf("%s;%s;%s: cudnn/megdnn: %.3fms/%.3fms=%.3f\n",
               lsrc.TensorShape::to_string().c_str(),
               lflt1.TensorShape::to_string().c_str(),
               ldst.TensorShape::to_string().c_str(),
               t0, t1, t0 / t1);
    }
}
void cmp_dst() {
Tensor<> dst0_cpu(handle_cpu, ldst), dst1_cpu(handle_cpu, ldst);
megdnn_memcpy_D2H(handle,
......@@ -399,7 +439,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
benv.alloc(N, IC, IH, IW, CHL_MUL, FH, FW, PH, PW);
benv.fill_src();
benv.fill_flt();
benv.exec(conv0.get(), conv1.get());
benv.exec_convolution(conv0.get(), conv1.get());
benv.cmp_dst();
};
......
......@@ -30,10 +30,10 @@ TEST(DISPATCHER, NULL_DISPATCHER)
auto layout = TensorLayout({1, 1, 1, 1}, dtype::Float32());
TensorND src(nullptr, layout), filter(nullptr, layout), dst(nullptr, layout);
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout);
auto wsize = opr->get_workspace_in_bytes(layout, layout, layout, nullptr);
Workspace workspace(nullptr, wsize);
opr->exec(src, filter, dst, workspace);
opr->exec(src, filter, dst, nullptr, workspace);
}
#endif
......
......@@ -217,11 +217,11 @@ TEST_F(NAIVE, CONV_BIAS_QUANTIZED8x8x32_NCHW32) {
size_t ws_size = conv_opr->get_workspace_in_bytes(
src_layout_4, filter_layout_4, bias_layout_4, z_layout_4,
dst_layout_4);
dst_layout_4, nullptr);
WorkspaceWrapper ws{handle(), ws_size};
conv_opr->exec(src_ts_4.tensornd(), filter_ts_4.tensornd(),
bias_ts_4.tensornd(), z_ts_4.tensornd(), dst_ts_4.tensornd(),
ws.workspace());
nullptr, ws.workspace());
TensorLayout src_layout_32{{N, IC / 32, IH, IW, 32},
dtype::QuantizedS8(0.1f)};
......
......@@ -209,7 +209,8 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
}
auto workspace_size = conv->get_workspace_in_bytes(
tensors[0].layout, tensors[1].layout, tensors[2].layout);
tensors[0].layout, tensors[1].layout, tensors[2].layout,
nullptr);
dt_byte* workspace_ptr = static_cast<dt_byte*>(malloc(workspace_size));
Workspace workspace{workspace_ptr, workspace_size};
......@@ -217,7 +218,7 @@ TEST_F(NAIVE, CONVOLUTION_WITH_NCHW4) {
relayout->exec(nchw4_tensors[0], nchw_tensors[0]);
relayout->exec(nchw4_tensors[1], nchw_tensors[1]);
conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2],
conv->exec(nchw_tensors[0], nchw_tensors[1], nchw_tensors[2], nullptr,
workspace);
relayout->exec(nchw_tensors[2], nchw4_tensors[2]);
......
......@@ -1334,8 +1334,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
size_t conv_bias_workspace_in_bytes =
conv_bias_opr->get_workspace_in_bytes(
tensors[0].layout, filter_transform_layout,
tensors[2].layout, tensors[3].layout,
tensors[4].layout);
tensors[2].layout, tensors[3].layout, tensors[4].layout,
nullptr);
WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
conv_bias_workspace_in_bytes,
......@@ -1347,7 +1347,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
wb.get_workspace(2));
conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
tensors[3], tensors[4], wb.get_workspace(1));
tensors[3], tensors[4], nullptr,
wb.get_workspace(1));
free(wb.ptr());
};
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.