提交 25089e52 编写于 作者: M Megvii Engine Team

refactor(megdnn): refactor matmul algo in conv backward data

GitOrigin-RevId: 8de601df6e7ddcdb48cd38d6ff05d132c1a014fe
上级 3620a940
...@@ -141,6 +141,10 @@ public: ...@@ -141,6 +141,10 @@ public:
size_t get_workspace_in_bytes(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override; void exec(const ExecArgs& args) const override;
std::vector<SearchItem> get_subopr_list(
const TensorLayoutArray& layouts,
const OperatorBase* opr) const override;
const char* name() const override { return "MATMUL"; } const char* name() const override { return "MATMUL"; }
bool is_reproducible() const override { return true; } bool is_reproducible() const override { return true; }
MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL) MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
......
...@@ -6,37 +6,96 @@ ...@@ -6,37 +6,96 @@
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/ */
#include "./algo.h" #include "./algo.h"
#include "src/cuda/utils.h"
#include "src/cuda/convolution/helper.h" #include "src/cuda/convolution/helper.h"
#include "src/cuda/convolution/im2col.cuh" #include "src/cuda/convolution/im2col.cuh"
#include "src/cuda/matrix_mul/opr_impl.h" #include "src/cuda/matrix_mul/opr_impl.h"
#include "src/cuda/utils.h"
using namespace megdnn; using namespace megdnn;
using namespace cuda; using namespace cuda;
namespace {
std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
const ConvolutionBackwardDataImpl::CanonizedFilterMeta& fm,
const TensorLayout& filter_layout, const TensorLayout& diff_layout,
const TensorLayout& grad_layout,
const ConvolutionBackwardDataImpl* opr) {
size_t N = grad_layout.shape[0], IC = fm.icpg,
OC = fm.ocpg, OH = diff_layout.shape[2],
OW = diff_layout.shape[3], FH = fm.spatial[0],
FW = fm.spatial[1];
megdnn_assert(filter_layout.dtype.enumv() == diff_layout.dtype.enumv());
TensorLayout Al({OC, IC * FH * FW}, filter_layout.dtype),
Bl({IC * FH * FW, OH * OW * N}, filter_layout.dtype),
Cl({OC, OH * OW * N}, filter_layout.dtype);
MatrixMulForward::Param param;
if (opr->param().compute_mode ==
param::Convolution::ComputeMode::FLOAT32) {
param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
}
param.transposeA = true;
return {{Al, Cl, Bl}, param};
}
} // namespace
std::vector<Algorithm::SearchItem>
ConvolutionBackwardDataImpl::AlgoMatmul::get_subopr_list(
const TensorLayoutArray& layouts, const OperatorBase* opr) const {
const ConvolutionBackwardDataImpl* conv_backward_data_opr =
static_cast<const ConvolutionBackwardDataImpl*>(opr);
CanonizedFilterMeta fm = conv_backward_data_opr->check_layout_fwd(
layouts[2], layouts[0], layouts[1]);
auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[2],
conv_backward_data_opr);
std::string param_str;
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::MATRIX_MUL_FORWARD, param_str,
config.first}};
}
bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available( bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
const SizeArgs &args) const { const SizeArgs& args) const {
if (args.diff_layout->dtype == args.filter_layout->dtype && if (args.diff_layout->dtype == args.filter_layout->dtype &&
args.diff_layout->dtype == dtype::BFloat16()) { args.diff_layout->dtype == dtype::BFloat16()) {
return false; return false;
} }
auto &&fm = args.filter_meta; auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW && return args.filter_meta.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT && args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
fm.group == 1 && fm.spatial_ndim == 2; fm.group == 1 && fm.spatial_ndim == 2;
} }
size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes( size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
const SizeArgs &args) const { const SizeArgs& args) const {
return matmul_get_workspace_bundle( auto matmul_opr =
args.as_fwd_args()).total_size_in_bytes(); args.handle->create_operator<MatrixMulForward>();
if (args.opr->execution_policy().algo.valid() &&
!args.opr->execution_policy().sub_policy.empty()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
matmul_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config =
sub_opr_config(args.filter_meta, *args.filter_layout,
*args.diff_layout, *args.grad_layout, args.opr);
matmul_opr->param() = config.second;
auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
sizes.push_back(matmul_opr->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2]));
return WorkspaceBundle(nullptr, sizes).total_size_in_bytes();
} }
void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const { void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
#define cb(DType) \ #define cb(DType) \
if (args.diff_layout->dtype == DType()) { \ if (args.diff_layout->dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \ using ctype = typename DTypeTrait<DType>::ctype; \
...@@ -49,77 +108,67 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const { ...@@ -49,77 +108,67 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const {
megdnn_assert_internal(0); megdnn_assert_internal(0);
} }
template<typename T> template <typename T>
void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal( void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(
const ExecArgs &args) { const ExecArgs& args) {
auto &&fm = args.filter_meta; auto&& fm = args.filter_meta;
size_t N = args.grad_layout->shape[0], size_t N = args.grad_layout->shape[0], IC = fm.icpg,
IC = fm.icpg, IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
IH = args.grad_layout->shape[2], OC = fm.ocpg, OH = args.diff_layout->shape[2],
IW = args.grad_layout->shape[3], OW = args.diff_layout->shape[3], FH = fm.spatial[0],
OC = fm.ocpg, FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
OH = args.diff_layout->shape[2], SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
OW = args.diff_layout->shape[3],
FH = fm.spatial[0],
FW = fm.spatial[1],
PH = fm.padding[0],
PW = fm.padding[1],
SH = fm.stride[0],
SW = fm.stride[1],
DH = fm.dilation[0],
DW = fm.dilation[1]; DW = fm.dilation[1];
auto stream = cuda_stream(args.handle); auto stream = cuda_stream(args.handle);
auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
wbundle.set(args.workspace.raw_ptr); auto matmul_opr = args.handle->create_operator<MatrixMulForward>();
T *diff_t = static_cast<T *>(wbundle.get(0)); if (args.opr->execution_policy().algo.valid()) {
T *col = static_cast<T *>(wbundle.get(1)); megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
matmul_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config =
sub_opr_config(args.filter_meta, *args.filter_layout,
*args.diff_layout, *args.grad_layout, args.opr);
matmul_opr->param() = config.second;
auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
sizes.push_back(matmul_opr->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2]));
auto wbundle = WorkspaceBundle(args.workspace.raw_ptr, sizes);
T* diff_t = static_cast<T*>(wbundle.get(0));
T* col = static_cast<T*>(wbundle.get(1));
{ {
// transpose diff // transpose diff
TensorLayout froml({N, OC*OH*OW}, typename DTypeTrait<T>::dtype()), TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
tol(froml); tol(froml);
froml.stride[0] = args.diff_layout->stride[0]; froml.stride[0] = args.diff_layout->stride[0];
tol.stride[0] = 1; tol.stride[0] = 1;
tol.stride[1] = N; tol.stride[1] = N;
TensorND from(args.diff_tensor->ptr<T>(), froml), TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
to(diff_t, tol);
args.handle->relayout_opr()->exec(from, to); args.handle->relayout_opr()->exec(from, to);
} }
{ {
// take gemm grad // take gemm grad
TensorLayout Al({OC, IC*FH*FW}, typename DTypeTrait<T>::dtype()), TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
Bl({IC*FH*FW, OH*OW*N}, typename DTypeTrait<T>::dtype()), Bl({IC * FH * FW, OH * OW * N},
Cl({OC, OH*OW*N}, typename DTypeTrait<T>::dtype()); typename DTypeTrait<T>::dtype()),
TensorND A(args.filter_tensor->ptr<T>(), Al), Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
B(col, Bl), TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
C(diff_t, Cl);
if (fm.should_flip) { if (fm.should_flip) {
convolution::flip_filter(args.as_fwd_args(), convolution::flip_filter(args.as_fwd_args(),
wbundle.get_workspace(2), A.raw_ptr); wbundle.get_workspace(2), A.raw_ptr);
matmul_opr->exec(A, C, B, wbundle.get_workspace(3));
} else {
matmul_opr->exec(A, C, B, wbundle.get_workspace(2));
} }
auto&& matmul_opr = args.handle->create_operator<MatrixMulForward>();
if (args.opr->param().compute_mode ==
param::Convolution::ComputeMode::FLOAT32) {
matmul_opr->param().compute_mode =
param::MatrixMul::ComputeMode::FLOAT32;
}
matmul_opr->param().transposeA = true;
megdnn_assert(matmul_opr->get_workspace_in_bytes(A.layout, C.layout,
B.layout) == 0_z,
"Assume matmul opr in algo MATMUL doesn't need extra "
"workspace");
matmul_opr->exec(A, C, B, Workspace());
} }
{ {
// col2im // col2im
convolution::col2im<T>(col, args.grad_tensor->ptr<T>(), convolution::col2im<T>(col, args.grad_tensor->ptr<T>(), N,
N, args.grad_layout->stride[0], args.grad_layout->stride[0], IC, IH, IW, FH, FW,
IC, IH, IW, OH, OW, PH, PW, SH, SW, DH, DW, stream);
FH, FW,
OH, OW,
PH, PW,
SH, SW,
DH, DW,
stream);
} }
} }
......
...@@ -31,8 +31,9 @@ bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available( ...@@ -31,8 +31,9 @@ bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available(
size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes( size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes(
const SizeArgs &args) const { const SizeArgs &args) const {
return matmul_get_workspace_bundle( return WorkspaceBundle(nullptr,
args.as_fwd_args()).total_size_in_bytes(); matmul_get_workspace_bundle(args.as_fwd_args()))
.total_size_in_bytes();
} }
void ConvolutionBackwardFilterImpl::AlgoMatmul::exec( void ConvolutionBackwardFilterImpl::AlgoMatmul::exec(
...@@ -69,7 +70,8 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal( ...@@ -69,7 +70,8 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(
DH = fm.dilation[0], DH = fm.dilation[0],
DW = fm.dilation[1]; DW = fm.dilation[1];
auto stream = cuda_stream(args.handle); auto stream = cuda_stream(args.handle);
auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); auto wbundle = WorkspaceBundle(
nullptr, matmul_get_workspace_bundle(args.as_fwd_args()));
wbundle.set(args.workspace.raw_ptr); wbundle.set(args.workspace.raw_ptr);
T *diff_t = static_cast<T *>(wbundle.get(0)); T *diff_t = static_cast<T *>(wbundle.get(0));
T *col = static_cast<T *>(wbundle.get(1)); T *col = static_cast<T *>(wbundle.get(1));
......
...@@ -48,7 +48,7 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) { ...@@ -48,7 +48,7 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) {
return supported; return supported;
} }
WorkspaceBundle convolution::matmul_get_workspace_bundle( SmallVector<size_t> convolution::matmul_get_workspace_bundle(
const ForwardSizeArgs &args) { const ForwardSizeArgs &args) {
auto dtype = args.src_layout->dtype; auto dtype = args.src_layout->dtype;
auto &&fm = args.filter_meta; auto &&fm = args.filter_meta;
...@@ -67,7 +67,7 @@ WorkspaceBundle convolution::matmul_get_workspace_bundle( ...@@ -67,7 +67,7 @@ WorkspaceBundle convolution::matmul_get_workspace_bundle(
if (args.filter_meta.should_flip) { if (args.filter_meta.should_flip) {
sizes.push_back(dtype.size() * OC * IC * FH * FW); sizes.push_back(dtype.size() * OC * IC * FH * FW);
} }
return {nullptr, std::move(sizes)}; return sizes;
} }
void convolution::flip_filter(const ForwardSizeArgs &args, void convolution::flip_filter(const ForwardSizeArgs &args,
......
...@@ -34,7 +34,8 @@ namespace convolution { ...@@ -34,7 +34,8 @@ namespace convolution {
bool is_cudnn_supported(const ForwardSizeArgs &args); bool is_cudnn_supported(const ForwardSizeArgs &args);
//! get workspace bundle for matmul algo //! get workspace bundle for matmul algo
WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs &args); SmallVector<size_t> matmul_get_workspace_bundle(
const ForwardSizeArgs& args);
struct CUDNNForwardDescs { struct CUDNNForwardDescs {
TensorDesc src_desc, dst_desc; TensorDesc src_desc, dst_desc;
......
...@@ -230,7 +230,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) ...@@ -230,7 +230,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
} }
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16", ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
{{"MATMUL", {}}}})); {{"MATMUL", {{"CUBLAS", {}}}}}}));
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
checker.set_rng(0, &rng) checker.set_rng(0, &rng)
...@@ -243,6 +243,37 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) ...@@ -243,6 +243,37 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
} }
} }
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL)
{
using namespace convolution;
std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
Checker<ConvolutionBackwardData> checker(handle_cuda());
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
NormalRNG default_rng;
for (auto &&arg: args) {
float scale =
64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
UniformFloatRNG rng(scale, 2 * scale);
auto src = TensorLayout(arg.src, dtype::Float32());
auto filter = TensorLayout(arg.filter, dtype::Float32());
TensorLayout dst;
{
auto opr = handle_cuda()->create_operator<Convolution>();
opr->param() = arg.param;
opr->deduce_layout(src, filter, dst);
}
src.dtype = dst.dtype = filter.dtype = dtype::Float32();
checker.set_rng(0, &default_rng)
.set_rng(1, &default_rng)
.set_epsilon(1e-3)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
}
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5)
{ {
// BRAIN-481 failed on architectures 7.0, remove the following if statement, // BRAIN-481 failed on architectures 7.0, remove the following if statement,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册