Commit 08ff62de authored by Megvii Engine Team

refactor(megdnn): refactor batched matmul algo in conv bias

GitOrigin-RevId: 64fda611ff39c3f6ab46761d70daef8433375688
Parent: 8773926e
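Context for the hunks below: this algorithm only claims 1x1 convolutions with unit stride and zero padding (see the is_available conditions in the diff), because exactly then the conv-bias forward pass is a batched matrix multiply. A minimal standalone sketch of that identity, with all names and shapes assumed for illustration rather than taken from MegDNN:

#include <cstddef>
#include <vector>

// For each image n: dst[n] (OC x HW) = filter (OC x IC) * src[n] (IC x HW).
// The same filter matrix is shared by every batch entry, which is why the
// diff below gives layout A a batch stride of 0.
void conv1x1_as_batched_matmul(const std::vector<float>& filter,  // OC * IC
                               const std::vector<float>& src,     // N * IC * HW
                               std::vector<float>& dst,           // N * OC * HW
                               std::size_t N, std::size_t OC,
                               std::size_t IC, std::size_t HW) {
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t oc = 0; oc < OC; ++oc)
            for (std::size_t hw = 0; hw < HW; ++hw) {
                float acc = 0.f;
                for (std::size_t ic = 0; ic < IC; ++ic)
                    acc += filter[oc * IC + ic] *
                           src[(n * IC + ic) * HW + hw];
                dst[(n * OC + oc) * HW + hw] = acc;
            }
}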
@@ -361,9 +361,6 @@ private:
};
class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A,
TensorLayout& B, TensorLayout& C);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
@@ -372,10 +369,15 @@ public:
const char* name() const override {
if (m_name.empty()) {
m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"BATCHEDMATMUL", {});
"BATCHED_MATMUL", {});
}
return m_name.c_str();
}
std::vector<SearchItem> get_subopr_list(
const TensorLayoutArray& layouts,
const OperatorBase* opr) const override;
bool is_reproducible() const override { return true; }
MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)
......
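The header hunk above swaps the private extract_matmul_layouts helper for a get_subopr_list override, which is how the algorithm now advertises its batched-matmul dependency to the dispatcher. A rough sketch of what one returned search item carries, with the field meanings read off the .cpp hunk further down and everything else assumed:

// One Algorithm::SearchItem (simplified, illustrative):
//   opr type : BATCHED_MATRIX_MUL_FORWARD  -- which sub-opr to search
//   param    : serialized MatrixMul param  -- how to configure it
//   layouts  : {A, B, C}                   -- the shapes to profile it on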
@@ -6,10 +6,13 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/common/algo_chooser.h"
#include "src/common/conv_bias.h"
#include "src/cuda/batched_matrix_mul/algo.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.cuh"
@@ -18,18 +21,72 @@ using namespace megdnn;
using namespace cuda;
using namespace conv_bias;
namespace {
std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
const ConvBiasForwardImpl::CanonizedFilterMeta& fm,
const TensorLayout& src_layout, const TensorLayout&,
const TensorLayout& dst_layout, const ConvBiasForwardImpl* opr) {
// A {N, OC, IC}
// B {N, IC, H * W}
// C {N, OC, H * W}
size_t batched = src_layout.shape[0];
TensorLayout A, B, C;
A = {{batched, fm.ocpg, fm.icpg}, fm.dtype};
A.stride[0] = 0;
B.ndim = 3;
B.shape[1] = src_layout.shape[1];
B.shape[2] = src_layout.shape[2] * src_layout.shape[3];
B.shape[0] = batched;
B.stride[2] = 1;
B.stride[1] = src_layout.stride[1];
B.stride[0] = src_layout.stride[0];
B.dtype = src_layout.dtype;
C = {{dst_layout.shape[0], dst_layout.shape[1], B.shape[2]},
dst_layout.dtype};
MatrixMulForward::Param param;
if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {
param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
}
return {{A, B, C}, param};
}
} // namespace
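// Worked example of sub_opr_config (shapes assumed for illustration): for
// src NCHW = {8, 64, 12, 12} and a 1x1 filter with OC = 256, the sub-opr
// receives
//   A = {8, 256, 64},  A.stride[0] = 0: the single {OC, IC} filter matrix
//                      is broadcast across all 8 batch entries;
//   B = {8, 64, 144},  built on src's own strides: a copy-free view of src
//                      with H and W flattened into 12 * 12 = 144;
//   C = {8, 256, 144}, a contiguous description of dst.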
std::vector<Algorithm::SearchItem>
ConvBiasForwardImpl::AlgoBatchedMatmul::get_subopr_list(
const TensorLayoutArray& layouts, const OperatorBase* opr) const {
const ConvBiasForwardImpl* conv_bias_opr =
static_cast<const ConvBiasForwardImpl*>(opr);
CanonizedFilterMeta fm =
conv_bias_opr->check_layout_fwd(layouts[0], layouts[1], layouts[4]);
auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[4],
conv_bias_opr);
std::string param_str;
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD, param_str,
config.first}};
}
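// Note: serialize_write_pod copies the sub-opr param byte-for-byte into
// param_str, so each SearchItem is a self-contained (opr type, param bytes,
// layouts) key that the global algorithm chooser can cache and profile
// without touching this conv-bias instance again.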
bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
const SizeArgs& args) const {
if (args.z_layout->ndim > 0)
return false;
//! cudnn batched matmul with discontinuous stride has many bugs, so disable
//! here.
TensorLayout A, B, C;
extract_matmul_layouts(args, A, B, C);
if (!B.is_contiguous()) {
return false;
auto bmatmul_opr = args.handle->create_operator<BatchedMatrixMulForward>();
if (args.opr->execution_policy().algo.valid() &&
!args.opr->execution_policy().sub_policy.empty()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
bmatmul_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config =
sub_opr_config(args.filter_meta, *args.src_layout,
*args.filter_layout, *args.dst_layout, args.opr);
bmatmul_opr->param() = config.second;
auto&& fm = args.filter_meta;
return fm.format == Param::Format::NCHW &&
(fm.dtype.enumv() == DTypeEnum::Float32 ||
@@ -37,29 +94,10 @@ bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 &&
fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 &&
fm.stride[1] == 1;
}
void ConvBiasForwardImpl::AlgoBatchedMatmul::extract_matmul_layouts(
const SizeArgs& args, TensorLayout& A, TensorLayout& B,
TensorLayout& C) {
auto&& fm = args.filter_meta;
// A {N, OC, IC}
// B {N, IC, H * W}
// C {N, OC, H * W}
size_t batched = args.src_layout->shape[0];
A = {{batched, fm.ocpg, fm.icpg}, fm.dtype};
A.stride[0] = 0;
B.ndim = 3;
B.shape[1] = args.src_layout->shape[1];
B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3];
B.shape[0] = batched;
B.stride[2] = 1;
B.stride[1] = args.src_layout->stride[1];
B.stride[0] = args.src_layout->stride[0];
B.dtype = args.src_layout->dtype;
C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]},
args.dst_layout->dtype};
fm.stride[1] == 1 &&
get_algorithm(
static_cast<BatchedMatrixMulForwardImpl*>(bmatmul_opr.get()),
config.first[0], config.first[1], config.first[2]);
}
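// Availability is now delegated: the structural checks above (NCHW, single
// group, 1x1 spatial, unit stride, zero padding, no dilation) stay cheap,
// and get_algorithm() additionally requires that some batched-matmul
// algorithm accepts the derived A/B/C layouts. When the caller pinned this
// algo and provided a sub-policy, that sub-policy is forwarded first, so
// the answer reflects the sub-algorithm that exec() will actually use.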
WorkspaceBundle ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_bundle(
@@ -76,11 +114,23 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_bundle(
SizeArgs conv_args = args;
conv_args.dst_layout = &dst_layout;
TensorLayout A, B, C;
extract_matmul_layouts(conv_args, A, B, C);
sizes.insert(
sizes.begin(),
args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C));
auto bmatmul_opr = args.handle->create_operator<BatchedMatrixMulForward>();
if (args.opr->execution_policy().algo.valid() &&
!args.opr->execution_policy().sub_policy.empty()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
bmatmul_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config =
sub_opr_config(args.filter_meta, *args.src_layout,
*args.filter_layout, *args.dst_layout, args.opr);
bmatmul_opr->param() = config.second;
sizes.insert(sizes.begin(),
args.handle->batched_matrix_mul()->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2]));
return {ptr, std::move(sizes)};
}
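// The sub-opr's workspace requirement is prepended as bundle slot 0;
// exec() below hands bundle.get_workspace(0) straight to the batched
// matmul, while the remaining slots keep serving the rest of the
// conv-bias pipeline.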
@@ -104,13 +154,23 @@ void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const {
conv_args.dst_tensor = &conv_dst_tensor;
conv_args.dst_layout = &conv_dst_tensor.layout;
{
TensorND A, B, C;
extract_matmul_layouts(args, A.layout, B.layout, C.layout);
A.raw_ptr = args.filter_tensor->raw_ptr;
B.raw_ptr = args.src_tensor->raw_ptr;
C.raw_ptr = args.dst_tensor->raw_ptr;
auto mm = args.handle->batched_matrix_mul();
mm->exec(A, B, C, bundle.get_workspace(0));
auto bmatmul_opr =
args.handle->create_operator<BatchedMatrixMulForward>();
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
bmatmul_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config =
sub_opr_config(args.filter_meta, *args.src_layout,
*args.filter_layout, *args.dst_layout, args.opr);
bmatmul_opr->param() = config.second;
TensorND A{args.filter_tensor->raw_ptr, config.first[0]},
B{args.src_tensor->raw_ptr, config.first[1]},
C{args.dst_tensor->raw_ptr, config.first[2]};
bmatmul_opr->exec(A, B, C, bundle.get_workspace(0));
}
handle_bias_and_nonlinear(args.handle, args.nonlinear_mode,
&conv_dst_tensor, args.dst_tensor,
......
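The exec() hunk above wraps conv-bias's existing device buffers in TensorNDs built from the derived layouts; no data is copied. A minimal standalone sketch of that pointer-plus-layout view pattern (types assumed, not the MegDNN ones):

#include <cstddef>

struct LayoutSketch {
    std::size_t shape[3];
    std::ptrdiff_t stride[3];
};

struct TensorSketch {
    void* raw_ptr;        // device memory owned by someone else
    LayoutSketch layout;  // how to interpret it
};

// A tensor view is just a pointer paired with a layout; rebinding the
// layout re-interprets the same memory (e.g. src NCHW as an {N, IC, H*W}
// matmul operand) without any allocation or copy.
TensorSketch view_as(void* device_ptr, const LayoutSketch& l) {
    return {device_ptr, l};
}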
@@ -46,6 +46,7 @@ struct OprTypeFromOprTrait;
}
cb(MATRIX_MUL_FORWARD, MatrixMulForward);
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward);
cb(CONVOLUTION_FORWARD, ConvolutionForward);
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
@@ -66,6 +67,7 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
// clang-format off
#define FOREACH_OPR_TYPE(cb) \
cb(MATRIX_MUL_FORWARD) \
cb(BATCHED_MATRIX_MUL_FORWARD) \
cb(CONVOLUTION_FORWARD) \
cb(CONVOLUTION_BACKWARD_DATA) \
cb(CONVOLUTION_BACKWARD_FILTER) \
@@ -83,6 +85,7 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \
cb(MATRIX_MUL_FORWARD, stmt) \
cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \
cb(CONVOLUTION_FORWARD, stmt) \
cb(CONVOLUTION_BACKWARD_DATA, stmt) \
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \
......
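The three registration tables above are X-macro lists: adding BATCHED_MATRIX_MUL_FORWARD once per list is all it takes for OprTypeFromOprTrait and every FOREACH_OPR_TYPE expansion to see the new sub-opr type. A minimal sketch of the X-macro technique itself (illustrative, not the MegDNN definitions):

#define FOREACH_COLOR(cb) cb(RED) cb(GREEN) cb(BLUE)

// Expanding the same list twice keeps an enum and its printer in sync:
enum Color {
#define ENUM_ENTRY(x) x,
    FOREACH_COLOR(ENUM_ENTRY)
#undef ENUM_ENTRY
};

const char* color_name(Color c) {
    switch (c) {
#define CASE_ENTRY(x) case x: return #x;
        FOREACH_COLOR(CASE_ENTRY)
#undef CASE_ENTRY
    }
    return "?";
}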
@@ -821,7 +821,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_NCHW4) {
{{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {1, 64, 1, 1, 4}, {}, {}});
}
TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_1x1) {
TEST_F(CUDA, CONV_BIAS_FORWARD_BATCHED_MATMUL) {
using namespace conv_bias;
std::vector<TestArg> args = get_args_1x1();
Checker<ConvBiasForward> checker(handle_cuda());
@@ -834,13 +834,15 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_1x1) {
.set_rng(1, &default_rng)
.set_rng(2, &default_rng)
.set_epsilon(1e-3);
checker.set_before_exec_callback(
AlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"BATCHED_MATMUL", {})
.c_str(),
{{"CUBLAS", {}}}}));
for (auto&& arg : args) {
checker.set_param(arg.param);
checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
ConvBias>(
ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
"BATCHEDMATMUL", {})
.c_str()));
checker.execs({arg.src, arg.filter, arg.bias, {}, {}});
}
}
......
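The rewritten test pins the full algorithm tree rather than just the top-level name: the outer entry selects the conv-bias algorithm, and the inner list selects the algorithm of its single sub-opr. A rough sketch of that nesting, with only the two names taken from the test and the structure assumed:

#include <vector>

struct PolicyNameSketch {
    const char* name;                   // algorithm display name
    std::vector<PolicyNameSketch> sub;  // one entry per sub-opr, in order
};

// Outer: the conv-bias BATCHED_MATMUL algorithm; inner: the CUBLAS
// batched-matmul algorithm its sub-opr must use (matches sub_policy[0]
// in the implementation hunks above).
PolicyNameSketch policy{"BATCHED_MATMUL", {{"CUBLAS", {}}}};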