提交 f14e0c17 编写于 作者: M Megvii Engine Team

feat(mgb): add recursive for fastrun and megdnn test

GitOrigin-RevId: 743846f64536cd604a24024378ee93a7e333a50b
上级 85fa9883
......@@ -330,6 +330,8 @@ struct TensorLayout : public TensorShape {
/* =================== properties =================== */
std::string to_string() const;
std::string serialize() const;
#endif // MEGDNN_CC_HOST
/*!
......
......@@ -11,6 +11,7 @@
*/
#pragma once
#include <type_traits>
#include "megdnn/basic_types.h"
#include "megdnn/handle.h"
......@@ -144,8 +145,11 @@ public:
return {{handle_type(), type(), param()}, name(), is_reproducible()};
}
Info::Desc desc() const { return {handle_type(), type(), param()}; }
//! Append the raw byte image of \p val to \p result.
//! Only standard-layout types may be serialized this way, because the
//! bytes are later copied back verbatim by deserialize_read_pod.
template <typename T>
static void serialize_write_pod(const T& val, std::string& result) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    const char* bytes = reinterpret_cast<const char*>(&val);
    result.append(bytes, bytes + sizeof(T));
}
......@@ -155,6 +159,7 @@ public:
template <typename T>
static T deserialize_read_pod(const std::string& data, size_t offset = 0) {
static_assert(std::is_standard_layout<T>::value, "invalid type");
T ret;
//! A pointer to an object or incomplete type may be converted to a
//! pointer to a different object or incomplete type. If the resulting
......@@ -167,10 +172,69 @@ public:
return ret;
}
//! Read back a value of type \p T that starts at byte \p offset of the
//! raw buffer \p data.
//!
//! memcpy is used instead of *reinterpret_cast<const T*>(data + offset)
//! because the source bytes may not satisfy T's alignment requirement,
//! and dereferencing a misaligned pointer is undefined behavior.
template <typename T>
static T deserialize_read_pod(const char* data, size_t offset = 0) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    T ret;
    memcpy(&ret, data + offset, sizeof(T));
    return ret;
}
//! Kind of operator a sub-operator search request refers to
//! (SearchItem::opr_type). The value is serialized into cache keys
//! (see FastRunCache), so entries should keep stable positions.
enum class OprType : uint32_t {
MATRIX_MUL_FORWARD,
BATCHED_MATRIX_MUL_FORWARD,
CONVOLUTION_FORWARD,
CONVOLUTION_BACKWARD_DATA,
CONVOLUTION_BACKWARD_FILTER,
CONVOLUTION3D_FORWARD,
CONVOLUTION3D_BACKWARD_DATA,
CONVOLUTION3D_BACKWARD_FILTER,
LOCAL_SHARE_FORWARD,
LOCAL_SHARE_BACKWARD_DATA,
LOCAL_SHARE_BACKWARD_FILTER,
DEFORMABLE_CONV_FORWARD,
DEFORMABLE_CONV_BACKWARD_DATA,
DEFORMABLE_CONV_BACKWARD_FILTER,
CONVBIAS_FORWARD,
BATCH_CONV_FORWARD,
};
//! Description of one sub-operator whose algorithm must also be
//! searched: which operator kind, its serialized param, and the layouts
//! it operates on.
struct SearchItem {
OprType opr_type;
//! serialized param of the sub-operator (written with
//! serialize_write_pod, read back with deserialize_read_pod)
std::string param;
TensorLayoutArray layouts;
};
/**
 * \brief get the sub-operators of this algo so that their algorithms
 *     can be searched recursively (e.g. by fastrun)
 *
 * \param layouts origin layouts of the parent opr
 * \param opr parent opr
 *
 * \return empty by default, i.e. the algo has no sub-operators
 */
virtual std::vector<SearchItem> get_subopr_list(const TensorLayoutArray&,
const OperatorBase*) const {
return {};
}
protected:
Handle::HandleType m_handle_type = Handle::HandleType::NAIVE;
};
//! policy for executing the operator
struct ExecutionPolicy {
//! INVALID_ALGO_TYPE algo_type means using heuristic
Algorithm::Info::Desc algo;
//! policies of the chosen algo's sub-operators; presumably one entry
//! per item returned by Algorithm::get_subopr_list (callers assert
//! sub_policy.size() matches) — empty when the algo has none
std::vector<ExecutionPolicy> sub_policy;
};
/*!
* \brief define Algorithm and ExecutionPolicy for oprs that have
* multiple impl algos
......@@ -198,12 +262,6 @@ public:
*/
virtual const char* get_algorithm_set_name() const = 0;
//! policy for executing the operator
struct ExecutionPolicy {
//! INVALID_ALGO_TYPE algo_type means using heuristic
AlgorithmInfo algo;
};
ExecutionPolicy& execution_policy() { return m_execution_policy; }
const ExecutionPolicy& execution_policy() const {
......@@ -464,6 +522,9 @@ protected:
bool reproducible = false) = 0;
};
} // namespace detail
using Algorithm = detail::Algorithm;
using ExecutionPolicy = detail::ExecutionPolicy;
} // namespace megdnn
#include "megdnn/internal/visibility_epilogue.h"
......
......@@ -25,17 +25,17 @@ namespace megdnn {
*/
template <class Opr, typename... Args>
typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) {
typename Opr::AlgorithmInfo ret;
typename Opr::AlgorithmDesc ret;
auto set = opr->execution_policy().algo;
if (set.valid()) {
ret = set;
} else {
ret = opr->get_algorithm_info_heuristic(
std::forward<Args>(args)..., std::numeric_limits<size_t>::max(),
false);
false).desc;
}
return static_cast<typename Opr::AlgoBase*>(
opr->get_algorithm_from_desc(ret.desc));
opr->get_algorithm_from_desc(ret));
}
/*!
......@@ -46,7 +46,7 @@ template <class Opr, typename... Args>
typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... args) {
auto set = opr->execution_policy().algo;
if (set.valid()) {
return opr->algo_pack().construct_and_get_algo(set.desc);
return opr->algo_pack().construct_and_get_algo(set);
} else {
return static_cast<typename Opr::AlgoBase*>(
opr->get_algorithm_heuristic(std::forward<Args>(args)...,
......
......@@ -20,6 +20,7 @@
#include <mutex>
#include <numeric>
#include <tuple>
#include <type_traits>
using namespace megdnn;
......@@ -35,6 +36,26 @@ class DefaultErrorHandler final : public ErrorHandler {
#endif
}
};
//! Append the in-memory byte image of \p val to \p result; restricted
//! to standard-layout types so the bytes round-trip through
//! deserialize_pod unchanged.
template <typename T>
void serialize_pod(const T& val, std::string& result) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    const char* begin = reinterpret_cast<const char*>(&val);
    result.append(begin, begin + sizeof(T));
}
//! Append \p size consecutive elements starting at \p val to \p result
//! as raw bytes.
//! Guarded by the same standard-layout requirement as serialize_pod /
//! deserialize_pod, since the bytes are copied back verbatim on
//! deserialization (previously this overload skipped the check).
template <typename T>
void serialize_vec(const T* val, size_t size, std::string& result) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    result.append(reinterpret_cast<const char*>(val), sizeof(T) * size);
}
//! Read a \p T from \p data at \p offset and advance \p offset past it.
//! memcpy avoids undefined behavior from dereferencing a possibly
//! misaligned pointer into the string's buffer.
//! Guarded by the same standard-layout requirement as serialize_pod
//! (previously this reader skipped the check).
template <typename T>
T deserialize_pod(const std::string& data, size_t& offset) {
    static_assert(std::is_standard_layout<T>::value, "invalid type");
    T ret;
    memcpy(&ret, data.data() + offset, sizeof(T));
    offset += sizeof(T);
    return ret;
}
} // namespace
ErrorHandler* ErrorHandler::sm_inst;
......@@ -126,17 +147,23 @@ bool TensorShape::eq_shape(const TensorShape& rhs) const {
size_t eq = 0;
switch (ndim) {
case 7:
eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU
eq += shape[6] == rhs.shape[6];
MEGDNN_FALLTHRU
case 6:
eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU
eq += shape[5] == rhs.shape[5];
MEGDNN_FALLTHRU
case 5:
eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU
eq += shape[4] == rhs.shape[4];
MEGDNN_FALLTHRU
case 4:
eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU
eq += shape[3] == rhs.shape[3];
MEGDNN_FALLTHRU
case 3:
eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU
eq += shape[2] == rhs.shape[2];
MEGDNN_FALLTHRU
case 2:
eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU
eq += shape[1] == rhs.shape[1];
MEGDNN_FALLTHRU
case 1:
eq += shape[0] == rhs.shape[0];
}
......@@ -435,8 +462,8 @@ bool TensorLayout::try_reshape(TensorLayout& result,
for (size_t i = 0; i < tshp.ndim; ++i) {
if (!tshp.shape[i]) {
megdnn_throw_if(!format.is_default(), tensor_reshape_error,
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
is_empty_shape = true;
break;
}
......@@ -510,8 +537,36 @@ std::string TensorLayout::to_string() const {
rst.append(" @ ");
rst.append(format.impl()->to_string());
}
rst.append(std::string(" ") + dtype.name());
rst.append("}");
return rst;
}
/*!
 * \brief serialize the layout into a byte string: ndim, then
 *     shape[0..ndim) and stride[0..ndim) as raw bytes, the format as a
 *     human-readable string, the dtype enum, and — for parameterized
 *     dtypes — the dtype param
 *
 * NOTE(review): the result mixes raw bytes and text, so callers should
 * treat it as an opaque key (it is used as a hash-key component in
 * FastRunCache), not as a parseable stream.
 */
std::string TensorLayout::serialize() const {
std::string rst;
serialize_pod<size_t>(ndim, rst);
serialize_vec<size_t>(shape, ndim, rst);
serialize_vec<ptrdiff_t>(stride, ndim, rst);
rst.append(format.impl()->to_string());
//! serialize dtype
serialize_pod(dtype.enumv(), rst);
if (dtype.has_param()) {
switch (dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<dtype::_dt>::enumv: \
serialize_pod(dtype::_dt::downcast_from(dtype).param(), rst); \
break;
MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb)
#undef cb
default:
megdnn_assert(false,
"cannot serialize unknown parameterized DType");
break;
}
}
return rst;
}
// vim: syntax=cpp.doxygen
......@@ -24,7 +24,7 @@ bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available(
const SizeArgs& args) const {
MatrixMulForwardImpl mm{args.opr->handle()};
mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB};
mm.execution_policy() = {m_algorithm->info()};
mm.execution_policy() = {m_algorithm->desc(), {}};
auto mm_layout_a = args.layout_a.remove_axis(0);
auto mm_layout_b = args.layout_b.remove_axis(0);
......@@ -39,7 +39,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes(
auto mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
mm_opr->param() = {args.opr->param().transposeA,
args.opr->param().transposeB};
mm_opr->execution_policy() = {m_algorithm->info()};
mm_opr->execution_policy() = {m_algorithm->desc(), {}};
return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b,
args.layout_c);
......@@ -50,7 +50,7 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec(
auto&& mm_opr = args.opr->handle()->create_operator<MatrixMulForward>();
mm_opr->param() = {args.opr->param().transposeA,
args.opr->param().transposeB};
mm_opr->execution_policy() = {m_algorithm->info()};
mm_opr->execution_policy() = {m_algorithm->desc(), {}};
rep(n, N) {
TensorND A_, B_, C_;
auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) {
......
......@@ -47,7 +47,7 @@ ConvBiasForwardImpl::AlgoBFloat16::float_args(
change_dtype(fdst);
opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_impl->info()};
opr->execution_policy() = {m_impl->desc(), {}};
return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst);
}
......@@ -110,7 +110,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
auto convbias_opr = args.handle->create_operator<ConvBias>();
convbias_opr->param() = args.opr->param();
convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
convbias_opr->execution_policy() = {m_impl->info()};
convbias_opr->execution_policy() = {m_impl->desc(), {}};
convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor,
fdst_tensor, nullptr, cvter.workspace());
}
......
......@@ -46,12 +46,8 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
megdnn_assert(all_algos_data == all_algos.data());
non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul
size_t algo_size = all_algos.size();
for (size_t i=0; i<algo_size; ++i) {
bfloat16_refhold.emplace_back(new AlgoBFloat16(all_algos[i]));
all_algos.push_back(bfloat16_refhold.back().get());
bfloat16_algos.push_back(bfloat16_refhold.back().get());
}
all_algos.push_back(&bfloat16);
bfloat16_algos.push_back(&bfloat16);
for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo);
......
......@@ -170,28 +170,22 @@ public:
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
AlgoBFloat16(ConvolutionBackwardDataImpl::AlgoBase*);
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
std::vector<SearchItem> get_subopr_list(
const TensorLayoutArray& layouts,
const OperatorBase* opr) const override;
const char* name() const override {
return "CONVOLUTION_BACKWARD_DATD_BFLOAT16";
}
bool is_reproducible() const override { return true; }
private:
std::string m_name;
ConvolutionBackwardDataImpl::AlgoBase* m_algorithm = nullptr;
SizeArgs float_args(const SizeArgs& args, ConvolutionBackwardDataImpl* opr,
TensorLayout& fsrc, TensorLayout& ffilter,
TensorLayout& fdst) const;
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
std::string param() const override {
std::string ret;
serialize_write_pod(m_algorithm, ret);
return ret;
}
};
//! implement group conv by another algo
......@@ -237,7 +231,7 @@ public:
AlgoChanwiseSmall chanwise_small;
std::vector<AlgoGroupConvGeneral> gconv;
std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
std::vector<std::unique_ptr<AlgoBFloat16>> bfloat16_refhold;
AlgoBFloat16 bfloat16;
std::vector<AlgoBase*>
//! all algorithms
......
......@@ -17,33 +17,39 @@ using namespace megdnn;
using namespace cuda;
using namespace convolution;
ConvolutionBackwardDataImpl::AlgoBFloat16::AlgoBFloat16(
ConvolutionBackwardDataImpl::AlgoBase* algorithm)
: m_algorithm(algorithm) {
megdnn_assert_internal(algorithm);
m_name = ssprintf("CONVOLUTION_BACKWARD_DATD_BFLOAT16:%s",
m_algorithm->name());
}
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs
ConvolutionBackwardDataImpl::AlgoBFloat16::float_args(
const SizeArgs& args, ConvolutionBackwardDataImpl* opr,
TensorLayout& ffilter, TensorLayout& fdiff, TensorLayout& fgrad) const {
ffilter = *args.filter_layout;
fdiff = *args.diff_layout;
fgrad = *args.grad_layout;
namespace {
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> sub_opr_config(
const TensorLayoutArray& layouts,
const ConvolutionBackwardDataImpl* opr) {
megdnn_assert(layouts.size() >= 3);
std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret;
ret.first = layouts;
auto change_dtype = [](TensorLayout& layout) {
if (layout.dtype == dtype::BFloat16()) {
layout.dtype = dtype::Float32();
}
};
change_dtype(ffilter);
change_dtype(fdiff);
change_dtype(fgrad);
opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_algorithm->info()};
return SizeArgs(opr, ffilter, fdiff, fgrad);
change_dtype(ret.first[0]);
change_dtype(ret.first[1]);
change_dtype(ret.first[2]);
ret.second = opr->param();
ret.second.compute_mode =
ConvolutionBackwardData::Param::ComputeMode::DEFAULT;
return ret;
}
}
//! The bfloat16 wrapper algo delegates the real computation to one
//! float32 CONVOLUTION_BACKWARD_DATA sub-operator; report that single
//! sub-opr (with bf16 layouts converted to f32 and its param
//! serialized) so its algorithm can be searched recursively.
std::vector<Algorithm::SearchItem>
ConvolutionBackwardDataImpl::AlgoBFloat16::get_subopr_list(
const TensorLayoutArray& layouts, const OperatorBase* opr) const {
auto&& config = sub_opr_config(
layouts, static_cast<const ConvolutionBackwardDataImpl*>(opr));
std::string param_str;
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str,
config.first}};
}
bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available(
......@@ -51,24 +57,30 @@ bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available(
TensorLayout ffilter, fdiff, fgrad;
auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>();
SizeArgs fargs = float_args(
args,
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()),
ffilter, fdiff, fgrad);
auto&& config = sub_opr_config(
{*args.filter_layout, *args.diff_layout, *args.grad_layout},
args.opr);
conv_back_data_opr->param() = config.second;
return args.diff_layout->dtype == args.filter_layout->dtype &&
args.diff_layout->dtype == dtype::BFloat16() &&
m_algorithm->is_available(fargs);
get_algorithm(static_cast<ConvolutionBackwardDataImpl*>(
conv_back_data_opr.get()),
config.first[0], config.first[1], config.first[2]);
}
WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
TensorLayout ffilter, fdiff, fgrad;
auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>();
SizeArgs fargs = float_args(
args,
static_cast<ConvolutionBackwardDataImpl*>(conv_back_data_opr.get()),
ffilter, fdiff, fgrad);
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_back_data_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
auto&& config = sub_opr_config(
{*args.filter_layout, *args.diff_layout, *args.grad_layout},
args.opr);
conv_back_data_opr->param() = config.second;
SmallVector<size_t> sizes;
auto get_workspace = [&sizes](const TensorLayout& src,
const TensorLayout& dst) {
......@@ -76,10 +88,12 @@ WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle(
sizes.push_back(dst.span().dist_byte());
}
};
get_workspace(*args.filter_layout, ffilter);
get_workspace(*args.diff_layout, fdiff);
get_workspace(*args.grad_layout, fgrad);
sizes.push_back(m_algorithm->get_workspace_in_bytes(fargs));
get_workspace(*args.filter_layout, config.first[0]);
get_workspace(*args.diff_layout, config.first[1]);
get_workspace(*args.grad_layout, config.first[2]);
sizes.push_back(conv_back_data_opr->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2]));
return {ptr, std::move(sizes)};
}
......@@ -103,9 +117,13 @@ void ConvolutionBackwardDataImpl::AlgoBFloat16::exec(
{
auto conv_back_data_opr =
args.handle->create_operator<ConvolutionBackwardData>();
if (args.opr->execution_policy().algo.valid()) {
megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
conv_back_data_opr->execution_policy() =
args.opr->execution_policy().sub_policy[0];
}
conv_back_data_opr->param() = args.opr->param();
conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
conv_back_data_opr->execution_policy() = {m_algorithm->info()};
conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor,
cvter.workspace());
}
......
......@@ -42,7 +42,7 @@ ConvolutionBackwardFilterImpl::AlgoBFloat16::float_args(
change_dtype(fgrad);
opr->param() = args.opr->param();
opr->param().compute_mode = Param::ComputeMode::DEFAULT;
opr->execution_policy() = {m_algorithm->info()};
opr->execution_policy() = {m_algorithm->desc(), {}};
return SizeArgs(opr, fsrc, fdiff, fgrad);
}
......@@ -107,7 +107,7 @@ void ConvolutionBackwardFilterImpl::AlgoBFloat16::exec(
conv_back_filter_opr->param() = args.opr->param();
conv_back_filter_opr->param().compute_mode =
Param::ComputeMode::DEFAULT;
conv_back_filter_opr->execution_policy() = {m_algorithm->info()};
conv_back_filter_opr->execution_policy() = {m_algorithm->desc(), {}};
conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor,
cvter.workspace());
}
......
......@@ -69,7 +69,7 @@ ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
conv_param.dilate_h,
conv_param.dilate_w,
conv_param.compute_mode};
ret.convbias_opr->execution_policy() = {this->execution_policy().algo};
ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
return ret;
}
......@@ -102,7 +102,7 @@ ConvolutionForwardImpl::get_algorithm_from_desc(
conv_param.dilate_h,
conv_param.dilate_w,
conv_param.compute_mode};
convbias_opr->execution_policy() = {this->execution_policy().algo};
convbias_opr->execution_policy() = {this->execution_policy().algo, {}};
return static_cast<ConvBiasForwardImpl*>(convbias_opr.get())
->get_algorithm_from_desc(desc);
......@@ -160,7 +160,7 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
_megdnn_tensor_out grad,
_megdnn_workspace workspace) {
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
auto algo = get_algorithm(this, filter.layout, args.filter_meta,
auto algo = get_algorithm(this, filter.layout,
diff.layout, grad.layout);
algo->check_workspace(args, workspace).exec(args);
}
......
......@@ -83,6 +83,17 @@ public:
workspace_limit_in_bytes, reproducible)
->info();
}
//! Convenience wrapper: run the heuristic algorithm search for the
//! given (filter, diff, grad) layouts and return the chosen
//! algorithm's info instead of the algorithm pointer.
AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) {
return get_algorithm_heuristic(filter, diff, grad,
workspace_limit_in_bytes, reproducible)
->info();
}
size_t get_workspace_in_bytes(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad) override;
......
......@@ -82,7 +82,7 @@ void MatrixMulForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const {
args.opr->handle()->create_operator<MatrixMulForward>();
matmul_opr->param() = args.opr->param();
matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT;
matmul_opr->execution_policy() = {m_algorithm->info()};
matmul_opr->execution_policy() = {m_algorithm->desc(), {}};
matmul_opr->exec(a, b, c, ctypecvt.workspace());
}
ctypecvt.comp_to_dst_type(c, args.tensor_c);
......
/**
* \file dnn/src/fallback/conv_bias/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
......@@ -367,7 +366,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
"should be equal");
auto&& fm = check_layout_fwd(src, filter, dst);
auto& conv_fm = reinterpret_cast<ConvolutionImpl::CanonizedFilterMeta&>(fm);
size_t nr_threads = static_cast<naive::HandleImpl*>(handle())
->megcore_dispatcher()
->nr_threads();
......@@ -495,7 +494,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_from_desc(
ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm(
const NCBKernSizeParam& param, size_t workspace_size) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo;
}
if (!m_prev_selected_algo ||
......
......@@ -387,7 +387,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_from_desc(
ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm(
const NCBKernSizeParam& param, size_t workspace_size) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo;
}
if (!m_prev_selected_algo ||
......@@ -783,7 +783,7 @@ ConvolutionBackwardDataImpl::get_algorithm_from_desc(
ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) {
if (auto algo = get_algorithm_from_desc(execution_policy().algo)) {
return algo;
}
if (!m_prev_selected_algo ||
......
......@@ -134,7 +134,7 @@ MatrixMul::Algorithm* MatrixMulImpl::get_algorithm_heuristic(
size_t workspace_limit_in_bytes, bool reproducible) {
auto kern_size_param = make_kern_size_param(A, B, C);
if (auto algo = static_cast<AlgoBase*>(
get_algorithm_from_desc(execution_policy().algo.desc))) {
get_algorithm_from_desc(execution_policy().algo))) {
megdnn_assert(algo->get_workspace(kern_size_param) <
workspace_limit_in_bytes);
auto cur = megdnn::get_reproducible_algo<MatrixMulImpl>(algo,
......
......@@ -382,7 +382,7 @@ float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
for (auto i : algos) {
if (std::regex_match(i.name,
std::regex("(" + algo_base + ")(.*)"))) {
opr->execution_policy().algo = i;
opr->execution_policy().algo = i.desc;
auto used = benchmark.exec(layouts);
min_used = std::min(min_used, used);
printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(),
......
......@@ -242,6 +242,11 @@ public:
return *this;
}
Checker& reset_before_exec_callback() {
m_before_exec_callback = nullptr;
return *this;
}
//! set a tensors constraints function, for the purpose of manipulating
//! tensors when testing.
Checker& set_tensors_constraint(
......@@ -435,6 +440,17 @@ public:
Testcase operator=(const Testcase&) = delete;
};
//! A (regex-style) algorithm name together with the names for the
//! algorithms of its sub-operators; mirrors the recursive structure of
//! ExecutionPolicy and is used by AlgoChecker to build one.
struct ExecutionPolicyAlgoName {
    std::string name;
    std::vector<ExecutionPolicyAlgoName> sub_policy_names;

    //! leaf entry: an algo whose sub-oprs need no explicit names
    ExecutionPolicyAlgoName(const char* algo_name) : name{algo_name} {}

    //! entry whose sub-operator algorithms are selected by \p children
    ExecutionPolicyAlgoName(
            const char* algo_name,
            const std::vector<ExecutionPolicyAlgoName>& children)
            : name{algo_name}, sub_policy_names{children} {}
};
/*!
* \brief a callable to check that given algorithm is used for heuristic
* \param require_algo if its value is true, then requires
......@@ -444,48 +460,76 @@ public:
*/
template <class Opr, typename OprAlgoProxy = OprAlgoProxy<Opr>>
class AlgoChecker {
std::string m_name;
typename Opr::Algorithm* m_algo = nullptr;
bool* m_require_algo;
public:
AlgoChecker(const char* name, bool* require_algo = nullptr)
: m_name{name}, m_require_algo{require_algo} {}
AlgoChecker(typename Opr::Algorithm* algo, bool* require_algo = nullptr)
: m_algo{algo}, m_require_algo{require_algo} {}
AlgoChecker(ExecutionPolicyAlgoName name, bool* require_algo = nullptr)
: m_policy_name{name}, m_require_algo{require_algo} {}
AlgoChecker(ExecutionPolicy policy, bool* require_algo = nullptr)
: m_policy{policy}, m_require_algo{require_algo} {}
static ExecutionPolicy construct_execution_policy_from_name(
const ExecutionPolicyAlgoName& policy_name,
const TensorLayoutArray& layouts, const std::string& param,
Handle* handle) {
ExecutionPolicy ret;
megdnn_assert(layouts.size() == OprTrait<Opr>::arity);
auto opr = handle->create_operator<Opr>();
opr->param() =
Algorithm::deserialize_read_pod<typename Opr::Param>(param);
for (auto algo_info :
AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info(
opr.get(), layouts)) {
if (std::regex_match(
algo_info.name,
std::regex("(" + policy_name.name + ")(.*)"))) {
ret.algo = algo_info.desc;
} else {
continue;
}
Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc);
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(layouts, opr.get());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
ExecutionPolicy policy =
AlgoChecker<_Opr>::construct_execution_policy_from_name(
policy_name.sub_policy_names[_item_idx],
_item.layouts, _item.param, handle);
ret.sub_policy.push_back(policy);
});
return ret;
}
return ret;
}
void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) {
TensorLayoutArray layouts;
for (auto&& val : arr) {
layouts.push_back(val.layout);
}
if (!m_policy_name.name.empty()) {
std::string param_str;
Algorithm::serialize_write_pod(opr->param(), param_str);
m_policy = construct_execution_policy_from_name(
m_policy_name, layouts, param_str, opr->handle());
ASSERT_TRUE(m_policy.algo.valid())
<< "algorithm " << m_policy_name.name << " not found";
}
if (m_require_algo && *m_require_algo) {
auto algo =
OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts);
if (m_name.empty()) {
ASSERT_EQ(m_algo->name(), algo.name.c_str());
} else {
ASSERT_TRUE(std::regex_match(
algo.name.c_str(), std::regex("(" + m_name + ")(.*)")));
}
ASSERT_STREQ(opr->get_algorithm_from_desc(m_policy.algo)->name(),
algo.name.c_str());
} else {
if (m_name.empty()) {
opr->execution_policy().algo = m_algo->info();
return;
} else {
for (auto i :
OprAlgoProxy::get_all_algorithms_info(opr, layouts)) {
if (std::regex_match(i.name,
std::regex("(" + m_name + ")(.*)"))) {
opr->execution_policy().algo = i;
return;
}
}
}
ASSERT_TRUE(false) << "algorithm " << m_name << " not found";
opr->execution_policy() = m_policy;
}
}
private:
ExecutionPolicyAlgoName m_policy_name;
ExecutionPolicy m_policy;
bool* m_require_algo;
};
} // namespace test
......
......@@ -580,7 +580,7 @@ void convolution::test_conv_config_combinations(int k_size,
checker.set_rng(0, &rng).set_rng(1, &rng);
for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) {
used_algos.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker
.set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str()))
.execs({ishp, fshp, {}});
......@@ -599,7 +599,7 @@ void convolution::test_conv_config_combinations(int k_size,
opr->param() = param;
for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) {
used_algos_bwd_data.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker_bwd_data
.set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str()))
.execl({fly, oly, ily});
......@@ -620,7 +620,7 @@ void convolution::test_conv_config_combinations(int k_size,
opr->param() = param;
for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) {
used_algos_bwd_flt.insert(algo.desc);
opr->execution_policy().algo = algo;
opr->execution_policy().algo = algo.desc;
checker_bwd_filter
.set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str()))
.execl({ily, oly, fly});
......
/**
* \file dnn/test/common/fast_run_cache.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "test/common/fast_run_cache.h"
#include "src/common/utils.h"
using namespace megdnn;
using namespace test;
//! Flatten a SearchItem into one byte string used as the cache key:
//! the opr type, each layout's serialization, then the serialized
//! param. The hash is computed separately via init_hash().
FastRunCache::SearchItemStorage::SearchItemStorage(
const Algorithm::SearchItem& item) {
Algorithm::serialize_write_pod(item.opr_type, data_hold);
for (auto&& layout : item.layouts) {
data_hold += layout.serialize();
}
data_hold += item.param;
}
//! Look up the cached algorithm descriptor for \p key.
//! \return the stored Desc, or a default-constructed (invalid) Desc
//!     when the key has not been put into the cache yet.
Algorithm::Info::Desc FastRunCache::get(const Algorithm::SearchItem& key) {
SearchItemStorage key_storage(key);
key_storage.init_hash();
auto iter = m_cache.find(key_storage);
if (iter == m_cache.end()) {
return {};
}
return iter->second;
}
//! Store the search result \p val for \p key.
//! Each key may be inserted only once; re-inserting an existing key is
//! a programming error and trips the assert.
void FastRunCache::put(const Algorithm::SearchItem& key,
const Algorithm::Info::Desc& val) {
SearchItemStorage key_storage(key);
key_storage.init_hash();
megdnn_assert(m_cache.find(key_storage) == m_cache.end());
m_cache[std::move(key_storage)] = val;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/test/common/fast_run_cache.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/hash_ct.h"
#include <unordered_map>
namespace megdnn {
namespace test {
//! In-memory cache mapping a sub-operator search request
//! (Algorithm::SearchItem) to the algorithm descriptor previously
//! chosen for it, so fastrun tests avoid repeating searches.
class FastRunCache {
//! serialized SearchItem plus its precomputed hash, used as map key
struct SearchItemStorage {
std::string data_hold;
size_t hash = 0;
SearchItemStorage(const Algorithm::SearchItem& item);
//! compute and memoize the hash of data_hold; must be called before
//! the storage is used as a map key
SearchItemStorage& init_hash() {
hash = XXHash64CT::hash(data_hold.data(), data_hold.size(),
20201225);
return *this;
}
bool operator==(const SearchItemStorage& rhs) const {
return data_hold == rhs.data_hold;
}
//! hash functor that just returns the memoized hash value
struct Hash {
size_t operator()(const SearchItemStorage& s) const {
return s.hash;
}
};
};
std::unordered_map<SearchItemStorage, Algorithm::Info::Desc,
SearchItemStorage::Hash>
m_cache;
public:
//! \return cached Desc for \p key, or an invalid Desc when absent
Algorithm::Info::Desc get(const Algorithm::SearchItem& key);
//! insert \p val for \p key; asserts the key was not present before
void put(const Algorithm::SearchItem& key,
const Algorithm::Info::Desc& val);
};
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
此差异已折叠。
......@@ -279,7 +279,7 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
benchmarker.set_param(bparam);
if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
}
auto time_in_ms =
benchmarker.execs(
......
......@@ -514,7 +514,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) {
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}});
};
......@@ -538,7 +538,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) {
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{C, 1, 1, FH, FW},
{N, C, IH - FH + 1, IW - FW + 1},
{N, C, IH, IW}});
......@@ -564,7 +564,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) {
auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH,
size_t FW) {
checker.proxy()->target_algo_info.reset();
checker.proxy()->target_execution_policy.algo.reset();
checker.execs({{N, C, IH, IW},
{N, C, IH - FH + 1, IW - FW + 1},
{C, 1, 1, FH, FW}});
......@@ -614,7 +614,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;
bencher.set_param(param)
......@@ -623,10 +623,10 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) {
.set_dtype(2, dtype::Float16())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bencher.set_param(param);
auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
......
......@@ -168,7 +168,7 @@ void benchmark_target_algo(
benchmarker.set_param(param);
if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
}
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
......@@ -327,7 +327,7 @@ void benchmark_target_algo_with_cudnn_tsc(
benchmarker.set_param(param);
if (!algo) {
benchmarker.proxy()->target_algo_info.reset();
benchmarker.proxy()->target_execution_policy.algo.reset();
}
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
......
......@@ -8,6 +8,7 @@
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "megdnn/opr_param_defs.h"
#include "test/cuda/fixture.h"
......@@ -223,14 +224,19 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
.set_epsilon(1e-1)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
checker.
set_rng(0, &rng).
set_rng(1, &rng).
set_epsilon(1e-1).
set_param(arg.param).
exec(TensorLayoutArray{filter, dst, src});
}
checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
{{"MATMUL", {}}}}));
src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_epsilon(1e-1)
.set_param(arg.param)
.exec(TensorLayoutArray{filter, dst, src});
checker.reset_before_exec_callback();
checker.opr()->execution_policy() = {};
}
}
......@@ -382,32 +388,35 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, CONV_FWD_BENCHMARK) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH=1,
size_t SW=1, size_t FH=1, size_t FW=1, size_t PH=0, size_t PW=0, bool fp16io_c32=false) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1,
size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) {
auto benchmarker = Benchmarker<ConvolutionForward>(handle_cuda());
benchmarker.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
ConvolutionForward::Param param;
param.stride_h = SH;
param.stride_w = SW;
param.pad_h = PH;
param.pad_w = PW;
if (fp16io_c32) {
param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32;
param.compute_mode =
ConvolutionForward::Param::ComputeMode::FLOAT32;
}
benchmarker.set_param(param);
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
benchmarker.set_proxy(proxy);
size_t OH = (IH - FH + 2 * PH) / SH + 1;
size_t OW = (IW - FW + 2 * PW) / SW + 1;
auto time = benchmarker.execs({
{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
auto time = benchmarker.execs(
{{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}});
time /= 1000.0 * 10.0;
auto flo = (double) N * OC * IC * OH * OW * FH * FW * 2;
auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2;
auto flops = flo / time / 1e12;
printf("comp_type %s: ", fp16io_c32 ? "32" : "16");
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo/1e9, flops);
printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops);
};
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false);
run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true);
......@@ -415,7 +424,8 @@ TEST_F(CUDA, CONV_FWD_BENCHMARK) {
TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
CUBenchmarker<ConvolutionForward> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{new OprProxy<ConvolutionForward>{true}};
std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
new OprProxy<ConvolutionForward>{true}};
size_t RUNS = 10;
bench.set_proxy(proxy).set_times(RUNS);
......@@ -429,7 +439,7 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst;
......@@ -440,13 +450,13 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) {
}
auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
......@@ -500,7 +510,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst;
......@@ -511,13 +521,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
}
auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
......@@ -554,6 +564,62 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
run(32, 64, 64, 56, 56, 1, 1, 0);
}
//! Benchmark ConvolutionBackwardData in BFloat16 over a set of ResNet-like
//! shapes; the proxy with `true` enables recursive (fastrun-style) algo
//! selection for each shape.
TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);

    //! run one (N, OC, IC, IH, IW, FH, SH, PH) configuration in bf16
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW,
                   size_t FH, size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::BFloat16())
                .set_dtype(1, dtype::BFloat16())
                .set_dtype(2, dtype::BFloat16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = PH;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
        bench.set_param(param);
        //! clear any previously-chosen policy so each shape re-selects
        bench.proxy()->target_execution_policy = {};
        TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()},
                filter{{OC, IC, FH, FH}, dtype::BFloat16()};
        TensorLayout dst;
        {
            //! deduce the forward output layout, which serves as the
            //! diff input of the backward-data pass
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto used = bench.execl({filter, dst, src}) / RUNS;
        float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        //! label fixed: this benchmark measures bf16, not fp32
        printf("time_bf16=%.2fms, flops=%.3fTFLOPS\n", used,
               (flo / (used * 1e9)));
    };
    run(32, 64, 3, 224, 224, 7, 2, 3);
    run(32, 128, 128, 28, 28, 3, 1, 1);
    run(32, 256, 256, 14, 14, 3, 1, 1);
    run(32, 512, 512, 7, 7, 3, 1, 1);
    run(32, 64, 64, 56, 56, 3, 1, 1);
    run(32, 512, 256, 56, 56, 1, 2, 0);
    run(32, 1024, 512, 28, 28, 1, 2, 0);
    run(32, 2048, 1024, 14, 14, 1, 2, 0);
    run(32, 512, 128, 28, 28, 1, 1, 0);
    run(32, 128, 512, 28, 28, 1, 1, 0);
    run(32, 1024, 256, 14, 14, 1, 1, 0);
    run(32, 256, 1024, 14, 14, 1, 1, 0);
    run(32, 2048, 512, 7, 7, 1, 1, 0);
    run(32, 512, 2048, 7, 7, 1, 1, 0);
    run(32, 256, 64, 56, 56, 1, 1, 0);
    run(32, 64, 256, 56, 56, 1, 1, 0);
    run(32, 128, 256, 56, 56, 1, 2, 0);
    run(32, 256, 512, 28, 28, 1, 2, 0);
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
}
TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
......@@ -571,7 +637,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
param.pad_h = param.pad_w = PH;
param.compute_mode = param::Convolution::ComputeMode::DEFAULT;
bench.set_param(param);
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
TensorLayout src{{N, IC, IH, IW}, dtype::Float32()},
filter{{OC, IC, FH, FH}, dtype::Float32()};
TensorLayout dst;
......@@ -582,13 +648,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
}
auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
src.dtype = filter.dtype = dst.dtype = dtype::Float16();
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_dtype(0, dtype::Float16())
.set_dtype(1, dtype::Float16())
.set_dtype(2, dtype::Float16());
auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS;
param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
bench.proxy()->target_algo_info.reset();
bench.proxy()->target_execution_policy.algo.reset();
bench.set_param(param);
auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS;
float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH;
......@@ -630,8 +696,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
#undef V
#undef V1
} // namespace test
} // namespace megdnn
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -778,7 +778,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS;
printf("src=%s, diff=%s, grad=%s, float32: %.2fms "
......@@ -856,7 +856,7 @@ TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
;
......@@ -915,7 +915,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS;
printf("filter=%s, diff=%s, grad=%s, float32: %.2fms "
......@@ -1002,11 +1002,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
......@@ -1094,11 +1094,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
......@@ -1177,11 +1177,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) {
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng);
bencher.proxy()->target_algo_info.reset();
bencher.proxy()->target_execution_policy.algo.reset();
auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS;
bencher_conv.set_param(conv_param);
bencher_conv.proxy()->target_algo_info.reset();
bencher_conv.proxy()->target_execution_policy.algo.reset();
auto time_in_ms_conv =
bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS;
......
......@@ -10,6 +10,7 @@
*/
#include <gtest/gtest.h>
#include "megdnn/basic_types.h"
#include "src/common/utils.h"
#include "test/common/random_state.h"
......@@ -21,9 +22,29 @@ class ResetSeedListener : public ::testing::EmptyTestEventListener {
}
};
megdnn::LogLevel min_log_level;
void set_log_level() {
megdnn::LogLevel level = megdnn::LogLevel::INFO;
auto setting = std::getenv("MEGDNN_LOG_LEVEL");
if (setting) {
if (!strcmp(setting, "INFO")) {
level = megdnn::LogLevel::INFO;
} else if (!strcmp(setting, "DEBUG")) {
level = megdnn::LogLevel::DEBUG;
} else if (!strcmp(setting, "WARN")) {
level = megdnn::LogLevel::WARN;
} else {
megdnn_assert(!strcmp(setting, "ERROR"));
level = megdnn::LogLevel::ERROR;
}
}
min_log_level = level;
}
void log_handler(megdnn::LogLevel level, const char* file, const char* func,
int line, const char* fmt, va_list ap) {
if (level < megdnn::LogLevel::ERROR) {
if (level < min_log_level) {
return;
}
char msg[1024];
......@@ -39,6 +60,7 @@ void log_handler(megdnn::LogLevel level, const char* file, const char* func,
extern "C" int gtest_main(int argc, char** argv) {
::megdnn::set_log_handler(log_handler);
set_log_level();
ResetSeedListener listener;
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners();
::testing::InitGoogleTest(&argc, argv);
......
......@@ -450,6 +450,7 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
}
}
#endif
#endif
} // namespace test
......
......@@ -27,6 +27,7 @@
#include "megbrain/gopt/inference.h"
#include "megbrain/test/helper.h"
#include "megdnn/oprs/base.h"
#include <atomic>
#include <chrono>
......@@ -1924,19 +1925,19 @@ TEST(TestGraph, NaiveRecord2NCHW44) {
namespace {
template <typename DnnOp, typename... Args>
typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
Args&& ...args) {
if (found.valid()) {
if (found.val()) {
return dnn_op->execution_policy().algo;
return dnn_op->execution_policy();
} else {
return {};
}
}
for (auto&& algo : dnn_op->get_all_algorithms_info(
std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo;
dnn_op->execution_policy().algo = algo.desc;
auto layouts = dnn_op->deduce_preprocessed_filter_layout(
std::forward<Args>(args)...);
if (layouts.empty()) continue;
......@@ -1949,7 +1950,7 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
}
if (valid) {
found.emplace(true);
return algo;
return {algo.desc, {}};
}
}
found.emplace(false);
......@@ -1958,19 +1959,19 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo(
}
template <typename DnnOp, typename... Args>
typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo(
typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo(
DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
Args&& ...args) {
if (found.valid()) {
if (found.val()) {
return dnn_op->execution_policy().algo;
return dnn_op->execution_policy();
} else {
return {};
}
}
for (auto&& algo : dnn_op->get_all_algorithms_info(
std::forward<Args>(args)...)) {
dnn_op->execution_policy().algo = algo;
dnn_op->execution_policy().algo = algo.desc;
auto layouts = dnn_op->deduce_preprocessed_filter_layout(
std::forward<Args>(args)...);
if (layouts.size() <= 1)
......@@ -1981,7 +1982,7 @@ typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo(
}
if (valid) {
found.emplace(true);
return algo;
return {algo.desc, {}};
}
}
found.emplace(false);
......
......@@ -14,6 +14,8 @@
#include "../internal/invoke.h"
#include "../internal/megdnn_opr_wrapper.inl"
#include "megdnn/handle.h"
#include "megdnn/oprs/base.h"
#if MGB_ROCM
#include "hcc_detail/hcc_defs_prologue.h"
......@@ -32,12 +34,96 @@ MIDOUT_DECL(megbrain_opr_profile)
} \
MIDOUT_END();
namespace {
//! Serialize an ExecutionPolicy (including its sub-policies, recursively)
//! into a flat byte string. Layout: handle_type, algo type, u32 param
//! length, param bytes, u32 sub-policy count, then each sub-policy.
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
    std::string result;
    //! AlgorithmDesc: handle type, algo type, length-prefixed param blob
    megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, result);
    megdnn::Algorithm::serialize_write_pod(policy.algo.type, result);
    megdnn::Algorithm::serialize_write_pod<uint32_t>(
            static_cast<uint32_t>(policy.algo.param.size()), result);
    result.append(policy.algo.param);
    //! sub-policies: count first, then each serialized in order
    megdnn::Algorithm::serialize_write_pod<uint32_t>(
            static_cast<uint32_t>(policy.sub_policy.size()), result);
    for (const auto& sub : policy.sub_policy) {
        result.append(serialize_policy(sub));
    }
    return result;
}
//! Inverse of serialize_policy: reconstruct an ExecutionPolicy tree from
//! the flat byte buffer produced by serialize_policy, advancing \p offset
//! past everything consumed. Field order must exactly mirror the writer.
//! NOTE(review): \p size is only forwarded to the recursive calls and is
//! never bounds-checked here; the caller validates offset == size after
//! the top-level call — confirm the buffer is always trusted.
megdnn::ExecutionPolicy deserialize_policy(const char* buf, uint32_t size,
                                           uint32_t& offset) {
    megdnn::ExecutionPolicy ret;
//! read one POD field at the current offset and advance past it
#define cb(_val, _type) \
    _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
    offset += sizeof(_val)

    cb(ret.algo.handle_type, megdnn::Handle::HandleType);
    cb(ret.algo.type, uint32_t);

    //! param blob is length-prefixed; empty params write only the length
    uint32_t param_size = 0;
    cb(param_size, uint32_t);
    if (param_size > 0) {
        ret.algo.param = std::string(buf + offset, param_size);
        offset += param_size;
    }
    uint32_t nr_policy = 0;
    cb(nr_policy, uint32_t);
#undef cb
    //! recurse for each serialized sub-policy, in the order written
    for (uint32_t i = 0; i < nr_policy; i++) {
        ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
    }
    return ret;
}
}
namespace mgb {
namespace opr {
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////
//! Pack \p policy into the fixed-size POD blob carried inside
//! TimedProfiler's Param, so the policy can cross the timed-profiler
//! boundary as plain bytes.
template <typename Opr>
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(
        const megdnn::ExecutionPolicy& policy) {
    const std::string bin = serialize_policy(policy);
    //! hard capacity limit; enlarge MAX_SIZE_IN_BYTES if this fires
    mgb_assert(bin.size() < MAX_SIZE_IN_BYTES);
    ExecutionPolicyBlob blob;
    memcpy(blob.data, bin.data(), bin.size());
    blob.size = bin.size();
    return blob;
}
//! Rebuild the ExecutionPolicy tree from this blob's byte buffer.
//! \return the deserialized policy; asserts that the whole blob is
//!         consumed, which guards against format drift or corruption.
template <typename Opr>
megdnn::ExecutionPolicy
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize() const {
    uint32_t offset = 0;
    //! plain value + NRVO replaces the previous
    //! `auto&& ret = ...; return std::move(ret);` anti-idiom
    megdnn::ExecutionPolicy ret = deserialize_policy(data, size, offset);
    mgb_assert(offset == size);
    return ret;
}
#define INST(Opr) \
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \
const megdnn::ExecutionPolicy& policy); \
template megdnn::ExecutionPolicy \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() \
const;
MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST
////////////////// TimedProfiler //////////////////////////////
template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
TimedProfiler<Opr>::init_timeout_setting();
......@@ -99,18 +185,7 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}
megdnn_opr->param() = param.opr_param;
{
typename Opr::AlgorithmInfo algo;
for (auto i :
APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) {
if (!strcmp(i.name.c_str(), param.algo_name)) {
algo = i;
break;
}
}
mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name);
megdnn_opr->execution_policy() = {algo};
}
megdnn_opr->execution_policy() = param.execution_policy.deserialize();
// Allocate preprocessed weight buffers.
TensorLayoutArray preprocessed_layout;
......@@ -222,13 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
});
ev_end->record();
megdnn::Algorithm* algo = megdnn_opr->get_algorithm_from_desc(
megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5;
while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) {
mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
param.algo_name, timer.get_secs(), param.actual_timeout);
algo->name(), timer.get_secs(), param.actual_timeout);
next_report_time = timer.get_secs() + 1;
}
using namespace std::literals;
......
......@@ -46,7 +46,7 @@ private:
static bool check_layout(const TensorLayout& layout, int transpose);
//! store the policy of all transpose situations
megdnn::MatrixMul::ExecutionPolicy m_cadidate_execution_policies[4];
megdnn::ExecutionPolicy m_cadidate_execution_policies[4];
};
/*!
......@@ -76,7 +76,7 @@ private:
static bool check_layout(const TensorLayout& layout, bool transpose);
//! store the policy of all transpose situations
megdnn::BatchedMatrixMul::ExecutionPolicy m_cadidate_execution_policies[4];
megdnn::ExecutionPolicy m_cadidate_execution_policies[4];
};
/*!
......
......@@ -12,9 +12,14 @@
#pragma once
#include <memory>
#include "megbrain/graph/cg.h"
#include "megbrain/graph/operator_node.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/blas.h"
#include "megdnn/oprs/base.h"
template <class MegDNNOpr>
struct MegDNNOpr2MGBOpr;
......@@ -49,52 +54,64 @@ class AlgoChooser {
static constexpr int arity = OprArityTrait<Opr>::arity;
using ImplAlgo = typename Opr::AlgorithmInfo;
using ImplExecutionPolicy = megdnn::ExecutionPolicy;
using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr;
using TensorLayoutArray = std::array<TensorLayout, arity>;
public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class ExeContext {
const TensorLayoutArray& m_layouts;
FixedTensorLayouts m_layouts;
Opr* m_megdnn_opr;
const MGBOpr* m_mgb_opr;
std::string m_param;
const cg::OperatorNodeBase* m_base_mgb_opr;
CompNode m_cn;
megdnn::param::ExecutionPolicy m_execution_policy;
bool m_allow_weight_preprocess;
public:
ExeContext(const TensorLayoutArray& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr, bool allow_weight_preprocess)
: m_layouts{layouts},
m_megdnn_opr{megdnn_opr},
m_mgb_opr{mgb_opr},
m_allow_weight_preprocess{allow_weight_preprocess} {
mgb_assert(m_layouts.size() == layouts.size());
static_assert(
std::tuple_size<TensorLayoutArray>::value == 3 ||
std::tuple_size<TensorLayoutArray>::value == 5 ||
std::tuple_size<TensorLayoutArray>::value == 8,
"Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for "
"deformable conv)");
}
ExeContext(const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str,
const cg::OperatorNodeBase* mgb_opr, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess);
Opr* megdnn_opr() const { return m_megdnn_opr; }
const MGBOpr* mgb_opr() const { return m_mgb_opr; }
const TensorLayout& inp_layout(size_t idx) const {
return m_layouts[idx];
}
const TensorLayoutArray& layouts() const { return m_layouts; }
cg::ComputingGraph* owner_graph() const {
return m_base_mgb_opr->owner_graph();
}
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; }
const megdnn::param::ExecutionPolicy& execution_policy() const {
return m_execution_policy;
}
CompNode comp_node() const { return m_cn; }
const std::string& param() const { return m_param; }
bool allow_weight_preprocess() const {
return m_allow_weight_preprocess;
}
megdnn::Algorithm* get_algorithm_from_desc(
const megdnn::Algorithm::Info::Desc& desc) const {
return m_megdnn_opr->get_algorithm_from_desc(desc);
}
const FixedTensorLayouts& layouts() const { return m_layouts; }
ImplAlgo choose_by_heuristic(bool reproducible = false) const;
ImplExecutionPolicy choose_by_heuristic(
bool reproducible = false) const;
//! get all candidate algos, and the one choose_by_heuristic() is
//! put first
std::vector<ImplAlgo> get_all_candidates() const;
//! get candidate algos with workspace limit.
std::vector<ImplAlgo> get_all_candidates_with_workspace_limit() const;
//! get workspace size required for specific algo
size_t get_workspace_size_bytes(ImplAlgo algo) const;
//! get workspace size required for specific execution policy
size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy) const;
/*!
* \brief profile a single algorithm
......@@ -106,28 +123,59 @@ class AlgoChooser {
* timeout used during profiling
*/
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
ImplAlgo algo, double& timeout) const;
const ImplExecutionPolicy& policy, double& timeout) const;
//! get all profile algorithm from cache, return invalid if not exists
ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;
/**
* \brief construct execution policy from cache.
*
* \param require_reproducible select algo which is reproducible
* \param policy execution policy
*/
void construct_execution_policy_from_cache(
bool require_reproducible, ImplExecutionPolicy& policy) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
};
//! entrance for getting algorithm according to execution strategy
static ImplAlgo get_algo(ExeContext& ctx);
//! get all profile result, either by retrieving cache or profiling
static AlgoChooserProfileCache::Result get_profile_result(
ExeContext& ctx, bool enable_update);
template<typename U>
friend class AlgoChooser;
static ImplAlgo choose_by_profile(ExeContext& ctx,
bool require_reproducible,
bool enable_update = true);
private:
//! entrance for getting algorithm according to execution strategy
static ImplExecutionPolicy get_policy(ExeContext& ctx);
//! profile and save to cache
static void profile(ExeContext& ctx, bool require_reproducible);
static ImplExecutionPolicy choose_by_profile(ExeContext& ctx,
bool require_reproducible,
bool enable_update = true);
/**
* flatten search space in postorder traversal
* The subopr search construct a search tree
*
* A
* / \
* B1B2 C
* / \
* D1D2D3 E
* We use postorder traverse the search tree.
* D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A
*/
static std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
const ExeContext& ctx);
public:
/*!
* \brief setup algorithm and return workspace size
*/
static size_t setup_algo(const TensorLayoutArray& layouts, Opr* megdnn_opr,
static size_t setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const MGBOpr* mgb_opr,
bool allow_weight_preprocess = false);
};
......
......@@ -28,9 +28,9 @@ namespace mixin {
class AlgoChooserHelper : cg::OperatorNodeMixinBase {
public:
using ExecutionPolicy = megdnn::param::ExecutionPolicy;
using AlgorithmInfo = megdnn::detail::Algorithm::Info;
using AlgorithmPolicy = megdnn::ExecutionPolicy;
using AlgoChooserHook =
std::function<AlgorithmInfo(const cg::OperatorNodeBase*)>;
std::function<AlgorithmPolicy(const cg::OperatorNodeBase*)>;
const ExecutionPolicy& execution_policy() const {
if (!m_policy_accessed) {
......
......@@ -18,6 +18,7 @@
#include "megbrain/comp_node.h"
#include "megdnn/basic_types.h"
#include "megdnn/oprs/base.h"
#include "megdnn/oprs/linalg.h"
#include "megdnn/oprs/nn.h"
......@@ -139,7 +140,17 @@ class TimedProfiler {
public:
struct Param {
char algo_name[128];
struct ExecutionPolicyBlob {
//! enlarge the max size if needed
constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
char data[MAX_SIZE_IN_BYTES];
uint32_t size;
static ExecutionPolicyBlob serialize(
const megdnn::ExecutionPolicy& policy);
megdnn::ExecutionPolicy deserialize() const;
};
ExecutionPolicyBlob execution_policy;
size_t workspace;
megdnn::DTypeEnum dtypes[arity];
CompNode::Locator comp_node_loc;
......
......@@ -20,11 +20,13 @@
#include "megbrain/opr/basic_arith.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/tensor_manip.h"
#include "megdnn/dtype.h"
#include "megdnn/oprs/base.h"
#include <gmock/gmock.h>
#include <cmath>
#include <memory>
#include <random>
using namespace mgb;
......@@ -37,6 +39,73 @@ using Mode = Param::Mode;
Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION};
//! Brute-force reference for ConvolutionBackwardData (NCHW, float32):
//! scatters each diff element, scaled by the filter, into the deduced
//! gradient tensor. inps[0] is the diff ("data" here), inps[1] the
//! filter; \p dest is allocated inside and receives the result.
void conv_bwd_data_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
                         std::shared_ptr<HostTensorND>& dest,
                         const opr::ConvolutionBackwardData::Param& param) {
    mgb_assert(param.format == Param::Format::NCHW);
    auto &&data = *inps[0], &&filter = *inps[1];
    size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
    //! filter layout differs between dense (IC, OC, FH, FW) and grouped
    //! (GROUP, ICPG, OCPG, FH, FW) convolution
    size_t GROUP, ICPG, OCPG, FH, FW;
    if (param.sparse == Param::Sparse::DENSE) {
        GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1),
        FH = filter.shape(2), FW = filter.shape(3);
    } else {
        mgb_assert(param.sparse == Param::Sparse::GROUP);
        GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2),
        FH = filter.shape(3), FW = filter.shape(4);
    }
    //! deconv output extent: inverse of the forward conv shape formula
    auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
                      size_t dilate) {
        return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
    };
    size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h),
           OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w);
    dest = std::make_shared<HostTensorND>(CompNode::load("xpu0"),
                                          TensorShape{N, OCPG * GROUP, OH, OW});
    auto&& out = *dest;
    auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
         optr = out.ptr<float>();
    //! output is accumulated into, so it must start zeroed
    memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
    auto ol = out.layout(), fl = filter.layout();

#define FOR2(a, A, b, B) \
    for (size_t a = 0; a < A; ++a) \
        for (size_t b = 0; b < B; ++b)
#define FOR3(a, A, b, B, c, C) \
    FOR2(a, A, b, B) \
    for (size_t c = 0; c < C; ++c)

    //! dptr walks the diff tensor contiguously; the (n, group, icg, ih, iw)
    //! loop order must therefore match the NCHW memory layout of `data`
    FOR3(n, N, group, GROUP, icg, ICPG)
    FOR2(ih, IH, iw, IW) {
        float scale = *(dptr++);
        FOR3(ocg, OCPG, fh, FH, fw, FW) {
            auto oc_tot = group * OCPG + ocg;
            //! signed arithmetic: padding can push oh/ow negative
            int oh = int(ih * param.stride_h + fh * param.dilate_h) -
                             int(param.pad_h),
                ow = int(iw * param.stride_w + fw * param.dilate_w) -
                     int(param.pad_w);
            if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
                ow < static_cast<int>(OW)) {
                auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
                               oh * ol.stride[2] + ow;
                //! filter offset depends on dense vs grouped layout (above)
                size_t flt_off = 0;
                if (param.sparse == Param::Convolution::Sparse::DENSE) {
                    flt_off = icg * fl.stride[0] +
                              ocg * fl.stride[1] + fh * fl.stride[2] + fw;
                } else {
                    flt_off = group * fl.stride[0] + icg * fl.stride[1] +
                              ocg * fl.stride[2] + fh * fl.stride[3] + fw;
                }
                optr[out_off] += scale * fptr[flt_off];
            }
        }
    }
#undef FOR3
#undef FOR2
}
void conv_bwd_flt_brute(const std::vector<std::shared_ptr<HostTensorND>>& inps,
std::shared_ptr<HostTensorND>& out,
const opr::ConvolutionBackwardFilter::Param& param) {
......@@ -370,7 +439,8 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
PersistentCacheHook cache_hook{on_get};
#if MGB_ENABLE_FASTRUN
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
S::PROFILE_HEURISTIC}) {
#else
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif
......@@ -406,6 +476,95 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
}
}
//! Check that ConvolutionBackwardData (as deconv) produces correct results
//! in BFloat16 under every execution-policy strategy, and that profiling
//! strategies actually hit the persistent cache while HEURISTIC does not.
TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    REQUIRE_GPU(1);
    Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1};
    param.compute_mode = Param::ComputeMode::FLOAT32;
    using Policy = opr::Convolution::ExecutionPolicy;
    using S = Policy::Strategy;

    //! fill a bf16 tensor with uniform random values in (-1, 1)
    auto gen_bfp16 = [](HostTensorND& dest) {
        RNGxorshf rng{next_rand_seed()};
        auto rand_real = [&rng]() {
            std::uniform_real_distribution<float> dist(-1, 1);
            return dist(rng);
        };
        auto ptr = dest.ptr<dt_bfloat16>();
        size_t elems = dest.shape().total_nr_elems();
        for (size_t i = 0; i < elems; i++) {
            ptr[i] = dt_bfloat16(rand_real());
        }
    };

    //! elementwise dtype conversion helpers for the reference path
    auto f32_to_bf16 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::BFloat16{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_bfloat16>()[i] = src->ptr<dt_float32>()[i];
        }
        return ret;
    };

    auto bf16_to_f32 = [](const std::shared_ptr<HostTensorND>& src)
            -> std::shared_ptr<HostTensorND> {
        auto ret = std::make_shared<HostTensorND>(
                src->comp_node(), src->shape(), dtype::Float32{});
        for (size_t i = 0; i < src->layout().total_nr_elems(); i++) {
            ret->ptr<dt_float32>()[i] = src->ptr<dt_bfloat16>()[i];
        }
        return ret;
    };

    //! count persistent-cache reads to tell profiling apart from heuristic
    int nr_get = 0;
    auto on_get = [&nr_get](const std::string&, const void*, size_t,
                            const void*, size_t) { ++nr_get; };
    PersistentCacheHook cache_hook{on_get};

#if MGB_ENABLE_FASTRUN
    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
                          S::PROFILE_HEURISTIC}) {
#else
    //! fixed: was `S:HEURISTIC` (single colon), which fails to compile
    //! when MGB_ENABLE_FASTRUN is disabled
    for (auto strategy : {S::HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif
        using Checker = AutoOprChecker<2, 1>;

        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
            Policy policy;
            policy.strategy = strategy;
            return {opr::ConvolutionBackwardData::make_deconv(
                    inputs[0], inputs[1], param, policy)};
        };

        //! reference: brute-force deconv computed in fp32, cast back to bf16
        auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            std::shared_ptr<HostTensorND> out;
            conv_bwd_data_brute(
                    {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out,
                    param);
            dest[0] = *f32_to_bf16(out);
        };

        Checker::RunOptions opt;
        //! bf16 has ~8 bits of mantissa, so tolerance is loose
        opt.outputs_max_err = 1e-3;
        nr_get = 0;
        Checker(make_graph, fwd)
                .disable_grad_check()
                .set_input_dtype(0, dtype::BFloat16{})
                .set_input_dtype(1, dtype::BFloat16{})
                .set_input_generator(0, gen_bfp16)
                .set_input_generator(1, gen_bfp16)
                .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt)
                .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt)
                .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt);
        if (strategy == S::HEURISTIC) {
            //! heuristic selection must never consult the profile cache
            ASSERT_EQ(0, nr_get);
        } else {
            ASSERT_LT(0, nr_get);
        }
    }
}
TEST(TestOprDNN, Deconvolution) {
// dilated grouped deconv
using Checker = AutoOprChecker<2, 1>;
......@@ -420,55 +579,9 @@ TEST(TestOprDNN, Deconvolution) {
};
auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
auto &&data = *inp[0], &&filter = *inp[1];
size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3);
size_t GROUP = filter.shape(0), ICPG = filter.shape(1),
OCPG = filter.shape(2), FH = filter.shape(3),
FW = filter.shape(4);
auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad,
size_t dilate) {
return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2;
};
auto &&out = dest[0];
size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h,
param.dilate_h),
OW = get_shp(IW, FW, param.stride_w, param.pad_w,
param.dilate_w);
out.resize({N, OCPG * GROUP, OH, OW});
auto fptr = filter.ptr<float>(), dptr = data.ptr<float>(),
optr = out.ptr<float>();
memset(optr, 0, sizeof(float) * out.shape().total_nr_elems());
auto ol = out.layout(), fl = filter.layout();
#define FOR2(a, A, b, B) \
for (size_t a = 0; a < A; ++a) \
for (size_t b = 0; b < B; ++b)
#define FOR3(a, A, b, B, c, C) \
FOR2(a, A, b, B) \
for (size_t c = 0; c < C; ++c)
FOR3(n, N, group, GROUP, icg, ICPG)
FOR2(ih, IH, iw, IW) {
float scale = *(dptr++);
FOR3(ocg, OCPG, fh, FH, fw, FW) {
auto oc_tot = group * OCPG + ocg;
int oh = int(ih * param.stride_h + fh * param.dilate_h) -
int(param.pad_h),
ow = int(iw * param.stride_w + fw * param.dilate_w) -
int(param.pad_w);
if (oh >= 0 && ow >= 0 && oh < static_cast<int>(OH) &&
ow < static_cast<int>(OW)) {
auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] +
oh * ol.stride[2] + ow,
flt_off = group * fl.stride[0] + icg * fl.stride[1] +
ocg * fl.stride[2] + fh * fl.stride[3] + fw;
optr[out_off] += scale * fptr[flt_off];
}
}
}
#undef FOR3
#undef FOR2
std::shared_ptr<HostTensorND> out;
conv_bwd_data_brute({inp[0], inp[1]}, out, param);
dest[0] = *out;
};
Checker::RunOptions opt;
......@@ -1547,7 +1660,8 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) {
PersistentCacheHook cache_hook{on_get};
#if MGB_ENABLE_FASTRUN
for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
S::PROFILE_HEURISTIC}) {
#else
for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
#endif
......@@ -2004,29 +2118,34 @@ TEST(TestOprDNN, HeuristicReproducible) {
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>(
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr())
->megdnn_opr())
->execution_policy()
.algo;
->megdnn_opr());
auto&& algo = megdnn_opr->execution_policy().algo;
megdnn::Algorithm* palgo =
megdnn_opr->get_algorithm_from_desc(algo);
mgb_assert(palgo, "Unknown algo description");
if (strategy == S::HEURISTIC_REPRODUCIBLE) {
EXPECT_TRUE(algo.is_reproducible);
EXPECT_TRUE(palgo->is_reproducible());
}
algo_name0 = algo.name.c_str();
algo_name0 = palgo->name();
}
{
Checker checker(make_graph, fwd);
checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt)
.run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt)
.run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt);
auto algo = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr())
->megdnn_opr())
->execution_policy()
.algo;
algo_name1 = algo.name.c_str();
auto&& megdnn_opr = static_cast<megdnn::ConvolutionBackwardFilter*>(
static_cast<opr::ConvolutionBackwardFilter*>(
bwd_flt->owner_opr())
->megdnn_opr());
auto&& algo = megdnn_opr->execution_policy().algo;
megdnn::Algorithm* palgo =
megdnn_opr->get_algorithm_from_desc(algo);
mgb_assert(palgo, "Unknown algo description");
algo_name1 = palgo->name();
}
EXPECT_TRUE(algo_name0 == algo_name1);
}
......@@ -2286,6 +2405,8 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
......@@ -2318,6 +2439,9 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
.WillRepeatedly(Return(filter_layout));
EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));
Expectation algo_call =
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillOnce(Return(&algo));
......@@ -2349,7 +2473,6 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
pf->tensors[0].ptr<float>()[0] = 114.514f;
pf->tensors[1].ptr<float>()[0] = 1926.0817f;
}));
// Run the graph multiple times.
for (int i = 0; i < 3; i++) {
if (i > 0) {
......@@ -2381,6 +2504,8 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
MockAlgorithm algo;
EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_algorithm_from_desc(_))
.WillRepeatedly(Return(&algo));
EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
.WillRepeatedly(Return(0));
EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
......
......@@ -16,157 +16,157 @@ namespace {
const char* EXPECTED_TEXT_OUT_REC[3] = {
// rec level 0
R"OUTPUT(
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s
val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: [2.352, 0.1114, -0.2721] s
[i1]var9: <host value[s]> [2, 3] s
val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: [2] s
[i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s
val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s
[i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s
val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s
val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: [0.05521, 0.724, 1.134] s
[i1]var9: <host value[s]> [2, 3] s
val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: [2] s
[i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s
val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s
[i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] s
val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s
val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: [-0.5069, 0.4525, 0.1695] s
[i1]var9: <host value[s]> [2, 3] s
val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: [2] s
[i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] s
val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s
[i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s
val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s
val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: [-0.03637, 2.111, 0.3236] s
[i1]var9: <host value[s]> [2, 3] s
val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: [2] s
[i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s
val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s
[i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s
val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s
var1 produced: name=var1 layout={5(4),4(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={5(4),4(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s
var17 produced: name=var17 layout={5(4),4(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={5(4),4(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s
val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s
var11 produced: name=var11 layout={1(4),4(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(4),4(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s
var13 produced: name=var13 layout={5(0),4(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={5(0),4(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: [-1.199, -1.02, 1.098, -1.472] s
[i1]var9: <host value[s]> [5, 4] s
val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s
var15 produced: name=var15 layout={5(4),4(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={5(4),4(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: [2] s
[i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s
val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s
var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={5(4),4(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: [0, 0, 3.401, 0, 0, ...] s
[i1]var17: [0, 0, 1.098, 0, 0, ...] s
......@@ -176,33 +176,33 @@ var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18
// rec level 1
R"OUTPUT(
==== begin lazy value recording
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: <see lazy value below> s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: <see lazy value below> s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: <see lazy value below> s
val: <see lazy value below> s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: <see lazy value below> s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: <see lazy value below> s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: <see lazy value below> s
[i1]var9: <host value[s]> [2, 3] s
val: <see lazy value below> s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: <see lazy value below> s
[i1]var13: <see lazy value below> s
val: <see lazy value below> s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: <see lazy value below> s
[i1]var17: <see lazy value below> s
......@@ -242,33 +242,33 @@ var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
// rec level 2
R"OUTPUT(
==== begin lazy value recording
var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2
var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2
deps:
val: <see lazy value below> s
var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0
var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0
deps:
val: <see lazy value below> s
var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16
var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16
deps:
[i0]var1: <see lazy value below> s
val: <see lazy value below> s
var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10
var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10
deps:
[i0]var1: <see lazy value below> s
[i1]var5: <host value[s]> [0] s
[i2]var7: <host value[s]> [1] s
val: <see lazy value below> s
var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12
var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12
deps:
[i0]var11: <see lazy value below> s
[i1]var9: <host value[s]> [2, 3] s
val: <see lazy value below> s
var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14
var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14
deps:
[i0]var3: <see lazy value below> s
[i1]var13: <see lazy value below> s
val: <see lazy value below> s
var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18
var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18
deps:
[i0]var15: <see lazy value below> s
[i1]var17: <see lazy value below> s
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册