MegEngine / MegEngine
Commit 1e90c457
Authored on Nov 02, 2020 by Megvii Engine Team

refactor(mgb/opr): move fastrun out of conv opr

GitOrigin-RevId: d5ef5356f62d46cf0e178652c97d5fd13be40dd8
Parent: 2fc77caf
Showing 7 changed files with 1031 additions and 810 deletions (+1031 −810)
src/opr/impl/dnn/convolution.cpp (+35 −806)
src/opr/impl/search_policy/algo_chooser.cpp (+401 −0)
src/opr/impl/search_policy/profiler.cpp (+259 −0)
src/opr/impl/search_policy/workspace_need_limit_getter.inl (+36 −0)
src/opr/include/megbrain/opr/search_policy/algo_chooser.h (+140 −0)
src/opr/include/megbrain/opr/search_policy/profiler.h (+160 −0)
src/opr/test/basic_arith/elemwise.cpp (+0 −4)
src/opr/impl/dnn/convolution.cpp
(This diff is collapsed.)
src/opr/impl/search_policy/algo_chooser.cpp (new file, 0 → 100644)
/**
 * \file src/opr/impl/search_policy/algo_chooser.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "megbrain/opr/search_policy/algo_chooser.h"
#include "megbrain/opr/search_policy/profiler.h"

#include "../internal/invoke.h"
#include "../internal/megdnn_opr_wrapper.inl"
#include "./workspace_need_limit_getter.inl"

//! TODO: some megdnn::oprs have to be known here when midout.h is generated;
//! fix this if there is a more graceful way.
#include "megdnn/oprs.h"
#include "midout.h"

MIDOUT_DECL(megbrain_opr_algo_chooser)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) {
#define MIDOUT_E \
    }            \
    MIDOUT_END();

using mgb::opr::intl::WorkspaceLimitGetter;

#define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

// timeout delta added to the fastest known algorithm's time when profiling
// new algos
constexpr double TIMEOUT_TOLERANCE = 2;
namespace mgb {
namespace opr {

template <typename Opr>
AlgoChooserProfileCache::Result AlgoChooser<Opr>::get_profile_result(
        ExeContext& ctx, bool enable_update) {
    AlgoChooserProfileCache& cache = ctx.mgb_opr()->profile_cache();

    ConvTensorLayouts origin_layouts = ctx.layouts();
    typename Opr::Param origin_param = ctx.mgb_opr()->param();
    get_origin_param_and_layouts(ctx, origin_layouts, origin_param);
    AlgoChooserProfileCache::Key cache_key{origin_layouts.data(),
                                           origin_layouts.size(),
                                           &origin_param, sizeof(origin_param)};
    {
        auto&& rst = cache.get(cache_key);
        if (rst.valid())
            return rst.val();
    }

    AlgoChooserProfileCache::Result prof_rst;
    if (!enable_update)
        return prof_rst;

    std::string str_on_inp_shape = ssprintf(
            "on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(),
            ctx.layouts()[1].to_string().c_str());
    double cur_timeout = 0;
    RealTimer timer;
    for (auto algo : ctx.get_all_candidates_with_workspace_limit()) {
        Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst;
        std::string msg = ssprintf("profiling %s algorithm %s %s",
                                   ctx.mgb_opr()->dyn_typeinfo()->name,
                                   algo->name(), str_on_inp_shape.c_str());
        timer.reset();
        MGB_TRY { cur_rst = ctx.profile_single_algo(algo, cur_timeout); }
        MGB_CATCH(std::exception& exc, {
            mgb_log_warn("caught exception during %s: %s", msg.c_str(),
                         exc.what());
            continue;
        })
        MGB_CATCH(..., {
            mgb_log_warn("caught exception during %s", msg.c_str());
            continue;
        })
        if (!cur_rst.valid()) {
            mgb_log_warn("timeout when %s; timeout setting: %.3fsec",
                         msg.c_str(), cur_timeout);
            continue;
        }
        if (!cur_timeout) {
            cur_timeout = timer.get_secs() + TIMEOUT_TOLERANCE;
        } else {
            cur_timeout =
                    std::min(cur_timeout, timer.get_secs() + TIMEOUT_TOLERANCE);
        }
        auto&& rst = cur_rst.val();
        mgb_log_debug("%s: workspace: %zu; time: %.3gsec", msg.c_str(),
                      rst.workspace, rst.time);
        prof_rst.push_back(rst);
    }
    mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s",
               str_on_inp_shape.c_str());

    cache.put(cache_key, prof_rst);
    return prof_rst;
}
template <>
void AlgoChooser<megdnn::ConvBias>::get_origin_param_and_layouts(
        const ExeContext& ctx, ConvTensorLayouts& layouts,
        megdnn::ConvBias::Param& param) {
    auto format = static_cast<megdnn::param::ConvBias::Format>(
            ctx.megdnn_opr()->param().format);
    size_t output_block_size = ctx.megdnn_opr()->param().output_block_size;
    megdnn::ConvBias::deduce_winograd_origin_layout_and_param(
            format, output_block_size, ctx.layouts()[0], ctx.layouts()[1],
            layouts[1], param);
}
template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::choose_by_profile(
        ExeContext& ctx, bool require_reproducible, bool enable_update) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile")))
    auto opr = ctx.mgb_opr();
    if (opr->owner_graph()->options().no_profiling_on_shape_change) {
        auto algo = ctx.megdnn_opr()->execution_policy().algorithm;
        if (algo)
            return algo;
    }

    std::unordered_map<std::string, ImplAlgo> algo_map;
    for (auto i : ctx.get_all_candidates()) {
        auto ins = algo_map.emplace(i->name(), i);
        mgb_assert(ins.second, "duplicated algo name: %s", i->name());
    }

    auto&& prof = get_profile_result(ctx, enable_update);
    if (prof.empty())
        return nullptr;
    for (auto&& i : prof) {
        if ((!require_reproducible || i.reproducible)) {
            auto iter = algo_map.find(i.algo);
            mgb_assert(iter != algo_map.end(),
                       "algorithm %s exists in profiling result but not in "
                       "algo_map; please report this bug; opr: %s{%s}, "
                       "shapes: %s %s %s",
                       i.algo.c_str(), ctx.mgb_opr()->cname(),
                       ctx.mgb_opr()->dyn_typeinfo()->name,
                       ctx.layouts()[0].TensorShape::to_string().c_str(),
                       ctx.layouts()[1].TensorShape::to_string().c_str(),
                       ctx.layouts()[2].TensorShape::to_string().c_str());
            return iter->second;
        }
    }

    mgb_log_error(
            "Workspace requirement (%zu) could not be satisfied. Abort now "
            "to avoid further problems",
            WorkspaceLimitGetter::get_workspace_limit(
                    opr->owner_graph(), opr->comp_node(),
                    opr->execution_policy().workspace_limit));
    mgb_trap();
    MIDOUT_E
}
template <typename Opr>
size_t AlgoChooser<Opr>::setup_algo(const ConvTensorLayouts& layouts,
                                    Opr* megdnn_opr, const MGBOpr* mgb_opr,
                                    bool allow_weight_preprocess) {
    if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
        return 0;
    }

    ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess);

    auto algo = get_algo(ctx);
    size_t workspace = ctx.get_workspace_size_bytes(algo);
    mgb_log_debug(
            "%s: tensor layouts(%s %s, %s %s) -> (%s %s): algo=%s "
            "workspace=%.2fMiB reproducible=%d",
            mgb_opr->dyn_typeinfo()->name, layouts[0].to_string().c_str(),
            layouts[0].dtype.name(), layouts[1].to_string().c_str(),
            layouts[1].dtype.name(),
            layouts[layouts.size() - 1].to_string().c_str(),
            layouts[layouts.size() - 1].dtype.name(), algo->name(),
            workspace / (1024 * 1024.0), algo->is_reproducible());
    megdnn_opr->execution_policy() = {algo};
    return workspace;
}

template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo AlgoChooser<Opr>::get_algo(
        ExeContext& ctx) {
    using S = mixin::Convolution::ExecutionPolicy::Strategy;
    MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE);
    switch (ctx.mgb_opr()->execution_policy().strategy) {
        case S::HEURISTIC:
            return ctx.choose_by_heuristic();
        case S::HEURISTIC_REPRODUCIBLE:
            return ctx.choose_by_heuristic(true);
        case S::PROFILE_HEURISTIC: {
            ImplAlgo algo = choose_by_profile(ctx, false, false);
            if (algo == nullptr)
                algo = ctx.choose_by_heuristic();
            return algo;
        }
#if MGB_ENABLE_FASTRUN
        case S::PROFILE:
            return choose_by_profile(ctx, false);
        case S::PROFILE_REPRODUCIBLE:
            return choose_by_profile(ctx, true);
#endif
        default:
            mgb_throw(GraphError, "bad convolution ExecutionPolicy strategy");
    }
}

#define INST(Opr)                                                      \
    template AlgoChooser<megdnn::Opr>::ImplAlgo                        \
    AlgoChooser<megdnn::Opr>::get_algo(ExeContext& ctx);               \
    template AlgoChooserProfileCache::Result                           \
    AlgoChooser<megdnn::Opr>::get_profile_result(ExeContext& ctx,      \
                                                 bool enable_update);  \
    template AlgoChooser<megdnn::Opr>::ImplAlgo                        \
    AlgoChooser<megdnn::Opr>::choose_by_profile(                       \
            ExeContext& ctx, bool require_reproducible,                \
            bool enable_update);                                       \
    template size_t AlgoChooser<megdnn::Opr>::setup_algo(              \
            const ConvTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
            const MGBOpr* mgb_opr, bool allow_weight_preprocess);

MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST
//////////////////////////////// ExeContext /////////////////////////////

template <typename Opr>
typename AlgoChooser<Opr>::ImplAlgo
AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
    auto opr = m_mgb_opr;
    auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
            opr->owner_graph(), opr->comp_node(),
            opr->execution_policy().workspace_limit);
    return APPLY(m_megdnn_opr->get_algorithm_heuristic(
                         args..., workspace_limit, reproducible),
                 m_layouts);
}

template <typename Opr>
std::vector<typename AlgoChooser<Opr>::ImplAlgo>
AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
    auto heu = choose_by_heuristic();
    auto&& ret = APPLY(m_megdnn_opr->get_all_algorithms(args...), m_layouts);
    bool found = false;
    for (size_t i = 0; i < ret.size(); ++i) {
        if (ret[i] == heu) {
            found = true;
            std::swap(ret[i], ret[0]);
            break;
        }
    }
    mgb_assert(found,
               "algo %s got by heuristic not found in candidate list",
               heu->name());
    return std::move(ret);
}
template <typename Opr>
std::vector<typename AlgoChooser<Opr>::ImplAlgo>
AlgoChooser<Opr>::ExeContext::get_all_candidates_with_workspace_limit() const {
    auto&& all_algos = get_all_candidates();
    auto opr = m_mgb_opr;
    auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
            opr->owner_graph(), opr->comp_node(),
            opr->execution_policy().workspace_limit);
    std::vector<ImplAlgo> ret;
    for (auto&& algo : all_algos) {
        if (get_workspace_size_bytes(algo) <= workspace_limit) {
            ret.push_back(algo);
        }
    }
    return ret;
}
template <typename Opr>
size_t AlgoChooser<Opr>::ExeContext::get_workspace_size_bytes(
        ImplAlgo algo) const {
    m_megdnn_opr->execution_policy() = {algo};
    size_t result;
    if_constexpr<opr_supports_preprocess<Opr>()>(
            [&](auto _) {
                auto&& opr = _(m_megdnn_opr);
                auto prep = this->construct_fake_preprocess_filter();
                PreprocessFilter<Opr>* prep_ptr =
                        prep.valid() ? &prep.val() : nullptr;
                result = std::max(
                        APPLY(opr->get_preprocess_workspace_in_bytes(args...),
                              m_layouts),
                        APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
                              m_layouts));
            },
            /* else */
            [&](auto _) {
                result = APPLY(
                        _(m_megdnn_opr)->get_workspace_in_bytes(args...),
                        m_layouts);
            });
    return result;
}
template <typename Opr>
Maybe<AlgoChooserProfileCache::ResultEntry>
AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
                                                  double& timeout) const {
    typename TimedProfiler<Opr>::Param param;
    auto name = algo->name();
    // gcc8 checks that the copied size is <= dest len - 1, so copy safely
    auto len = sizeof(param.algo_name);
    strncpy(param.algo_name, name, len - 1);
    param.algo_name[len - 1] = '\0';
    mgb_assert(!param.algo_name[sizeof(param.algo_name) - 2],
               "algo name too long: %s; len=%zu", name, strlen(name));
    param.workspace = get_workspace_size_bytes(algo);
    for (int i = 0; i < arity; ++i) {
        auto&& src = m_layouts[i];
        mgb_assert(src.format.is_default() &&
                           (src.dtype.category() == DTypeCategory::FLOAT ||
                            src.dtype.category() == DTypeCategory::INT ||
                            src.dtype.category() == DTypeCategory::QUANTIZED),
                   "unsupported layout in profiling: %s",
                   src.to_string().c_str());
        param.dtypes[i] = src.dtype.enumv();
    }
    param.comp_node_loc = m_mgb_opr->output(0)->comp_node().locator();
    mgb_assert(param.shapes.size() == m_layouts.size());
    for (size_t i = 0; i < param.shapes.size(); ++i)
        param.shapes[i] = m_layouts[i];
    param.opr_param = m_megdnn_opr->param();
    param.allow_weight_preprocess = m_allow_weight_preprocess;

    auto rst = TimedProfiler<Opr>::profile(param, timeout);
    // MIOpen conv profiles all available algos when a specific shape is
    // provided for the first time, which probably adds to the result time.
    // Therefore, a second profile execution is needed.
    if (strncmp(name, "MIOpen", 6) == 0)
        rst = TimedProfiler<Opr>::profile(param, timeout);
    if (!rst.valid())
        return None;
    return AlgoChooserProfileCache::ResultEntry{
            algo->name(), algo->is_reproducible(), rst.val().time,
            param.workspace};
}
template <typename Opr>
Maybe<PreprocessFilter<Opr>>
AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
    Maybe<PreprocessFilter<Opr>> result = None;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (!m_allow_weight_preprocess)
            return;
        auto opr = _(m_megdnn_opr);
        auto layout = APPLY(opr->deduce_preprocessed_filter_layout(args...),
                            m_layouts);
        if (layout.empty())
            return;
        result = PreprocessFilter<Opr>{};
        auto& res = result.val();
        res.algorithm_id = nullptr;
        res.tensors.resize(layout.size());
        for (size_t i = 0; i < layout.size(); i++) {
            res.tensors[i] = megdnn::TensorND(nullptr, layout[i]);
        }
    });
    return result;
}
#define INST(Opr)                                                     \
    template typename AlgoChooser<megdnn::Opr>::ImplAlgo              \
    AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic(        \
            bool reproducible) const;                                 \
    template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
    AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const; \
    template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
    AlgoChooser<megdnn::Opr>::ExeContext::                            \
            get_all_candidates_with_workspace_limit() const;          \
    template size_t                                                   \
    AlgoChooser<megdnn::Opr>::ExeContext::get_workspace_size_bytes(   \
            typename AlgoChooser<megdnn::Opr>::ImplAlgo algo) const;  \
    template Maybe<AlgoChooserProfileCache::ResultEntry>              \
    AlgoChooser<megdnn::Opr>::ExeContext::profile_single_algo(        \
            typename AlgoChooser<megdnn::Opr>::ImplAlgo algo,         \
            double& timeout) const;

MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST

}  // namespace opr
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
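Most of the dispatch in the file above goes through the APPLY macro. As a rough sketch (illustrative only, not part of the commit), for an arity-3 operator such as megdnn::ConvolutionForward, a call like APPLY(m_megdnn_opr->get_all_algorithms(args...), m_layouts) boils down to unpacking the layout array into an ordinary argument list:

    // Illustrative expansion, assuming m_layouts is std::array<TensorLayout, 3>
    // (src, filter, dst). mgb::apply forwards each element of the tuple built
    // by std::tuple_cat(m_layouts) as a separate argument, like std::apply.
    auto&& ret = m_megdnn_opr->get_all_algorithms(
            m_layouts[0],   // src layout
            m_layouts[1],   // filter layout
            m_layouts[2]);  // dst layout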
src/opr/impl/search_policy/profiler.cpp (new file, 0 → 100644)
/**
 * \file src/opr/impl/search_policy/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "megbrain/opr/search_policy/profiler.h"

#include "../internal/invoke.h"

//! TODO: some megdnn::oprs have to be known here when midout.h is generated;
//! fix this if there is a more graceful way.
#include "megdnn/oprs.h"
#include "midout.h"

MIDOUT_DECL(megbrain_opr_profile)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
#define MIDOUT_E \
    }            \
    MIDOUT_END();

namespace mgb {
namespace opr {

#define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
        TimedProfiler<Opr>::init_timeout_setting();

template <typename Opr>
double TimedProfiler<Opr>::init_timeout_setting() {
#if MGB_ENABLE_FASTRUN
    sys::TimedFuncInvoker::ins().register_func(
            AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
            &TimedProfiler<Opr>::prof_init_device);
    auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
    if (to_set)
        return std::stod(to_set);
#endif
    return 0;
}
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc);
    auto megdnn_opr = intl::create_megdnn_opr<Opr>(cn);
    std::array<TensorLayout, arity> layouts;

    auto from_enum = [&](DTypeEnum enumv) -> DType {
        switch (enumv) {
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f, static_cast<uint8_t>(0))
            cb(dtype::Quantized8Asymm);
#undef cb
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f)
            cb(dtype::QuantizedS8);
            cb(dtype::QuantizedS16);
            cb(dtype::QuantizedS32);
            default:
                return DType::from_enum(enumv);
#undef cb
        }
    };
    for (int i = 0; i < arity; ++i) {
        layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
    }

    megdnn_opr->param() = param.opr_param;
    {
        typename Opr::Algorithm* algo = nullptr;
        for (auto i :
             APPLY(megdnn_opr->get_all_algorithms(args...), layouts)) {
            if (!strcmp(i->name(), param.algo_name)) {
                algo = i;
                break;
            }
        }
        mgb_assert(algo, "algorithm %s not found", param.algo_name);
        megdnn_opr->execution_policy() = {algo};
    }

    // Allocate preprocessed weight buffers.
    TensorLayoutArray preprocessed_layout;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (param.allow_weight_preprocess) {
            preprocessed_layout = APPLY(
                    _(megdnn_opr)->deduce_preprocessed_filter_layout(args...),
                    layouts);
        }
    });

    {
        // first allocate a whole chunk to avoid memory fragmentation (here we
        // rely on the memory allocator to reuse memory)
        auto align = cn.get_mem_addr_alignment();
        size_t tot_size = align;
        for (int i = 0; i < arity; ++i) {
            tot_size += layouts[i].span().high_byte + align;
        }
        for (const auto& layout : preprocessed_layout) {
            tot_size += layout.span().high_byte + align;
        }
        tot_size += param.workspace;
        DeviceTensorStorage storage{cn};
        storage.ensure_size(tot_size);
    }
    // allocate input and output memory
    std::array<DeviceTensorND, arity_in> inp_val;
    std::array<DeviceTensorND, arity_out> out_val;
    DeviceTensorND workspace;
    for (int i = 0; i < arity_in; ++i) {
        inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
    }
    for (int i = 0; i < arity_out; ++i) {
        out_val[i]
                .comp_node(cn)
                .dtype(layouts[arity_in + i].dtype)
                .resize(layouts[arity_in + i]);
    }
    megdnn::Workspace mdn_workspace;

    // allocate workspace
    if (param.workspace) {
        workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
        mdn_workspace.size = param.workspace;
        mdn_workspace.raw_ptr = workspace.raw_ptr();
    }

    // allocate storage for preprocessed filter
    SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
        flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
                      preprocessed_layout[i].format};
    }

    for (int i = 0; i < arity_in; ++i) {
        fill_zero_dev_tensor(inp_val[i]);
    }

    PreprocessFilter<Opr> prep_flt;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (!preprocessed_layout.empty()) {
            auto&& pf = _(prep_flt);
            pf.algorithm_id = nullptr;
            pf.tensors.resize(flt_val.size());
            for (size_t i = 0; i < flt_val.size(); i++) {
                pf.tensors[i] = flt_val[i].as_megdnn();
            }
            APPLY(_(megdnn_opr)->exec_preprocess(args..., &pf, mdn_workspace),
                  std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
                  array_skip<2>(layouts));
        }
    });

    RealTimer timer;
    auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
         ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
    ev_start->record();
    if_constexpr<opr_supports_preprocess<Opr>()>(
            [&](auto _) {
                auto&& opr = _(megdnn_opr);
                PreprocessFilter<Opr>* pf =
                        preprocessed_layout.empty() ? nullptr : &prep_flt;
                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace),
                      inp_val, out_val);
            },
            /* else */
            [&](auto _) {
                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace),
                      inp_val, out_val);
            });
    ev_end->record();

    double next_report_time = 0.5;
    while (!ev_end->finished()) {
        if (timer.get_secs() >= next_report_time) {
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs"
                    " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                    param.algo_name, timer.get_secs(), param.actual_timeout);
            next_report_time = timer.get_secs() + 1;
        }
        using namespace std::literals;
        std::this_thread::sleep_for(1000us);
    }

    mgb_assert(ev_start->finished());
    return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
    MIDOUT_E
}
template <typename Opr>
Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
        const Param& param, double& timeout) {
    mgb_assert(timeout >= 0);
    if (!timeout) {
        timeout = timeout_setting;
    } else if (timeout_setting) {
        timeout = std::min(timeout, timeout_setting);
    }
    param.actual_timeout =
            timeout ? timeout : std::numeric_limits<double>::infinity();
    auto res = sys::TimedFuncInvoker::ins().invoke(
            AlgoChooserFuncId<Opr>::ID,
            TParam::from_pod(const_cast<Param&>(param)), timeout);
    if (res.valid())
        return res.val().template as_single_pod<Result>();
    return None;
}
template <typename Opr>
void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc);
    // wait for cuda init, so its time does not get counted toward the timeout
    cn.sync();
    MIDOUT_E
}
#define INST(Opr)                                                       \
    template const double TimedProfiler<megdnn::Opr>::timeout_setting;  \
    template double TimedProfiler<megdnn::Opr>::init_timeout_setting(); \
    template typename TimedProfiler<megdnn::Opr>::TResult               \
    TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param);     \
    template Maybe<typename TimedProfiler<megdnn::Opr>::Result>         \
    TimedProfiler<megdnn::Opr>::profile(const Param& param,             \
                                        double& timeout);               \
    template void TimedProfiler<megdnn::Opr>::prof_init_device(         \
            const TParam& raw_param);

MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST

}  // namespace opr
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
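One practical knob from the listing above: init_timeout_setting() reads MGB_CONV_PROFILING_TIMEOUT once per process, so the variable must be set before the first profile() call. A minimal sketch (the helper name and the 5-second value are illustrative, not from the commit):

    #include <cstdlib>

    // Illustrative helper (not part of the commit): cap each single-algorithm
    // profiling run at 5 seconds. Because init_timeout_setting() reads the
    // variable only once, this must run before the first
    // TimedProfiler<Opr>::profile() call in the process.
    static void set_fastrun_profiling_timeout_example() {
        setenv("MGB_CONV_PROFILING_TIMEOUT", "5", /*overwrite=*/1);  // POSIX
    }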
src/opr/impl/search_policy/workspace_need_limit_getter.inl (new file, 0 → 100644)
/**
 * \file src/opr/impl/search_policy/workspace_need_limit_getter.inl
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#pragma once

#include "megbrain/opr/search_policy/profiler.h"

#include "../internal/megdnn_opr_wrapper.inl"

namespace mgb {
namespace opr {
namespace intl {

#define cb(_Opr)                                           \
    template <>                                            \
    struct AutoAddWorkspaceNeedLimitGetter<megdnn::_Opr> { \
        static constexpr bool val = true;                  \
    };

MGB_FOREACH_FASTRUN_OPR(cb)

#undef cb

}  // namespace intl
}  // namespace opr
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
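For reference, a sketch of what MGB_FOREACH_FASTRUN_OPR(cb) stamps out here for the first operator in the list (illustrative expansion; the macro repeats this for every fastrun operator):

    // Expansion of cb(ConvolutionForward) under the #define above:
    template <>
    struct AutoAddWorkspaceNeedLimitGetter<megdnn::ConvolutionForward> {
        static constexpr bool val = true;
    };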
src/opr/include/megbrain/opr/search_policy/algo_chooser.h (new file, 0 → 100644)
/**
 * \file src/opr/include/megbrain/opr/search_policy/algo_chooser.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#pragma once

#include "megbrain/opr/search_policy/profiler.h"

template <class MegDNNOpr>
struct MegDNNOpr2MGBOpr;

#define cb(_Opr)                            \
    template <>                             \
    struct MegDNNOpr2MGBOpr<megdnn::_Opr> { \
        using MGBOpr = mgb::opr::_Opr;      \
    };

MGB_FOREACH_FASTRUN_OPR(cb)

#undef cb

namespace mgb {
namespace opr {

/* =================== AlgoChooser =================== */
/*!
 * \brief choose algorithm according to ExecutionPolicy
 *
 * This class only provides static methods, and the entry point is
 * AlgoChooser::setup_algo. When profiling is needed, it would first try to
 * retrieve profiling stats from the cache, and run TimedProfiler when
 * necessary.
 *
 * \tparam Opr megdnn operator impl
 */
template <typename Opr>
class AlgoChooser {
    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
    static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
    static constexpr int arity = OprArityTrait<Opr>::arity;

    using ImplAlgo = typename Opr::Algorithm*;
    using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr;
    using ConvTensorLayouts = std::array<TensorLayout, arity>;

    class ExeContext {
        const ConvTensorLayouts& m_layouts;
        Opr* m_megdnn_opr;
        const MGBOpr* m_mgb_opr;
        bool m_allow_weight_preprocess;

    public:
        ExeContext(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
                   const MGBOpr* mgb_opr, bool allow_weight_preprocess)
                : m_layouts{layouts},
                  m_megdnn_opr{megdnn_opr},
                  m_mgb_opr{mgb_opr},
                  m_allow_weight_preprocess{allow_weight_preprocess} {
            mgb_assert(m_layouts.size() == layouts.size());
            static_assert(
                    std::tuple_size<ConvTensorLayouts>::value == 3 ||
                            std::tuple_size<ConvTensorLayouts>::value == 5 ||
                            std::tuple_size<ConvTensorLayouts>::value == 8,
                    "Convolution AlgoChooser assumes arity = 3, 5 or 8 (for "
                    "deformable conv)");
        }

        Opr* megdnn_opr() const { return m_megdnn_opr; }

        const MGBOpr* mgb_opr() const { return m_mgb_opr; }

        const TensorLayout& inp_layout(size_t idx) const {
            return m_layouts[idx];
        }

        const ConvTensorLayouts& layouts() const { return m_layouts; }

        ImplAlgo choose_by_heuristic(bool reproducible = false) const;

        //! get all candidate algos; the one chosen by choose_by_heuristic()
        //! is put first
        std::vector<ImplAlgo> get_all_candidates() const;

        //! get candidate algos within the workspace limit
        std::vector<ImplAlgo> get_all_candidates_with_workspace_limit() const;

        //! get workspace size required for a specific algo
        size_t get_workspace_size_bytes(ImplAlgo algo) const;

        /*!
         * \brief profile a single algorithm
         *
         * This is actually a wrapper that constructs param and calls
         * TimedProfiler<Opr>::profile for the actual profiling
         *
         * \param[in,out] timeout set the timeout, and return the actual
         *      timeout used during profiling
         */
        Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
                ImplAlgo algo, double& timeout) const;

    private:
        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
    };

    //! entrance for getting algorithm according to execution strategy
    static ImplAlgo get_algo(ExeContext& ctx);

    static void get_origin_param_and_layouts(const ExeContext&,
                                             ConvTensorLayouts&,
                                             typename Opr::Param&) {}

    //! get all profile results, either by retrieving cache or profiling
    static AlgoChooserProfileCache::Result get_profile_result(
            ExeContext& ctx, bool enable_update);

    static ImplAlgo choose_by_profile(ExeContext& ctx,
                                      bool require_reproducible,
                                      bool enable_update = true);

public:
    /*!
     * \brief setup algorithm and return workspace size
     */
    static size_t setup_algo(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
                             const MGBOpr* mgb_opr,
                             bool allow_weight_preprocess = false);
};

}  // namespace opr
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
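Since this header is now the public entry point, a minimal hypothetical call site may help orient readers; the variable names below (src_layout and friends) are illustrative and not from the commit:

    // Sketch: choosing an algorithm for a plain forward convolution
    // (arity = 3: src, filter, dst).
    using Chooser = mgb::opr::AlgoChooser<megdnn::ConvolutionForward>;
    std::array<mgb::TensorLayout, 3> layouts{src_layout, filter_layout,
                                             dst_layout};
    size_t workspace_in_bytes = Chooser::setup_algo(
            layouts,     // ConvTensorLayouts
            megdnn_opr,  // megdnn::ConvolutionForward*
            mgb_opr,     // const mgb::opr::ConvolutionForward*
                         // (mapped via MegDNNOpr2MGBOpr)
            false);      // allow_weight_preprocess
    // setup_algo writes the chosen algorithm into
    // megdnn_opr->execution_policy(); the caller must allocate
    // workspace_in_bytes of workspace to execute it.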
src/opr/include/megbrain/opr/search_policy/profiler.h (new file, 0 → 100644)
/**
 * \file src/opr/include/megbrain/opr/search_policy/profiler.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#pragma once

#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/utils/hash_ct.h"
#include "megbrain/utils/timer.h"
#include "megdnn/basic_types.h"
#include "megdnn/oprs/nn.h"

namespace mgb {
namespace opr {

#define MGB_FOREACH_FASTRUN_OPR(cb)   \
    cb(ConvolutionForward);           \
    cb(ConvBiasForward);              \
    cb(ConvolutionBackwardData);      \
    cb(ConvolutionBackwardFilter);    \
    cb(Convolution3DForward);         \
    cb(Convolution3DBackwardData);    \
    cb(Convolution3DBackwardFilter);  \
    cb(LocalShareForward);            \
    cb(LocalShareBackwardData);       \
    cb(LocalShareBackwardFilter);     \
    cb(DeformableConvForward);        \
    cb(DeformableConvBackwardFilter); \
    cb(DeformableConvBackwardData);   \
    cb(BatchConvBiasForward);

template <typename Opr>
struct OprArityTrait;

template <typename Opr, int _arity_in, int _arity_out>
struct OprArityTraitTmpl {
    static constexpr int arity_in = _arity_in;
    static constexpr int arity_out = _arity_out;
    static constexpr int arity = arity_in + arity_out;
};

#define INST_ARITY(_Opr, _in, _out) \
    template <>                     \
    struct OprArityTrait<_Opr> : public OprArityTraitTmpl<_Opr, _in, _out> {};

INST_ARITY(megdnn::ConvolutionBackwardData, 2, 1);
INST_ARITY(megdnn::ConvolutionBackwardFilter, 2, 1);
INST_ARITY(megdnn::Convolution3DForward, 2, 1);
INST_ARITY(megdnn::Convolution3DBackwardData, 2, 1);
INST_ARITY(megdnn::Convolution3DBackwardFilter, 2, 1);
INST_ARITY(megdnn::LocalShareForward, 2, 1);
INST_ARITY(megdnn::LocalShareBackwardData, 2, 1);
INST_ARITY(megdnn::LocalShareBackwardFilter, 2, 1);
INST_ARITY(megdnn::Convolution, 2, 1);
INST_ARITY(megdnn::DeformableConvForward, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardFilter, 4, 1);
INST_ARITY(megdnn::BatchConvBiasForward, 4, 1);
INST_ARITY(megdnn::ConvBias, 4, 1);
INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);

#undef INST_ARITY

template <typename Opr>
constexpr bool opr_supports_preprocess() {
    return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
           std::is_same<Opr, megdnn::ConvBias>::value;
}

template <typename Opr, bool has_prep>
struct PreprocessFilterImpl {
    using T = union {};
};

template <typename Opr>
struct PreprocessFilterImpl<Opr, true> {
    using T = typename Opr::PreprocessedFilter;
};

template <typename Opr>
using PreprocessFilter =
        typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;

template <typename Opr>
struct AlgoChooserFuncId {};

#define DEF_FUNC_ID(func)                                                     \
    template <>                                                               \
    struct AlgoChooserFuncId<megdnn::func> {                                  \
        __attribute__(                                                        \
                (unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \
                static_cast<sys::TimedFuncInvoker::FuncId>(                   \
                        MGB_HASH_STR("megdnn::" #func));                      \
    };

MGB_FOREACH_FASTRUN_OPR(DEF_FUNC_ID)

#undef DEF_FUNC_ID

/* =================== TimedProfiler =================== */

/*!
 * \brief profile a megdnn conv opr with a given param
 *
 * This class only provides static methods, and the entry point is
 * TimedProfiler::profile; it runs the profiler in a timed environment via
 * sys::TimedFuncInvoker
 *
 * \tparam Opr megdnn opr impl
 */
template <typename Opr>
class TimedProfiler {
    static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
    static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
    static constexpr int arity = OprArityTrait<Opr>::arity;

    using ConvTensorShapes = std::array<TensorShape, arity>;

public:
    struct Param {
        char algo_name[128];
        size_t workspace;
        DTypeEnum dtypes[arity];
        CompNode::Locator comp_node_loc;
        ConvTensorShapes shapes;
        typename Opr::Param opr_param;
        bool allow_weight_preprocess;

        //! filled by profile()
        mutable double actual_timeout;
    };

    struct Result {
        double time;
    };

    static Maybe<Result> profile(const Param& param, double& timeout);

private:
    using TParam = sys::TimedFuncInvoker::Param;
    using TResult = sys::TimedFuncInvoker::Result;

    static const double timeout_setting;

    static double init_timeout_setting();
    static TResult prof_impl(const TParam& raw_param);
    static void prof_init_device(const TParam& raw_param);
};

}  // namespace opr
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
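To make the macro machinery concrete, here is a sketch of what DEF_FUNC_ID(ConvolutionForward) above expands to (illustrative; MGB_FOREACH_FASTRUN_OPR repeats this for each fastrun operator):

    // Expansion sketch: the compile-time hash of "megdnn::ConvolutionForward"
    // becomes the function id under which prof_impl and prof_init_device are
    // registered with sys::TimedFuncInvoker in profiler.cpp.
    template <>
    struct AlgoChooserFuncId<megdnn::ConvolutionForward> {
        __attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId
                ID = static_cast<sys::TimedFuncInvoker::FuncId>(
                        MGB_HASH_STR("megdnn::ConvolutionForward"));
    };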
src/opr/test/basic_arith/elemwise.cpp
@@ -593,10 +593,6 @@ namespace {
struct enable_for_dtype_impl<dtype::Bool, Trait> {
    static constexpr bool value = Trait::ALLOW_BOOL;
};
template <>
struct enable_for_dtype_impl<dtype::Bool, void> {
    static constexpr bool value = false;
};
}  // namespace

//! whether to enable test for specific dtype and Trait