Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MindSpore
akg
提交
5a35fac5
A
akg
项目概览
MindSpore
/
akg
通知
59
Star
7
Fork
7
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
A
akg
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5a35fac5
编写于
6月 28, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
6月 28, 2020
浏览文件
操作
浏览文件
下载
差异文件
!19 improve MulticoreStrategy in auto tiling and add related comments
Merge pull request !19 from yangsijia/fix-issue-I1L074
上级
3dd905ae
da939f82
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
94 addition
and
45 deletion
+94
-45
src/poly/tiling_analyzer.cc
src/poly/tiling_analyzer.cc
+66
-34
src/poly/tiling_analyzer.h
src/poly/tiling_analyzer.h
+2
-1
src/poly/tiling_strategy_manager.cc
src/poly/tiling_strategy_manager.cc
+25
-9
src/poly/tiling_strategy_manager.h
src/poly/tiling_strategy_manager.h
+1
-1
未找到文件。
src/poly/tiling_analyzer.cc
浏览文件 @
5a35fac5
...
...
@@ -608,26 +608,30 @@ void TileCandidate::DoMemInfer() {
}
}
int
TileCandidate
::
GetMinUbToGmDataAfterAxis
(
TileAxis
*
axis
)
{
// e.g.1
// Input ir:
// for (cc0) <--- axis, dtype = float16
// for (cc1) <--- tile factor 1024, dtype = float16
// GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
// for (cc0) <--- axis
// for (cc2) <--- tile factor 1024, dtype = float32
// GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
// Return:
// 1024 * 2
// e.g.2
// Input ir:
// for (cc0) <--- axis, dtype = float16
// GM_BUF1[cc0] = UB_BUF1[cc0]
// Return:
// 1 * 2
int
min_data_each_core
=
-
1
;
/*
* This function returns current data size moved from local buffer (UB in Davinci)
* to main memory (GM in Davinci) within target axis.
* e.g.1: target is not inner-most axis
* Input ir:
* for (cc0) <--- axis, dtype = float16
* for (cc1) <--- tile factor 1024, dtype = float16
* GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
* for (cc0) <--- axis
* for (cc2) <--- tile factor 1024, dtype = float32
* GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
* Return:
* min(1024 * 2(fp16), 1024 * 4(fp32)) = 1024 * 2
*
* e.g.2: target is inner-most axis
* Input ir:
* for (cc0) <--- axis, dtype = float16
* GM_BUF1[cc0] = UB_BUF1[cc0]
* Return:
* 32(ALIGN_BYTES) / 2(fp16) = 16
*/
int
TileCandidate
::
GetDmaCopySizeWithinAxis
(
TileAxis
*
target_axis
)
{
std
::
stringstream
ss
;
int
min_data_each_core
=
-
1
;
bool
before_this_axis
=
true
;
for
(
const
auto
&
attr
:
analyzer_
->
RootAxis
()
->
attrs
)
{
if
(
attr
.
attr_key
.
find
(
"DMA3"
)
==
std
::
string
::
npos
)
{
...
...
@@ -635,7 +639,7 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
}
int64_t
data_each_core
=
1
;
int
data_bytes
=
-
1
;
bool
record
=
true
;
bool
need_
record
=
true
;
std
::
string
gm_buf_name
=
attr
.
attr_value
;
auto
it
=
analyzer_
->
buf_info_
.
find
(
gm_buf_name
);
if
(
it
==
analyzer_
->
buf_info_
.
end
())
{
...
...
@@ -643,32 +647,28 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
}
auto
gm_buf
=
it
->
second
.
get
();
for
(
auto
&
gm_axis
:
*
(
gm_buf
->
tile_axis
))
{
if
(
gm_axis
->
index
!=
axis
->
index
)
{
record
=
false
;
if
(
gm_axis
->
index
!=
target_axis
->
index
||
gm_axis
->
range_extent
.
as
<
IntImm
>
()
==
nullptr
)
{
need_
record
=
false
;
break
;
}
if
(
gm_axis
==
axis
)
{
if
(
gm_axis
==
target_
axis
)
{
before_this_axis
=
false
;
continue
;
}
if
(
before_this_axis
)
{
continue
;
}
if
(
gm_axis
->
range_extent
.
as
<
IntImm
>
()
==
nullptr
)
{
record
=
false
;
break
;
}
int64_t
l1_val
=
MIN_TILE
;
std
::
tie
(
l1_val
,
std
::
ignore
)
=
GetConstTileVal
(
gm_axis
);
if
(
l1_val
==
TileVarId
::
VAR
)
{
record
=
false
;
need_
record
=
false
;
break
;
}
CHECK_NE
(
l1_val
,
0
)
<<
"Inner axis "
<<
gm_axis
->
dim_axis
<<
" should be tile before axis "
<<
axis
->
dim_axis
;
CHECK_NE
(
l1_val
,
0
)
<<
"Inner axis "
<<
gm_axis
->
dim_axis
<<
" should be tile before axis "
<<
target_axis
->
dim_axis
;
if
(
gm_axis
->
HasAnyAttr
({
"REDUCE_AXIS"
,
"TRANSPOSE"
,
"TRANSFORM"
}))
{
ss
<<
"axis "
<<
gm_axis
->
index
<<
"_"
<<
gm_axis
->
dim_axis
<<
" cannot be flatten. clear data each core."
;
analyzer_
->
logger_
.
AppendLog
(
DO_TILING
,
ss
);
data_each_core
=
1
;
data_bytes
=
1
;
continue
;
...
...
@@ -678,19 +678,51 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
auto
min_bytes
=
static_cast
<
int
>
(
ALIGN_BYTES
/
GetMaxAlignBytes
(
gm_axis
->
data_size
));
data_bytes
=
(
data_bytes
==
-
1
||
min_bytes
<
data_bytes
)
?
min_bytes
:
data_bytes
;
}
if
(
record
&&
(
min_data_each_core
==
-
1
||
data_bytes
*
data_each_core
<
min_data_each_core
))
if
(
need_record
&&
(
min_data_each_core
==
-
1
||
data_bytes
*
data_each_core
<
min_data_each_core
))
{
min_data_each_core
=
data_bytes
*
data_each_core
;
}
}
ss
<<
"[Data within axis "
<<
axis
->
index
<<
"_"
<<
axis
->
dim_axis
<<
"]: "
<<
min_data_each_core
;
ss
<<
"[Data within axis "
<<
target_axis
->
index
<<
"_"
<<
target_
axis
->
dim_axis
<<
"]: "
<<
min_data_each_core
;
analyzer_
->
logger_
.
AppendLog
(
DO_TILING
,
ss
);
return
min_data_each_core
==
-
1
?
static_cast
<
int
>
(
ALIGN_BYTES
/
GetMaxAlignBytes
(
axis
->
data_size
))
return
min_data_each_core
==
-
1
?
static_cast
<
int
>
(
ALIGN_BYTES
/
GetMaxAlignBytes
(
target_
axis
->
data_size
))
:
min_data_each_core
;
}
/*
 * This function returns the minimal tile size of axis that can enable multi-core function.
 * The value is ALIGN_BYTES divided by the DMA copy size reported by GetDmaCopySizeWithinAxis(axis),
 * clamped so that it is never smaller than 1.
 * If inner-most data granularity of DMA from local buffer to main memory is less than align bytes,
 * which is 32 in Davinci Core, it will disable multi-core function.
 */
int TileCandidate::GetMinFactorToEnableMulticore(TileAxis *axis) {
  // Single exit point: query the DMA copy size once and derive the factor from it.
  return std::max(static_cast<int>(ALIGN_BYTES / GetDmaCopySizeWithinAxis(axis)), 1);
}
/*
 * This function returns the minimal tile size of axis that each core can have enough data granularity to process.
 * Minimal data granularity for each core is set to 256 bytes by default and if actual data granularity is less
 * than this value, the candidate tile sizes will be regarded as multi-core inefficient.
 *
 * The granularity is the product of the constant tile values of every axis in tile_axis_ other than `axis`.
 * Axes with a non-constant range extent, or whose tile value is still UNDEFINE / VAR, do not contribute.
 * The result is MIN_MULTICORE_BYTES / granularity, clamped to be at least 1.
 * NOTE(review): the product is built from tile values only and is not scaled by the data type size;
 * confirm that comparing it against a byte threshold is intended.
 */
int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
  const auto min_bytes = static_cast<int64_t>(MIN_MULTICORE_BYTES);
  // Accumulate in 64 bits: an int accumulator can be truncated to 0 by large tile products
  // (e.g. 65536 * 65536), which would make the final division crash.
  int64_t granularity = 1;
  for (auto a : this->tile_axis_) {
    if (a == axis) {
      continue;
    }
    if (!a->range_extent.as<IntImm>()) {
      continue;
    }
    int64_t l1_val = this->GetConstTileVal(a).first;
    if (l1_val == TileVarId::UNDEFINE || l1_val == TileVarId::VAR) {
      continue;
    }
    if (l1_val <= 0) {
      // A non-positive tile value cannot describe a data amount; skipping it keeps the divisor non-zero.
      continue;
    }
    granularity *= l1_val;
    if (granularity >= min_bytes) {
      // The quotient below is already at most 1, so the clamped result is 1; stopping here also
      // keeps the running product from overflowing.
      return 1;
    }
  }
  return std::max(static_cast<int>(min_bytes / granularity), 1);
}
/*
* This function returns the product of the loop extents of all the pending (not yet tiled) axes.
*/
int
TileCandidate
::
GetMaximalPendingBlocks
(
TileAxis
*
excluded_axis
)
{
int64_t
blocks
=
1
;
for
(
auto
axis
:
this
->
tile_axis_
)
{
...
...
src/poly/tiling_analyzer.h
浏览文件 @
5a35fac5
...
...
@@ -380,7 +380,8 @@ class TileCandidate {
static
int
GetCoreNumConf
();
int
GetMinFactorToEnableMulticore
(
TileAxis
*
axis
);
int
GetMaximalPendingBlocks
(
TileAxis
*
excluded_axis
);
int
GetMinUbToGmDataAfterAxis
(
TileAxis
*
axis
);
int
GetDmaCopySizeWithinAxis
(
TileAxis
*
axis
);
int
GetMinFactorForMinDataGranularity
(
TileAxis
*
axis
);
private:
void
DoMemInfer
();
...
...
src/poly/tiling_strategy_manager.cc
浏览文件 @
5a35fac5
...
...
@@ -433,14 +433,13 @@ void GemmStrategy::AddConstraint() {
std
::
pair
<
int
,
int
>
MulticoreStrategy
::
GetProposalRangeForFullMulticore
(
TileAxis
*
multicore_axis
)
{
int
max_core
=
cand_
.
GetCoreNumConf
();
int
used_core
=
1
;
std
::
pair
<
int
,
int
>
proposal_range
=
std
::
make_pair
(
std
::
max
(
static_cast
<
int
>
(
MIN_MULTICORE_BYTES
/
cand_
.
GetMinUbToGmDataAfterAxis
(
multicore_axis
)),
1
),
-
1
);
std
::
pair
<
int
,
int
>
proposal_range
=
std
::
make_pair
(
cand_
.
GetMinFactorForMinDataGranularity
(
multicore_axis
),
-
1
);
auto
this_level_core
=
std
::
max
(
static_cast
<
int
>
(
max_core
/
used_core
),
1
);
std
::
stringstream
ss
;
if
(
multicore_axis
->
range_extent
.
as
<
IntImm
>
()
==
nullptr
)
return
proposal_range
;
auto
shape
=
multicore_axis
->
range_extent
.
as
<
IntImm
>
()
->
value
;
bool
is_last_level
=
false
;
for
(
auto
other_axis
:
cand_
.
GetTileAxis
())
{
for
(
auto
other_axis
:
this
->
cand_
.
GetTileAxis
())
{
if
(
other_axis
==
multicore_axis
)
break
;
if
(
other_axis
->
index
!=
multicore_axis
->
index
||
other_axis
->
HasAttr
(
"REDUCE_AXIS"
))
continue
;
if
(
other_axis
->
range_extent
.
as
<
IntImm
>
()
==
nullptr
)
return
proposal_range
;
...
...
@@ -480,6 +479,7 @@ std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis
logger_
.
AppendLog
(
DO_TILING
,
ss
);
return
proposal_range
;
}
int64_t
MulticoreStrategy
::
AdjustTilingAccordingToMulticoreConstraint
(
TileAxis
*
multicore_axis
,
int64_t
tiling_factor
)
{
CHECK_GT
(
tiling_factor
,
0
)
<<
"tiling factor cant be zero or negative"
;
auto
proposal_range
=
GetProposalRangeForFullMulticore
(
multicore_axis
);
...
...
@@ -488,12 +488,19 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
auto
origin_factor
=
tiling_factor
;
std
::
stringstream
ss
;
if
((
!
multicore_axis
->
mc_sup
)
||
(
multicore_axis
->
HasAttr
(
"REDUCE_AXIS"
))
||
(
tiling_factor
<
cand_
.
GetMinFactorToEnableMulticore
(
multicore_axis
)
||
(
tiling_factor
==
max_factor_for_full_cores
)
||
(
max_factor_for_full_cores
<=
0
)))
{
if
((
!
multicore_axis
->
mc_sup
)
||
(
multicore_axis
->
HasAttr
(
"REDUCE_AXIS"
)
||
(
max_factor_for_full_cores
<=
0
)))
{
logger_
.
AppendLine
(
DO_TILING
,
"This axis is not suitable for multicore, return."
);
return
origin_factor
;
}
if
(
tiling_factor
<
cand_
.
GetMinFactorToEnableMulticore
(
multicore_axis
))
{
logger_
.
AppendLine
(
DO_TILING
,
"Inner-most tile size is smaller than 32 bytes, multicore is disable, return."
);
return
origin_factor
;
}
if
((
tiling_factor
<=
min_factor_for_enough_data
)
||
(
min_factor_for_enough_data
>=
cand_
.
GetCoreNumConf
()
*
max_factor_for_full_cores
))
{
logger_
.
AppendLine
(
DO_TILING
,
"Cannot increase degree of parallelism by adjusting current tiling factor, return."
);
return
origin_factor
;
}
auto
CheckConstConstraint
=
[
this
,
&
ss
](
Expr
constraint
)
{
if
(
constraint
.
as
<
IntImm
>
()
==
nullptr
)
{
...
...
@@ -505,18 +512,27 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
CheckConstConstraint
(
multicore_axis
->
l1_constraints
.
tile_min_
);
CheckConstConstraint
(
multicore_axis
->
l1_constraints
.
tile_mod_
);
auto
pending_blocks
=
cand_
.
GetMaximalPendingBlocks
(
multicore_axis
);
if
(
tiling_factor
<
max_factor_for_full_cores
)
{
auto
end
=
static_cast
<
int
>
(
sqrt
(
max_factor_for_full_cores
));
while
(
max_factor_for_full_cores
%
tiling_factor
!=
0
&&
tiling_factor
>
end
)
--
tiling_factor
;
}
else
{
while
(
max_factor_for_full_cores
%
tiling_factor
!=
0
&&
tiling_factor
>
end
)
{
--
tiling_factor
;
}
}
else
if
(
max_factor_for_full_cores
>=
min_factor_for_enough_data
)
{
tiling_factor
=
max_factor_for_full_cores
;
}
else
if
(
max_factor_for_full_cores
<
min_factor_for_enough_data
)
{
// In this case, simply adjusting tiling factor to max_factor_for_full_cores may lead to insufficient data
// in each core, while adjusting tiling factor to min_factor_for_enough_data may lead to fewer parallel cores.
// Since pending blocks can compensate for the data in each core, we make the decision based on its value.
tiling_factor
=
pending_blocks
>=
static_cast
<
int
>
(
min_factor_for_enough_data
/
max_factor_for_full_cores
)
?
max_factor_for_full_cores
:
min_factor_for_enough_data
;
}
auto
shape
=
multicore_axis
->
range_extent
.
as
<
IntImm
>
()
->
value
;
bool
efficient
=
(
shape
%
tiling_factor
==
0
)
>=
(
shape
%
origin_factor
==
0
);
auto
multicore_shrink_limit
=
2
;
auto
reduced_mem
=
std
::
max
(
origin_factor
-
tiling_factor
,
min_factor_for_enough_data
-
tiling_factor
);
auto
pending_blocks
=
cand_
.
GetMaximalPendingBlocks
(
multicore_axis
);
if
((
static_cast
<
int
>
(
origin_factor
/
tiling_factor
)
>=
multicore_shrink_limit
)
&&
reduced_mem
>
pending_blocks
)
{
ss
<<
"If axis adjust to "
<<
tiling_factor
<<
", "
<<
reduced_mem
<<
" memory is reduced;"
<<
" while maximal pending blocks is only "
<<
pending_blocks
<<
", adjust may not be efficient."
;
...
...
src/poly/tiling_strategy_manager.h
浏览文件 @
5a35fac5
...
...
@@ -192,12 +192,12 @@ class MulticoreStrategy {
MulticoreStrategy
(
TileCandidate
&
cand
,
const
std
::
string
log_file
)
:
cand_
(
cand
),
logger_
(
TileLogger
::
GetInstance
(
log_file
))
{}
~
MulticoreStrategy
()
{}
std
::
pair
<
int
,
int
>
GetProposalRangeForFullMulticore
(
TileAxis
*
axis
);
int64_t
AdjustTilingAccordingToMulticoreConstraint
(
TileAxis
*
axis
,
int64_t
tiling_factor
);
private:
TileCandidate
&
cand_
;
TileLogger
&
logger_
;
std
::
pair
<
int
,
int
>
GetProposalRangeForFullMulticore
(
TileAxis
*
axis
);
};
}
// namespace poly
}
// namespace ir
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录