Commit 9516108a (unverified), authored Oct 28, 2021 by Aurelius84, committed by GitHub on Oct 28, 2021.
Modify Struct into Class to improve encapsulation and Polish code exception (#36797)

* Refactor InterpreterCore code
* make tuple
Parent: a7d8837b
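The core of this commit is turning plain structs with public members into classes with narrow accessors. A minimal sketch of that pattern in isolation (simplified, hypothetical field names — not the actual Paddle declarations):

#include <cstddef>
#include <vector>

// Before: every field is public, so callers reach into the struct directly.
struct InstructionBefore {
  std::vector<size_t> gc_check_var_list;  // mutated from anywhere
};

// After: state is private and exposed through a small, checked interface,
// so invariants can be enforced in one place.
class InstructionAfter {
 public:
  void AddGCCheckVar(size_t id) { gc_check_var_list_.push_back(id); }
  const std::vector<size_t>& GCCheckVars() const { return gc_check_var_list_; }

 private:
  std::vector<size_t> gc_check_var_list_;
};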
Showing 9 changed files with 435 additions and 308 deletions (+435 −308).
paddle/fluid/framework/new_executor/event_manager.cc         +6    −6
paddle/fluid/framework/new_executor/interpretercore.cc       +97   −125
paddle/fluid/framework/new_executor/interpretercore.h        +4    −5
paddle/fluid/framework/new_executor/interpretercore_util.cc  +97   −107
paddle/fluid/framework/new_executor/interpretercore_util.h   +0    −1
paddle/fluid/framework/new_executor/new_executor_defs.h      +210  −26
paddle/fluid/framework/new_executor/standalone_executor.cc   +5    −20
paddle/fluid/framework/new_executor/stream_analyzer.cc       +15   −16
paddle/fluid/framework/new_executor/stream_analyzer.h        +1    −2
paddle/fluid/framework/new_executor/event_manager.cc

@@ -22,13 +22,13 @@ void EventManager::WaitEvent(const Instruction& instruction,
   // If InterpreterCore in on CPUPlace, do nothing.
   if (platform::is_cpu_place(place)) return;

-  VLOG(3) << "Deal StreamWaitEventOrSync for "
-          << instruction.kernel_func_.operator_base_->Type();
+  VLOG(3) << "Deal StreamWaitEventOrSync for " << instruction.OpBase()->Type();

-  for (auto& event_iter : instruction.intput_events_) {
+  for (auto& event_iter : instruction.InputEvents()) {
     VLOG(3) << "wait var_id: " << event_iter.var_id_
             << " 's event with waiter_type: " << event_iter.waiter_type_;
-    event_iter.event_->Wait(event_iter.waiter_type_, instruction.dev_ctx_);
+    event_iter.event_->Wait(event_iter.waiter_type_,
+                            &instruction.DeviceContext());
   }
 }

@@ -37,9 +37,9 @@ void EventManager::RecordEvent(const Instruction& instruction,
   // If InterpreterCore in on CPUPlace, do nothing.
   if (platform::is_cpu_place(place)) return;

-  for (auto& event : instruction.output_events_) {
+  for (auto& event : instruction.OutputEvents()) {
     VLOG(3) << "Record event in out_var_id: " << event.var_id_;
-    event.event_->Record(instruction.dev_ctx_);
+    event.event_->Record(&instruction.DeviceContext());
   }
 }
paddle/fluid/framework/new_executor/interpretercore.cc

@@ -79,11 +79,9 @@ paddle::framework::FetchList InterpreterCore::Run(
     const std::vector<framework::Tensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
-      auto it = global_scope_->name2id.find(feed_names_[i]);
-      assert(it != global_scope_->name2id.end());
+      auto* feed_var = global_scope_->Var(feed_names_[i]);

-      auto feed_tensor = global_scope_->var_list[it->second]
-                             ->GetMutable<framework::LoDTensor>();
+      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
     }
   };

@@ -93,7 +91,7 @@ paddle::framework::FetchList InterpreterCore::Run(
                                                      global_scope_);
     FeedInput();
     paddle::framework::interpretercore::build_op_func_list(
-        place_, main_program_, &op_list_, &vec_func_list_, global_scope_);
+        place_, main_program_, &vec_func_list_, global_scope_);
     is_build_ = true;
     // convert vec func_list to graph
     Convert();
@@ -103,42 +101,39 @@ paddle::framework::FetchList InterpreterCore::Run(
   }

   // return Fetch Tensors
-  return *(global_scope_->var_list[global_scope_->name2id["fetch_vars"]]
-               ->GetMutable<framework::FetchList>());
+  auto* fetch_var = global_scope_->Var("fetch_vars");
+  return *(fetch_var->GetMutable<framework::FetchList>());
 }

 void InterpreterCore::Convert() {
-  input_var2op_info_.resize(global_scope_->var_list.size());
-  vec_instruction_.reserve(vec_func_list_.size());
-  dependecy_count_.resize(vec_func_list_.size());
-  vec_meta_info_.resize(global_scope_->var_list.size());
-  for (size_t i = 0; i < vec_func_list_.size(); ++i) {
-    Instruction temp_inst;
-    auto* op_base = op_list_[i];
-    temp_inst.dev_ctx_ =
-        stream_analyzer_.ParseDeviceContext(vec_func_list_[i], *op_base);
-    temp_inst.kernel_func_.compute_func_ = vec_func_list_[i].kernel_func_;
-    temp_inst.kernel_func_.operator_base_ = op_base;
-    temp_inst.input_index_ = vec_func_list_[i].input_index;
-    temp_inst.output_index_ = vec_func_list_[i].output_index;
-    temp_inst.type_ = vec_func_list_[i].type_;
-    temp_inst.no_data_transform_index_ =
-        vec_func_list_[i].no_data_transform_index;
+  auto var_nums = global_scope_->VarSize();
+  input_var2op_info_.resize(var_nums);
+  vec_meta_info_.resize(var_nums);

-    OpInOutInfo info;
+  auto op_nums = vec_func_list_.size();
+  vec_instruction_.reserve(op_nums);
+  dependecy_count_.resize(op_nums);

+  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
+    auto& op_func_node = vec_func_list_[op_idx];
+    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
+
+    vec_instruction_.emplace_back(op_idx, op_func_node, *dev_ctx_);
+    auto& instr = vec_instruction_.back();
+
+    OpInOutInfo info;
     std::vector<size_t> gc_check_input_list;
-    for (auto& item : vec_func_list_[i].input_index) {
+    for (auto& item : op_func_node.input_index) {
       for (auto id : item.second) {
-        input_var2op_info_[id].push_back(i);
+        input_var2op_info_.at(id).push_back(op_idx);
         // var can be gc-ed
         if (!info.IsBuilt()) {
-          info.Build(op_list_[i]);
+          info.Build(op_func_node.operator_base_);
         }
-        if (global_scope_->vec_meta_info_[id].vardesc_) {
-          if (info.IsInArgBufferNeeded(
-                  global_scope_->vec_meta_info_[id].vardesc_->Name())) {
+        auto* var_desc = global_scope_->VarDesc(id);
+        if (var_desc) {
+          if (info.IsInArgBufferNeeded(var_desc->Name())) {
             gc_check_input_list.push_back(id);
           }
         } else {

@@ -150,22 +145,20 @@ void InterpreterCore::Convert() {
     auto last = std::unique(gc_check_input_list.begin(),
                             gc_check_input_list.end());
     gc_check_input_list.erase(last, gc_check_input_list.end());
     for (auto var_id : gc_check_input_list) {
       vec_meta_info_[var_id].var_ref_count_++;
+      instr.AddGCCheckVar(var_id);
     }
-    temp_inst.gc_check_var_list.swap(gc_check_input_list);
-
-    vec_instruction_.push_back(temp_inst);
   }

   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     // checkout ouput
-    for (auto& item : vec_instruction_[i].output_index_) {
+    for (auto& item : vec_instruction_[i].Outputs()) {
       for (auto id : item.second) {
-        if (input_var2op_info_[id].size() == 0) {
+        if (input_var2op_info_.at(id).size() == 0) {
           // output var not be used by any kernel
-          vec_instruction_[i].gc_check_var_list.push_back(id);
+          vec_instruction_[i].AddGCCheckVar(id);
           vec_meta_info_[id].var_ref_count_++;
         }
       }

@@ -174,7 +167,7 @@ void InterpreterCore::Convert() {
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     std::vector<size_t> vec_temp;
-    for (auto& item : vec_instruction_[i].output_index_) {
+    for (auto& item : vec_instruction_[i].Outputs()) {
       for (auto id : item.second) {
         vec_temp =
             interpretercore::merge_vector(vec_temp, input_var2op_info_[id]);

@@ -205,7 +198,7 @@ void InterpreterCore::Convert() {
   BuildSkipShareLoDInfo();

   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    gc_event_.emplace_back(vec_instruction_[i].execution_ctx_.get()->GetPlace(),
+    gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(),
                            platform::GenerateDeviceEventFlag());
   }
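In the rewritten Convert(), an Instruction no longer copies every field out of its OpFuncNode; it records its index and keeps a reference to the node, built in place with emplace_back. A minimal standalone sketch of that construction pattern (simplified types, hypothetical names — not the actual Paddle code):

#include <cstddef>
#include <vector>

struct OpFuncNode { int type_ = 0; };  // stand-in for the real node

class Instruction {
 public:
  // Records an index and a reference instead of copying every field,
  // as the old struct-based code did.
  Instruction(size_t id, const OpFuncNode& node) : id_(id), node_(node) {}
  size_t Id() const { return id_; }

 private:
  size_t id_;
  const OpFuncNode& node_;  // not owned; must outlive the instruction
};

int main() {
  std::vector<OpFuncNode> func_list(3);
  std::vector<Instruction> instructions;
  instructions.reserve(func_list.size());  // reserve so references stay valid
  for (size_t i = 0; i < func_list.size(); ++i) {
    instructions.emplace_back(i, func_list[i]);  // built in place
  }
  return 0;
}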
@@ -215,15 +208,14 @@ void InterpreterCore::Convert() {
 }

 bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
-  if (!global_scope_->vec_meta_info_[var_index].vardesc_) {
-    return input_var2op_info_[var_index].size() == 1;
+  if (!global_scope_->VarDesc(var_index)) {
+    return input_var2op_info_.at(var_index).size() == 1;
   } else {
     int is_input_cnt = 0;
-    for (auto inst_id : input_var2op_info_[var_index]) {
+    for (auto inst_id : input_var2op_info_.at(var_index)) {
       OpInOutInfo info;
-      info.Build(vec_instruction_[inst_id].kernel_func_.operator_base_);
-      if (info.IsInArgBufferNeeded(
-              global_scope_->vec_meta_info_[var_index].vardesc_->Name())) {
+      info.Build(vec_instruction_.at(inst_id).OpBase());
+      if (info.IsInArgBufferNeeded(
+              global_scope_->VarDesc(var_index)->Name())) {
         is_input_cnt++;
       }
@@ -233,35 +225,31 @@ bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
 void InterpreterCore::BuildInplace() {
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    if (!vec_instruction_[i]
-             .kernel_func_.operator_base_->Info()
-             .infer_inplace_) {
+    auto& instr = vec_instruction_[i];
+    auto* op_base = instr.OpBase();
+    if (!op_base->Info().infer_inplace_) {
       continue;
     }

-    auto in_to_outs =
-        vec_instruction_[i].kernel_func_.operator_base_->Info().infer_inplace_(
-            platform::is_gpu_place(vec_instruction_[i].dev_ctx_->GetPlace()));
+    auto in_to_outs = op_base->Info().infer_inplace_(
+        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

+    auto& inputs = instr.Inputs();
+    auto& outputs = instr.Outputs();
     for (auto& pair : in_to_outs) {
-      auto iter = vec_instruction_[i].input_index_.find(pair.first);
-      if (iter != vec_instruction_[i].input_index_.end()) {
+      auto iter = inputs.find(pair.first);
+      if (iter != inputs.end()) {
         if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
-          auto iterout = vec_instruction_[i].output_index_.find(pair.second);
-          if (iterout != vec_instruction_[i].output_index_.end()) {
-            auto invar = global_scope_->var_list[iter->second[0]];
-            auto outvar = global_scope_->var_list[iterout->second[0]];
+          auto iterout = outputs.find(pair.second);
+          if (iterout != outputs.end()) {
+            auto invar = global_scope_->Var(iter->second[0]);
+            auto outvar = global_scope_->Var(iterout->second[0]);
             if (invar && outvar) {
-              vec_instruction_[i].vec_inplace_in_to_out_.emplace_back(invar,
-                                                                      outvar);
-              VLOG(3) << "inplace "
-                      << vec_instruction_[i].kernel_func_.operator_base_->Type()
-                      << " "
-                      << global_scope_->vec_meta_info_[iter->second[0]]
-                             .vardesc_->Name()
-                      << " -> "
-                      << global_scope_->vec_meta_info_[iterout->second[0]]
-                             .vardesc_->Name()
+              instr.AddInplace(invar, outvar);
+              VLOG(3) << "inplace " << op_base->Type() << " "
+                      << global_scope_->VarDesc(iter->second[0])->Name()
+                      << " -> "
+                      << global_scope_->VarDesc(iterout->second[0])->Name()
                       << std::endl;
             }
           }
@@ -274,48 +262,35 @@ void InterpreterCore::BuildInplace() {
 void InterpreterCore::BuildAndCacheInstructionCtx(
     Instruction* instr_node, const VariableScope& var_scope,
     const platform::Place& place) {
-  auto op_base = instr_node->kernel_func_.operator_base_;
-
   VariableValueMap ins_map;
-  for (auto& var_name_item : instr_node->input_index_) {
+  for (auto& var_name_item : instr_node->Inputs()) {
     std::vector<Variable*> input_vars;

     input_vars.reserve(var_name_item.second.size());
     for (auto& id : var_name_item.second) {
-      input_vars.emplace_back(var_scope.var_list[id]);
+      input_vars.emplace_back(var_scope.Var(id));
     }
     ins_map.emplace(var_name_item.first, std::move(input_vars));
   }

   VariableValueMap outs_map;
-  for (auto& var_name_item : instr_node->output_index_) {
+  for (auto& var_name_item : instr_node->Outputs()) {
     std::vector<Variable*> out_vars;

     out_vars.reserve(var_name_item.second.size());
     for (auto& id : var_name_item.second) {
-      out_vars.emplace_back(var_scope.var_list[id]);
+      out_vars.emplace_back(var_scope.Var(id));
     }
     outs_map.emplace(var_name_item.first, std::move(out_vars));
   }

-  instr_node->runtime_ctx_.reset(new RuntimeContext({}, {}));
-  instr_node->runtime_ctx_->inputs.swap(ins_map);
-  instr_node->runtime_ctx_->outputs.swap(outs_map);
-
-  instr_node->infershape_ctx_.reset(new InterpretercoreInferShapeContext(
-      *op_base, *instr_node->runtime_ctx_.get()));
-
-  auto* dev_ctx = instr_node->dev_ctx_;
-  Scope scope;
-
-  instr_node->execution_ctx_.reset(new ExecutionContext(
-      *op_base, scope, *dev_ctx, *instr_node->runtime_ctx_.get()));
+  // set runtime_ctx and infershape_ctx_
+  instr_node->ResetContext(ins_map, outs_map);
 }

 void InterpreterCore::BuildSkipShareLoDInfo() {
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     bool can_skip_lod = true;
-    for (auto& input : vec_instruction_[i].runtime_ctx_.get()->inputs) {
+    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
       for (auto& var : input.second) {
         if (var->IsType<LoDTensor>()) {
           if (var->Get<LoDTensor>().lod().size() != 0) {

@@ -328,23 +303,21 @@ void InterpreterCore::BuildSkipShareLoDInfo() {
         }
       }
     }
-    vec_instruction_[i].infershape_ctx_.get()->SetSkipLoD(can_skip_lod);
+    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
   }
 }
 void InterpreterCore::RunInstruction(const Instruction& instr_node) {
-  VLOG(3) << "RunInstruction: "
-          << instr_node.kernel_func_.operator_base_->Type();
+  VLOG(3) << "RunInstruction: " << instr_node.OpBase()->Type();

   {
     platform::RecordEvent infershape_event("InferShape");
-    static_cast<const framework::OperatorWithKernel*>(
-        instr_node.kernel_func_.operator_base_)
-        ->InferShape(instr_node.infershape_ctx_.get());
+    static_cast<const framework::OperatorWithKernel*>(instr_node.OpBase())
+        ->InferShape(instr_node.InnerInferShapeContext().get());
   }

   if (FLAGS_new_executor_use_inplace) {
-    for (auto& pair : instr_node.vec_inplace_in_to_out_) {
+    for (auto& pair : instr_node.InplaceInfo()) {
       const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
       auto* out =
           paddle::framework::details::GetMutableTensorFromVar(pair.second);

@@ -355,7 +328,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   }
   {
     platform::RecordEvent compute_event("Compute");
-    instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
+    instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
   }
 }

@@ -369,7 +342,7 @@ void InterpreterCore::ExecuteInstructionList(
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_,
+      async_work_queue_.AddTask(vec_instr.at(i).KernelType(),
                                 [&, i] { RunInstructionAsync(i); });
     }
   }
@@ -391,43 +364,43 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunNextInstructions(
     const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
-  auto& next_instr = instr.next_instruction_;
+  auto& next_instr = instr.NextInstructions();
   auto& atomic_deps = async_work_queue_.AtomicDeps();
   auto IsReady = [&](size_t next_id) {
     return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
   };

-  if (instr.type_ == OpFuncType::kQueueAsync) {
+  if (instr.KernelType() == OpFuncType::kQueueAsync) {
     // move all sync_ops into other threads
-    for (auto next_id : next_instr.synchronize_run_) {
+    for (auto next_id : next_instr.SyncRunIds()) {
       if (IsReady(next_id)) {
         async_work_queue_.AddTask(
-            vec_instruction_[next_id].type_,
+            vec_instruction_[next_id].KernelType(),
             [&, next_id] { RunInstructionAsync(next_id); });
       }
     }
     // keep all async_ops running in current thread
-    for (auto next_id : next_instr.direct_run_) {
+    for (auto next_id : next_instr.DirectRunIds()) {
       if (IsReady(next_id)) {
         reserved_next_ops->push(next_id);
       }
     }
-    for (auto next_id : next_instr.event_wait_run_) {
+    for (auto next_id : next_instr.EventRunIds()) {
       if (IsReady(next_id)) {
         reserved_next_ops->push(next_id);
       }
     }
   } else {
     // move async_ops into async_thread
-    for (auto next_id : next_instr.event_wait_run_) {
+    for (auto next_id : next_instr.EventRunIds()) {
       if (IsReady(next_id)) {
         async_work_queue_.AddTask(
-            vec_instruction_[next_id].type_,
+            vec_instruction_[next_id].KernelType(),
             [&, next_id] { RunInstructionAsync(next_id); });
       }
     }
     auto direct_run_ops = interpretercore::merge_vector(
-        next_instr.synchronize_run_, next_instr.direct_run_);
+        next_instr.SyncRunIds(), next_instr.DirectRunIds());
     size_t first_op = 0;
     for (auto next_id : direct_run_ops) {
       if (IsReady(next_id)) {

@@ -438,7 +411,7 @@ void InterpreterCore::RunNextInstructions(
       }
       // move rest ops into other threads
       async_work_queue_.AddTask(
-          vec_instruction_[next_id].type_,
+          vec_instruction_[next_id].KernelType(),
           [&, next_id] { RunInstructionAsync(next_id); });
     }
   }
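The IsReady test above relies on an atomic dependency counter per instruction: the last predecessor to finish observes fetch_sub returning 1 and wins the right to schedule the successor. A minimal standalone sketch of that idea (hypothetical setup — not the Paddle work queue):

#include <atomic>
#include <cstdio>
#include <memory>
#include <vector>

int main() {
  // One counter per instruction: how many predecessors are still unfinished.
  std::vector<std::unique_ptr<std::atomic<int>>> deps;
  deps.emplace_back(new std::atomic<int>(2));  // instr 0 waits on 2 ops

  auto IsReady = [&](size_t id) {
    // fetch_sub returns the value *before* the decrement, so exactly one
    // caller (the last finishing predecessor) sees 1 and enqueues the op.
    return deps[id]->fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  std::printf("%d\n", IsReady(0));  // 0: one predecessor still pending
  std::printf("%d\n", IsReady(0));  // 1: last predecessor done, schedule now
  return 0;
}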
@@ -452,8 +425,8 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
   while (!ready_ops.empty()) {
     instr_id = ready_ops.front();
     ready_ops.pop();
-    auto& instr_node = vec_instruction_[instr_id];
-    auto* op = instr_node.kernel_func_.operator_base_;
+    auto& instr_node = vec_instruction_.at(instr_id);
+    auto* op = instr_node.OpBase();
     platform::RecordEvent instruction_event(op->Type());
     event_manager_.WaitEvent(instr_node, place_);

@@ -486,28 +459,27 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
     op_run_number_.fetch_add(1, std::memory_order_relaxed);

     // GC infomation
-    CheckGC(instr_id, instr_node.gc_check_var_list);
+    CheckGC(instr_node);

     RunNextInstructions(instr_node, &ready_ops);
   }
 }

-void InterpreterCore::CheckGC(size_t instr_id,
-                              const std::vector<size_t>& gc_check_list) {
+void InterpreterCore::CheckGC(const Instruction& instr) {
+  size_t instr_id = instr.Id();
   auto& var_scope = *global_scope_;
   auto& atomic_var_ref = async_work_queue_.AtomicVarRef();

-  for (auto var_id : gc_check_list) {
+  for (auto var_id : instr.GCCheckVars()) {
     bool is_ready =
         atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
-    if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ &&
-        !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) {
-      gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id],
-              vec_instruction_[instr_id].dev_ctx_);
-    } else if (is_ready &&
-               var_scope.vec_meta_info_[var_id].vardesc_ == nullptr) {
-      gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id],
-              vec_instruction_[instr_id].dev_ctx_);
+    if (is_ready && var_scope.VarDesc(var_id) &&
+        !var_scope.VarDesc(var_id)->Persistable()) {
+      gc_.Add(var_scope.Var(var_id), gc_event_.at(instr_id),
+              &instr.DeviceContext());
+    } else if (is_ready && var_scope.VarDesc(var_id) == nullptr) {
+      gc_.Add(var_scope.Var(var_id), gc_event_.at(instr_id),
+              &instr.DeviceContext());
     }
   }
 }
@@ -516,11 +488,11 @@ void InterpreterCore::DryRunPrepare(
     const std::vector<framework::Tensor>& feed_tensors) {
   auto FeedInput = [&] {
     for (size_t i = 0; i < feed_names_.size(); ++i) {
-      auto it = global_scope_->name2id.find(feed_names_[i]);
-      assert(it != global_scope_->name2id.end());
+      auto* feed_var = global_scope_->FindVar(feed_names_[i]);
+      PADDLE_ENFORCE_NOT_NULL(feed_var, platform::errors::NotFound(
+                                            "feed_var shall not be nullptr."));

-      auto feed_tensor = global_scope_->var_list[it->second]
-                             ->GetMutable<framework::LoDTensor>();
+      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
     }
   };

@@ -530,7 +502,7 @@ void InterpreterCore::DryRunPrepare(
                                                      global_scope_);
     FeedInput();
     paddle::framework::interpretercore::build_op_func_list(
-        place_, main_program_, &op_list_, &vec_func_list_, global_scope_);
+        place_, main_program_, &vec_func_list_, global_scope_);
     is_build_ = true;
     // convert vec func_list to graph
     Convert();
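The "polish code exception" half of the commit is visible here: bare assert() lookups become enforced checks that fail with a descriptive error even in release builds. A minimal sketch of the same assert-to-checked-lookup move in plain C++ (a throw stands in for PADDLE_ENFORCE_NOT_NULL; names are hypothetical):

#include <map>
#include <stdexcept>
#include <string>

struct Variable {};

// Before: assert() compiles away in release builds, so a missing feed name
// would fall through to undefined behavior.
// After: a checked lookup always fails loudly with a useful message.
Variable* FindVarChecked(const std::map<std::string, Variable*>& scope,
                         const std::string& name) {
  auto it = scope.find(name);
  if (it == scope.end()) {
    // Mirrors PADDLE_ENFORCE_NOT_NULL(..., errors::NotFound(...)).
    throw std::runtime_error(name + " not in VariableScope.");
  }
  return it->second;
}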
paddle/fluid/framework/new_executor/interpretercore.h

@@ -67,7 +67,7 @@ class InterpreterCore {
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);

-  void CheckGC(size_t instr_id, const std::vector<size_t>& gc_check_list);
+  void CheckGC(const Instruction& instr);

   void RunInstructionAsync(size_t instr_id);
   void RunNextInstructions(const Instruction& instr_id,

@@ -82,16 +82,15 @@ class InterpreterCore {
   ProgramDesc main_program_;
   VariableScope* global_scope_;

-  std::vector<Instruction> vec_instruction_;
-
-  InstructionInfo instruction_info_;
+  std::vector<paddle::framework::OpFuncNode> vec_func_list_;
+  std::vector<Instruction> vec_instruction_;  // deconstruct before OpFuncNode

   std::vector<size_t> dependecy_count_;
   std::vector<std::vector<size_t>> input_var2op_info_;
-  std::vector<VariableMetaInfo> ref_coun_info_;
   std::vector<VariableMetaInfo> vec_meta_info_;

-  std::vector<paddle::framework::OpFuncNode> vec_func_list_;
-  std::vector<paddle::framework::OperatorBase*> op_list_;
-
   std::vector<std::string> feed_names_;

   InterpreterProfiler dry_run_profiler_;
paddle/fluid/framework/new_executor/interpretercore_util.cc

@@ -19,6 +19,7 @@
 namespace paddle {
 namespace framework {
 namespace interpretercore {
+using VariableIdMap = std::map<std::string, std::vector<int>>;

 AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
     const std::vector<size_t>& dependecy_count) {

@@ -132,43 +133,29 @@ void build_variable_scope(const framework::ProgramDesc& pdesc,
                           VariableScope* var_scope) {
   auto& global_block = pdesc.Block(0);

-  for (auto& var : global_block.AllVars()) {
-    if (var->Name() == framework::kEmptyVarName) {
+  for (auto& var_desc : global_block.AllVars()) {
+    auto var_name = var_desc->Name();
+    if (var_name == framework::kEmptyVarName) {
       continue;
     }

-    if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
-      var_scope->name2id[var->Name()] = var_scope->var_list.size();
-      auto v = new Variable();
-      InitializeVariable(v, var->GetType());
-      var_scope->var_list.push_back(v);
-
-      VariableMetaInfo info;
-      info.var_ref_count_ = 0;
-      info.vardesc_ = var;
-      var_scope->vec_meta_info_.push_back(info);
+    if (nullptr == var_scope->FindVar(var_name)) {
+      var_scope->AddVar(var_desc->Name(), var_desc);
     } else {
-      auto var_id = var_scope->name2id[var->Name()];
-      if (nullptr == var_scope->vec_meta_info_[var_id].vardesc_) {
-        VLOG(3) << "update var:" << var->Name() << " desc from nullptr into "
-                << var;
-        var_scope->vec_meta_info_[var_id].vardesc_ = var;
+      auto* var_desc = var_scope->VarDesc(var_name);
+      if (nullptr == var_desc) {
+        VLOG(3) << "update var:" << var_name << " desc from nullptr into "
                << var_desc;
+        var_scope->VarMetaInfo(var_name).vardesc_ = var_desc;
       }
     }
   }
 }
-void build_op_func_list(const platform::Place& place,
-                        const framework::ProgramDesc& pdesc,
-                        std::vector<OperatorBase*>* op_list,
-                        std::vector<OpFuncNode>* vec_func_list,
-                        VariableScope* var_scope) {
-  auto& global_block = pdesc.Block(0);
-  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
+std::vector<OperatorBase*> create_all_ops(const framework::BlockDesc& block) {
+  std::vector<OperatorBase*> ops;

-  for (auto& op : global_block.AllOps()) {
-    VLOG(3) << "Build OpFuncNode from : " << op->Type();
+  for (auto& op : block.AllOps()) {
+    VLOG(3) << "CreateOp from : " << op->Type();

     auto& info = OpInfoMap::Instance().Get(op->Type());

@@ -179,64 +166,96 @@ void build_op_func_list(const platform::Place& place,
     if (info.Checker() != nullptr) {
       info.Checker()->Check(&op_attr_map);
     }
-    // step 1. Prepare VariableValueMap of input/output
     auto op_base =
         info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map);
+    ops.push_back(op_base);
+  }
+  return ops;
+}
+
+std::tuple<VariableValueMap, VariableIdMap> build_variable_map(
+    const VariableNameMap& var_name_map, VariableScope* var_scope) {
+  VariableValueMap name2var;
+  VariableIdMap name2id;
+  for (auto& item : var_name_map) {
+    std::vector<Variable*> vars;
+    std::vector<int> ids;
+    vars.reserve(item.second.size());
+
+    for (auto& var_name : item.second) {
+      auto var_id = var_scope->VarId(var_name);
+      auto* in_var = var_scope->Var(var_id);
+      vars.push_back(in_var);
+      ids.push_back(var_id);
+    }
+    name2var[item.first] = std::move(vars);
+    name2id[item.first] = std::move(ids);
+  }
+  return std::make_tuple(name2var, name2id);
+}
+
+void apply_device_guard(const OperatorBase* op_base,
+                        const platform::Place& place,
+                        OpKernelType* expected_kernel_key) {
+  bool need_change_place =
+      (op_base->HasAttr("op_device") &&
+       (op_base->Attr<std::string>("op_device").length() > 0));
+  if (need_change_place) {
+    auto& op_device = op_base->Attr<std::string>("op_device");
+    if (op_device == "cpu" || platform::is_cpu_place(place)) {
+      VLOG(3) << "Switch into CPUPlace by device_guard.";
+      expected_kernel_key->place_ = platform::CPUPlace();
+    } else if (op_device.find("gpu") != std::string::npos &&
+               platform::is_gpu_place(place)) {
+      VLOG(3) << "Switch into " << place << " by device_guard.";
+      expected_kernel_key->place_ = place;
+    } else {
+      PADDLE_THROW(
+          platform::errors::Fatal("Unsupported current place %s", op_device));
+    }
+  }
+}
+
+void build_op_func_list(const platform::Place& place,
+                        const framework::ProgramDesc& pdesc,
+                        std::vector<OpFuncNode>* vec_func_list,
+                        VariableScope* var_scope) {
+  auto& global_block = pdesc.Block(0);
+  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
+
+  // Step 1: create all ops for global block.
+  auto ops = create_all_ops(global_block);
+  auto unused_var_map = get_unused_vars(global_block, ops);
+
+  size_t ops_index = 0;
+  for (auto& op : global_block.AllOps()) {
-    VLOG(3) << op->Type();
-    // << op->Type() << endl;
+    VLOG(3) << "Build OpFuncNode from : " << op->Type();
+
+    auto op_base = ops[ops_index++];
+    auto inputs_names = op->Inputs();
+    auto outputs_names = op->Outputs();

     VariableValueMap ins_map;
-    std::map<std::string, std::vector<int>> ins_name2id;
-    for (auto& var_name_item : inputs_names) {
-      std::vector<Variable*> input_vars;
-      std::vector<int> vec_ids;
-      input_vars.reserve(var_name_item.second.size());
-      for (auto& var_name : var_name_item.second) {
-        auto it = var_scope->name2id.find(var_name);
-        assert(it != var_scope->name2id.end());
-        input_vars.push_back(var_scope->var_list[it->second]);
-        vec_ids.push_back(it->second);
-      }
-      ins_map[var_name_item.first] = input_vars;
-      ins_name2id[var_name_item.first] = vec_ids;
-    }
+    VariableIdMap ins_name2id;
+    std::tie(ins_map, ins_name2id) =
+        build_variable_map(inputs_names, var_scope);

     VariableValueMap outs_map;
-    std::map<std::string, std::vector<int>> outs_name2id;
-    for (auto& var_name_item : outputs_names) {
-      std::vector<Variable*> output_vars;
-      std::vector<int> vec_ids;
-      output_vars.reserve(var_name_item.second.size());
-      for (auto& var_name : var_name_item.second) {
-        auto it = var_scope->name2id.find(var_name);
-        assert(it != var_scope->name2id.end());
-        output_vars.push_back(var_scope->var_list[it->second]);
-        vec_ids.push_back(it->second);
-      }
-      outs_map[var_name_item.first] = output_vars;
-      outs_name2id[var_name_item.first] = vec_ids;
-    }
+    VariableIdMap outs_name2id;
+    std::tie(outs_map, outs_name2id) =
+        build_variable_map(outputs_names, var_scope);

+    // step 2: build OpFuncNode
     OpFuncNode op_func_node;
     op_func_node.input_index = ins_name2id;
     op_func_node.output_index = outs_name2id;
-    // step 2: construct RuntimeContext and analysis KernelType
+    // construct RuntimeContext and analysis KernelType
     RuntimeContext runtime_context({}, {});
     runtime_context.inputs.swap(ins_map);
     runtime_context.outputs.swap(outs_map);
     InterpretercoreInferShapeContext infer_shape_ctx(*op_base, runtime_context);
+    // TODO(Aurelius84): In case of control flow ops, they are NOT inheritted
+    // from OperatorWithKernel.
     static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
         &infer_shape_ctx);
     auto kernels_iter = all_op_kernels.find(op->Type());
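build_variable_map now returns both maps at once as a std::tuple, unpacked at the call site with std::tie — the "make tuple" item from the commit message. A minimal standalone sketch of that return-two-values pattern (hypothetical types):

#include <map>
#include <string>
#include <tuple>
#include <vector>

using NameToIds = std::map<std::string, std::vector<int>>;

// Returning a tuple avoids output parameters and keeps the two maps,
// which are always built together, produced by a single call.
std::tuple<NameToIds, NameToIds> build_maps() {
  NameToIds name2var{{"X", {0}}};
  NameToIds name2id{{"X", {0}}};
  return std::make_tuple(name2var, name2id);
}

int main() {
  NameToIds ins_map, ins_name2id;
  std::tie(ins_map, ins_name2id) = build_maps();  // unpack both results
  return static_cast<int>(ins_map.size());
}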
@@ -256,32 +275,18 @@ void build_op_func_list(const platform::Place& place,
             ->GetExpectedKernelType(
                 ExecutionContext(*op_base, scope, *dev_ctx, runtime_context));

-    // consider device_guard context
-    bool need_change_place =
-        (op_base->HasAttr("op_device") &&
-         (op_base->Attr<std::string>("op_device").length() > 0));
-    if (need_change_place) {
-      auto& op_device = op_base->Attr<std::string>("op_device");
-      if (op_device == "cpu" || platform::is_cpu_place(place)) {
-        VLOG(3) << "Switch into CPUPlace by device_guard.";
-        expected_kernel_key.place_ = platform::CPUPlace();
-      } else if (op_device.find("gpu") != std::string::npos &&
-                 platform::is_gpu_place(place)) {
-        VLOG(3) << "Switch into " << place << " by device_guard.";
-        expected_kernel_key.place_ = place;
-      } else {
-        PADDLE_THROW(platform::errors::Fatal("Unsupported current place %s",
-                                             op_device));
-      }
-    }
+    // consider device_guard()
+    apply_device_guard(op_base, place, &expected_kernel_key);
     VLOG(3) << "expected_kernel_key : " << expected_kernel_key;

     // step 3. Insert memcpy_op if needed
     VariableValueMap& ins_map_temp = runtime_context.inputs;
     std::unordered_set<int> no_data_transform_index;
     for (auto& var_name_item : ins_map_temp) {
       for (size_t i = 0; i < var_name_item.second.size(); ++i) {
         auto var = var_name_item.second[i];
+        auto& var_name = inputs_names[var_name_item.first].at(i);
         auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
         if (!tensor_in->IsInitialized()) {
           continue;
@@ -293,32 +298,19 @@ void build_op_func_list(const platform::Place& place,
       if (platform::is_same_place(kernel_type_for_var.place_,
                                   expected_kernel_key.place_)) {
         // record no need data transformer input var_id
-        auto& var_name = inputs_names[var_name_item.first][i];
         VLOG(3) << op->Type() << " found no data_transform var: " << var_name
-                << " with id: " << var_scope->name2id[var_name];
-        no_data_transform_index.emplace(var_scope->name2id[var_name]);
+                << " with id: " << var_name;
+        no_data_transform_index.emplace(var_scope->VarId(var_name));
       } else {
         if (op_base->Type() == "fetch_v2") {
           op_base->SetAttr("deepcopy", false);
         }
         // need trans place
         // 1. add var in scope
         // 2. add copy op
-        std::string new_var_name =
-            "temp_1" + std::to_string(var_scope->var_list.size() + 1);
-        auto v = new Variable();
-        v->GetMutable<LoDTensor>();
-        var_scope->name2id[new_var_name] = var_scope->var_list.size();
-        var_scope->var_list.push_back(v);
-
-        VariableMetaInfo info;
-        info.var_ref_count_ = 0;
-        info.vardesc_ = nullptr;
-        var_scope->vec_meta_info_.push_back(info);
+        std::string new_var_name =
+            var_name + "_copy_" + std::to_string(var_scope->VarSize() + 1);
+        var_scope->AddVar(new_var_name, nullptr);

         VariableNameMap copy_in_map;
-        auto x_iter = inputs_names.find(var_name_item.first);
-        copy_in_map["X"] = {x_iter->second[i]};
+        copy_in_map["X"] = {var_name};
         VariableNameMap copy_out_map;
         copy_out_map["Out"] = {new_var_name};
         AttributeMap attr_map;

@@ -328,23 +320,23 @@ void build_op_func_list(const platform::Place& place,
                 : is_gpu_place(expected_kernel_key.place_) ? 1 : -1;

         std::map<std::string, std::vector<int>> copy_ins_name2id;
-        copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
+        copy_ins_name2id["X"] = ins_name2id.at(var_name_item.first);
         std::map<std::string, std::vector<int>> copy_out_name2id;
-        copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]};
+        copy_out_name2id["Out"] = {var_scope->VarId(new_var_name)};

         op_func_node.input_index[var_name_item.first][i] =
-            var_scope->name2id[new_var_name];
+            var_scope->VarId(new_var_name);

         VariableValueMap copy_ins_value_map;
         copy_ins_value_map["X"] = {var};
         VariableValueMap copy_outs_value_map;
-        copy_outs_value_map["Out"] = {v};
+        copy_outs_value_map["Out"] = {var_scope->Var(new_var_name)};

         // memcpy_d2h, memcpy_h2d
         auto memcpy_op_type = get_memcpy_type(kernel_type_for_var.place_,
                                               expected_kernel_key.place_);
         VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).",
-                                   memcpy_op_type, x_iter->second[i],
+                                   memcpy_op_type, var_name,
                                    kernel_type_for_var.place_, new_var_name,
                                    expected_kernel_key.place_);
         auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);

@@ -385,16 +377,16 @@ void build_op_func_list(const platform::Place& place,
         // as kQueueSync and execute them in thread pool.
         copy_op_func_node.type_ = OpFuncType::kQueueSync;
         copy_op_func_node.dev_ctx_ = dev_ctx;
-        op_list->push_back(copy_op);
+        copy_op_func_node.operator_base_ = copy_op;
         vec_func_list->push_back(copy_op_func_node);

-        var_name_item.second[i] = v;
+        var_name_item.second[i] = var_scope->Var(new_var_name);
       }
     }
   }
   op_func_node.no_data_transform_index = std::move(no_data_transform_index);
     // step 4. Run op kernel
-    op_list->push_back(op_base);
+    op_func_node.operator_base_ = op_base;
     VLOG(3) << op_base->Type()
             << " : expected_kernel_key : " << expected_kernel_key;

@@ -436,9 +428,7 @@ void build_op_func_list(const platform::Place& place,
         new std::deque<std::shared_ptr<memory::Allocation>>();

     for (auto& var_name : delete_vars) {
-      auto it = var_scope->name2id.find(var_name);
-      assert(it != var_scope->name2id.end());
-      auto* var = var_scope->var_list[it->second];
+      auto* var = var_scope->FindVar(var_name);
       if (var == nullptr) {
         continue;
       }
paddle/fluid/framework/new_executor/interpretercore_util.h

@@ -101,7 +101,6 @@ void build_variable_scope(const framework::ProgramDesc& pdesc,
 void build_op_func_list(const platform::Place& place,
                         const framework::ProgramDesc& pdesc,
-                        std::vector<OperatorBase*>* op_list,
                         std::vector<OpFuncNode>* vec_func_list,
                         VariableScope* var_scope);
paddle/fluid/framework/new_executor/new_executor_defs.h

@@ -19,6 +19,7 @@
 #include <vector>

 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/device_event_base.h"
 #include "paddle/fluid/platform/event.h"

@@ -463,7 +464,6 @@ class InterpretercoreInferShapeContext : public InferShapeContext {
 struct OpKernelFunc {
   OpKernelComputeFunc compute_func_;
-  OperatorBase* operator_base_;
 };

 struct VariableMetaInfo {

@@ -471,13 +471,108 @@ struct VariableMetaInfo {
   paddle::framework::VarDesc* vardesc_;
 };

-struct VariableScope {
+// TODO(Aurelius84): Consider inherit ScopeBase to unify interface.
+class VariableScope {
+ public:
+  Variable* FindVar(const std::string& name) const {
+    if (!HasVar(name)) {
+      return nullptr;
+    }
+    auto var_id = VarId(name);
+    CheckExist(var_id);
+    return var_list[var_id];
+  }
+
+  bool HasVar(const std::string& name) const {
+    return name2id.find(name) != name2id.end();
+  }
+
+  int VarId(const std::string& name) const {
+    CheckExist(name);
+    return name2id.at(name);
+  }
+
+  Variable* Var(int id) const { return var_list.at(id); }
+
+  Variable* Var(const std::string& name) const {
+    return var_list.at(VarId(name));
+  }
+
+  size_t VarSize() const { return var_list.size(); }
+
+  void AddVar(const std::string& name, VarDesc* var_desc) {  // NOLINT
+    name2id[name] = VarSize();
+    auto v = new Variable();
+    if (nullptr == var_desc) {
+      v->GetMutable<LoDTensor>();
+    } else {
+      InitializeVariable(v, var_desc->GetType());
+    }
+    var_list.push_back(v);
+
+    VariableMetaInfo info;
+    info.var_ref_count_ = 0;
+    info.vardesc_ = var_desc;
+    vec_meta_info_.push_back(info);
+  }
+
+  void AddVar(const std::string& name, Variable& var) {  // NOLINT
+    name2id[name] = VarSize();
+    var_list.push_back(&var);
+
+    VariableMetaInfo info;
+    info.var_ref_count_ = 0;
+    info.vardesc_ = nullptr;
+    vec_meta_info_.push_back(info);
+  }
+
+  paddle::framework::VarDesc* VarDesc(const std::string& name) const {
+    return VarDesc(VarId(name));
+  }
+
+  paddle::framework::VarDesc* VarDesc(int id) const {
+    CheckExist(id);
+    return vec_meta_info_[id].vardesc_;
+  }
+
+  VariableMetaInfo& VarMetaInfo(const std::string& name) {
+    return vec_meta_info_[VarId(name)];
+  }
+
+  void CheckExist(int id) const {
+    PADDLE_ENFORCE_LT(id, var_list.size(),
+                      platform::errors::PreconditionNotMet(
+                          "Required var_id < %d, but received var_id = %d.",
+                          var_list.size(), id));
+  }
+
+  void CheckExist(const std::string& name) const {
+    PADDLE_ENFORCE_EQ(
+        HasVar(name), true,
+        platform::errors::NotFound("%s not in VariableScope.", name));
+  }
+
+ private:
   std::vector<Variable*> var_list;
   std::map<std::string, int> name2id;
   std::vector<VariableMetaInfo> vec_meta_info_;
 };
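A short usage sketch of the encapsulated scope after this change, written against a trimmed-down stand-in (illustrative only; Variable here is a placeholder for the Paddle type, and allocations are deliberately left untracked for brevity):

#include <map>
#include <stdexcept>
#include <string>
#include <vector>

struct Variable {};  // stand-in for paddle::framework::Variable

// A trimmed-down stand-in for the new VariableScope API, showing how
// callers use checked accessors instead of poking at name2id/var_list.
class MiniScope {
 public:
  void AddVar(const std::string& name) {
    name2id_[name] = static_cast<int>(vars_.size());
    vars_.push_back(new Variable());
  }
  bool HasVar(const std::string& name) const {
    return name2id_.count(name) > 0;
  }
  int VarId(const std::string& name) const {
    if (!HasVar(name)) throw std::runtime_error(name + " not in scope.");
    return name2id_.at(name);
  }
  Variable* Var(const std::string& name) const { return vars_.at(VarId(name)); }
  Variable* FindVar(const std::string& name) const {
    return HasVar(name) ? vars_.at(name2id_.at(name)) : nullptr;  // no assert
  }

 private:
  std::vector<Variable*> vars_;
  std::map<std::string, int> name2id_;
};

int main() {
  MiniScope scope;
  scope.AddVar("x");
  Variable* v = scope.Var("x");            // checked lookup, throws if absent
  Variable* missing = scope.FindVar("y");  // nullptr instead of UB
  return (v != nullptr && missing == nullptr) ? 0 : 1;
}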
-struct NextInstruction {
+class NextInstruction {
+ public:
+  void AddDirectRun(size_t id) { direct_run_.push_back(id); }
+
+  void ADDEventRun(size_t id) { event_wait_run_.push_back(id); }
+
+  void AddSyncRun(size_t id) { synchronize_run_.push_back(id); }
+
+  const std::vector<size_t>& DirectRunIds() const { return direct_run_; }
+
+  const std::vector<size_t>& EventRunIds() const { return event_wait_run_; }
+
+  const std::vector<size_t>& SyncRunIds() const { return synchronize_run_; }
+
+ private:
   std::vector<size_t> direct_run_;
   std::vector<size_t> event_wait_run_;
   std::vector<size_t> synchronize_run_;
@@ -503,49 +598,138 @@ enum class OpFuncType {
 };
 class RuntimeInferShapeContext;

-struct Instruction {
-  OpKernelFunc kernel_func_;
+struct OpFuncNode {
+  OperatorBase* operator_base_;
+  std::map<std::string, std::vector<int>> input_index;
+  std::map<std::string, std::vector<int>> output_index;
+  std::unordered_set<int> no_data_transform_index;
+
+  OpKernelComputeFunc kernel_func_;
+  platform::DeviceContext* dev_ctx_;  // not owned
+  OpFuncType type_;
+};
+
+class Instruction {
+ public:
+  Instruction(size_t id, const OpFuncNode& op_func_node,
+              const platform::DeviceContext& dev_ctx)
+      : id_(id), op_func_node_(op_func_node), dev_ctx_(dev_ctx) {
+    PADDLE_ENFORCE_GE(id, 0,
+                      platform::errors::PreconditionNotMet(
+                          "Required id >= 0, but received id = %d", id));
+  }
+
+  size_t Id() const { return id_; }
+
+  const std::map<std::string, std::vector<int>>& Inputs() const {
+    return op_func_node_.input_index;
+  }
+
+  const std::map<std::string, std::vector<int>>& Outputs() const {
+    return op_func_node_.output_index;
+  }
+
+  const std::unordered_set<int>& NoDataTransformVars() const {
+    return op_func_node_.no_data_transform_index;
+  }
+
+  OpKernelComputeFunc KernelFunc() const { return op_func_node_.kernel_func_; }
+
+  OpFuncType KernelType() const { return op_func_node_.type_; }
+
+  OperatorBase* OpBase() const {
+    auto* op_base = op_func_node_.operator_base_;
+    PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
+                                         "op_base shall not be nullptr."));
+    return op_base;
+  }
+
+  NextInstruction& NextInstructions() { return next_instruction_; }
+
+  const NextInstruction& NextInstructions() const { return next_instruction_; }
+
+  void AddGCCheckVar(size_t id) { gc_check_var_list_.push_back(id); }
+
+  const std::vector<size_t>& GCCheckVars() const { return gc_check_var_list_; }
+
+  void ResetContext(const VariableValueMap& in_vars,
+                    const VariableValueMap& out_vars) {
+    runtime_ctx_.reset(new RuntimeContext(in_vars, out_vars));
+    infershape_ctx_.reset(
+        new InterpretercoreInferShapeContext(*OpBase(), *runtime_ctx_.get()));
+    // NOTE: Because execution_ctx_ is constructed by `scope&`, so we fake an
+    // empty here to avoid illegal local reference.
+    static framework::Scope scope_;
+    execution_ctx_.reset(
+        new ExecutionContext(*OpBase(), scope_, dev_ctx_, *runtime_ctx_.get()));
+  }
+
+  std::shared_ptr<RuntimeContext> InnerRuntimeContext() const {
+    return runtime_ctx_;
+  }
+
+  std::shared_ptr<InterpretercoreInferShapeContext> InnerInferShapeContext()
+      const {
+    return infershape_ctx_;
+  }
+
+  std::shared_ptr<ExecutionContext> InnerExecutionContext() const {
+    return execution_ctx_;
+  }
+
+  const platform::DeviceContext& DeviceContext() const { return dev_ctx_; }
+
+  const std::vector<std::pair<Variable*, Variable*>>& InplaceInfo() const {
+    return vec_inplace_in_to_out_;
+  }
+
+  void AddInplace(Variable* in, Variable* out) {
+    vec_inplace_in_to_out_.emplace_back(in, out);
+  }
+
+  const std::vector<EventInter>& InputEvents() const { return intput_events_; }
+
+  const std::vector<EventInter>& OutputEvents() const { return output_events_; }
+
+  void AddInputEvent(size_t var_id,
+                     std::shared_ptr<platform::DeviceEvent> event,
+                     platform::DeviceType waiter_type) {
+    intput_events_.emplace_back(var_id, event, waiter_type);
+  }
+
+  void AddOutputEvent(size_t var_id,
+                      std::shared_ptr<platform::DeviceEvent> event,
+                      platform::DeviceType waiter_type) {
+    output_events_.emplace_back(var_id, event, waiter_type);
+  }
+
+ private:
+  size_t id_;
+  const OpFuncNode& op_func_node_;          // not owned
+  const platform::DeviceContext& dev_ctx_;  // not owned

   std::shared_ptr<RuntimeContext> runtime_ctx_;
   std::shared_ptr<InterpretercoreInferShapeContext> infershape_ctx_;
   std::shared_ptr<ExecutionContext> execution_ctx_;
-  std::map<std::string, std::vector<int>> input_index_;
-  std::map<std::string, std::vector<int>> output_index_;
-  std::unordered_set<int> no_data_transform_index_;
-  std::vector<size_t> gc_check_var_list;
+  std::vector<size_t> gc_check_var_list_;
   NextInstruction next_instruction_;

   std::vector<EventInter> intput_events_;
   std::vector<EventInter> output_events_;
-  platform::DeviceContext* dev_ctx_;  // not owned
-  OpFuncType type_;
   std::vector<std::pair<Variable*, Variable*>> vec_inplace_in_to_out_;
 };

-struct OpFuncNode {
-  // int unsed;
-  std::map<std::string, std::vector<int>> input_index;
-  std::map<std::string, std::vector<int>> output_index;
-  std::unordered_set<int> no_data_transform_index;
-
-  OpKernelComputeFunc kernel_func_;
-  platform::DeviceContext* dev_ctx_;  // not owned
-  OpFuncType type_;
-};
-
 namespace interpretercore {
 static constexpr char kMemcpyH2D[] = "memcpy_h2d";
 static constexpr char kMemcpyD2H[] = "memcpy_d2h";

 static bool IsMemcpyH2D(const Instruction& instr) {
-  return instr.kernel_func_.operator_base_->Type() == kMemcpyH2D;
+  return instr.OpBase()->Type() == kMemcpyH2D;
 }

 static bool IsMemcpyD2H(const Instruction& instr) {
-  return instr.kernel_func_.operator_base_->Type() == kMemcpyD2H;
+  return instr.OpBase()->Type() == kMemcpyD2H;
 }
 }  // namespace interpretercore
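ResetContext above rebuilds all three per-instruction contexts from one place, and deliberately passes a function-local static Scope because ExecutionContext stores a Scope reference. A minimal sketch of that keep-the-referent-alive pattern (hypothetical types — not the Paddle classes):

#include <memory>

struct Scope {};

// ExecutionContext-like object that stores a reference, so the referent
// must outlive it.
struct Ctx {
  explicit Ctx(const Scope& s) : scope(s) {}
  const Scope& scope;
};

std::shared_ptr<Ctx> MakeCtx() {
  // A local Scope would dangle once this function returns; a function-local
  // static lives for the whole program, which is what the NOTE relies on.
  static Scope scope;
  return std::make_shared<Ctx>(scope);
}

int main() {
  auto ctx = MakeCtx();  // safe: the referenced Scope is static
  return ctx ? 0 : 1;
}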
paddle/fluid/framework/new_executor/standalone_executor.cc

@@ -33,23 +33,16 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
   auto name_list = outer_scope_->LocalVarNames();
   for (auto name : name_list) {
     auto v = outer_scope_->Var(name);
-    if (global_scope_.name2id.find(name) == global_scope_.name2id.end()) {
-      global_scope_.name2id[name] = global_scope_.var_list.size();
-      global_scope_.var_list.push_back(v);
-
-      VariableMetaInfo info;
-      info.var_ref_count_ = 0;
-      info.vardesc_ = nullptr;
-      global_scope_.vec_meta_info_.push_back(info);
+    if (!global_scope_.HasVar(name)) {
+      global_scope_.AddVar(name, *v);
     }
   }

   // run startup program
   std::vector<paddle::framework::OpFuncNode> vec_func_list;
-  std::vector<paddle::framework::OperatorBase*> op_list;
   paddle::framework::interpretercore::build_op_func_list(
-      place_, startup_prog, &op_list, &vec_func_list, &global_scope_);
+      place_, startup_prog, &vec_func_list, &global_scope_);
 }

 paddle::framework::FetchList StandaloneExecutor::Run(

@@ -80,16 +73,8 @@ void StandaloneExecutor::BuildVariableOuterScope(
       continue;
     }

-    if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
-      var_scope->name2id[var->Name()] = var_scope->var_list.size();
-      auto v = outer_scope->Var(var->Name());
-      InitializeVariable(v, var->GetType());
-      var_scope->var_list.push_back(v);
-
-      VariableMetaInfo info;
-      info.var_ref_count_ = 0;
-      info.vardesc_ = var;
-      var_scope->vec_meta_info_.push_back(info);
+    if (!var_scope->HasVar(var->Name())) {
+      var_scope->AddVar(var->Name(), var);
     }
   }
paddle/fluid/framework/new_executor/stream_analyzer.cc

@@ -31,15 +31,15 @@ namespace framework {
 std::vector<size_t> StreamAnalyzer::ParseEventVarIds(
     const Instruction& cur_instr, const Instruction& next_instr) {
   std::unordered_set<size_t> unique_var_ids;
-  for (auto& item : cur_instr.output_index_) {
+  for (auto& item : cur_instr.Outputs()) {
     unique_var_ids.insert(item.second.begin(), item.second.end());
   }

   std::vector<size_t> new_event_var_ids;
-  for (auto& item : next_instr.input_index_) {
+  for (auto& item : next_instr.Inputs()) {
     for (auto var_id : item.second) {
       if (unique_var_ids.count(var_id) > 0 &&
-          next_instr.no_data_transform_index_.count(var_id) == 0) {
+          next_instr.NoDataTransformVars().count(var_id) == 0) {
         new_event_var_ids.push_back(var_id);
       }
     }

@@ -57,8 +57,7 @@ void StreamAnalyzer::AssociateInputWithEvents(
       var_id2event_.emplace(var_id, std::move(device_event));
     }
     // Add events for next_instr.inputs
-    next_instr->intput_events_.emplace_back(var_id, var_id2event_.at(var_id),
-                                            waiter_type);
+    next_instr->AddInputEvent(var_id, var_id2event_.at(var_id), waiter_type);
   }
 }

@@ -66,13 +65,13 @@ void StreamAnalyzer::Schedule(const std::vector<size_t>& downstream_ops,
     std::vector<Instruction>* instructions, size_t op_index) {
   auto& cur_instr = instructions->at(op_index);
-  auto& next_instruction = cur_instr.next_instruction_;
+  auto& next_instruction = cur_instr.NextInstructions();
   std::vector<size_t> event_var_ids;
   for (auto next_op_id : downstream_ops) {
     auto& next_instr = instructions->at(next_op_id);

     if (IsDirectRun(cur_instr, next_instr)) {
-      next_instruction.direct_run_.emplace_back(next_op_id);
+      next_instruction.AddDirectRun(next_op_id);
     } else {
       // Always insert events between different stream
       auto new_event_var_ids = ParseEventVarIds(cur_instr, next_instr);

@@ -83,24 +82,24 @@ void StreamAnalyzer::Schedule(const std::vector<size_t>& downstream_ops,
       AssociateInputWithEvents(new_event_var_ids, &next_instr, waiter_type);

       if (waiter_type == platform::kCPU) {  // GPU -> CPU
-        next_instruction.synchronize_run_.emplace_back(next_op_id);
+        next_instruction.AddSyncRun(next_op_id);
       } else {  // GPU -> GPU(different stream)
-        next_instruction.event_wait_run_.emplace_back(next_op_id);
+        next_instruction.ADDEventRun(next_op_id);
      }
     }
   }
   // Create events for these cross-stream vars
-  VLOG(3) << cur_instr.kernel_func_.operator_base_->Type()
+  VLOG(3) << cur_instr.OpBase()->Type()
           << " event_var_ids.size: " << event_var_ids.size();
   for (auto var_id : event_var_ids) {
-    cur_instr.output_events_.emplace_back(var_id, var_id2event_.at(var_id),
-                                          platform::kCUDA /*not used*/);
+    cur_instr.AddOutputEvent(var_id, var_id2event_.at(var_id),
+                             platform::kCUDA /*not used*/);
   }
 }

 platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
-    const OpFuncNode& op_func_node, const OperatorBase& op_base) {
-  auto& op_type = op_base.Type();
+    const OpFuncNode& op_func_node) {
+  auto& op_type = op_func_node.operator_base_->Type();
   auto* dev_ctx = op_func_node.dev_ctx_;
   if (op_type == interpretercore::kMemcpyH2D) {
     VLOG(3) << "Get dev_ctx from d2h_context_pool_";

@@ -122,13 +121,13 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  return (cur_instr.dev_ctx_ == next_instr.dev_ctx_ ||
+  return (&cur_instr.DeviceContext() == &next_instr.DeviceContext() ||
          interpretercore::IsMemcpyD2H(cur_instr) ||
          interpretercore::IsMemcpyH2D(next_instr));
 }

 platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
-  if (instr.type_ == OpFuncType::kQueueSync) {
+  if (instr.KernelType() == OpFuncType::kQueueSync) {
     return platform::kCPU;
   } else {
     return platform::kCUDA;
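GetWaiterType reduces the cross-stream decision to a single predicate on the consumer's queue type: synchronous kernels wait on the CPU, everything else waits via a CUDA event. A minimal sketch of that dispatch in isolation (hypothetical enum values mirroring the ones above):

#include <cstdio>

enum class OpFuncType { kQueueSync, kQueueAsync };
enum class DeviceType { kCPU, kCUDA };

// Mirrors StreamAnalyzer::GetWaiterType: the consumer's queue type decides
// how the producer's event should be waited on.
DeviceType GetWaiterType(OpFuncType kernel_type) {
  return kernel_type == OpFuncType::kQueueSync ? DeviceType::kCPU
                                               : DeviceType::kCUDA;
}

int main() {
  std::printf("%d\n", static_cast<int>(GetWaiterType(OpFuncType::kQueueSync)));
  std::printf("%d\n", static_cast<int>(GetWaiterType(OpFuncType::kQueueAsync)));
  return 0;
}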
paddle/fluid/framework/new_executor/stream_analyzer.h

@@ -32,8 +32,7 @@ class StreamAnalyzer {
   void Schedule(const std::vector<size_t>& downstream_ops,
                 std::vector<Instruction>* instructions, size_t op_index);

-  platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node,
-                                              const OperatorBase& op_base);
+  platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node);

 private:
  std::vector<size_t> ParseEventVarIds(const Instruction& cur_instr,