Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
35d5b1b9
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
35d5b1b9
编写于
5月 30, 2018
作者:
X
Xin Pan
提交者:
GitHub
5月 30, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #11036 from panyx0718/dist_timeline
better profiler and benchmark
上级
32d50864
f14e579c
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
87 addition
and
38 deletion
+87
-38
benchmark/fluid/fluid_benchmark.py
benchmark/fluid/fluid_benchmark.py
+20
-12
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+6
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-0
python/paddle/fluid/profiler.py
python/paddle/fluid/profiler.py
+60
-24
未找到文件。
benchmark/fluid/fluid_benchmark.py
浏览文件 @
35d5b1b9
...
...
@@ -98,6 +98,8 @@ def parse_args():
'--use_fake_data'
,
action
=
'store_true'
,
help
=
'If set ommit the actual read data operators.'
)
parser
.
add_argument
(
'--profile'
,
action
=
'store_true'
,
help
=
'If set, profile a few steps.'
)
parser
.
add_argument
(
'--update_method'
,
type
=
str
,
...
...
@@ -108,8 +110,8 @@ def parse_args():
return
args
def
append_nccl2_prepare
():
if
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
None
)
!=
None
:
def
append_nccl2_prepare
(
trainer_id
):
if
trainer_id
>=
0
:
# append gen_nccl_id at the end of startup program
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
port
=
os
.
getenv
(
"PADDLE_PSERVER_PORT"
)
...
...
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
})
return
nccl_id_var
,
num_trainers
,
trainer_id
else
:
raise
Exception
(
"must set PADDLE_TRAINER_ID env variables for
dist train."
)
raise
Exception
(
"must set positive PADDLE_TRAINER_ID env variables for "
"nccl-based
dist train."
)
def
dist_transpile
():
if
"PADDLE_TRAINING_ROLE"
not
in
os
.
environ
:
def
dist_transpile
(
trainer_id
):
if
trainer_id
<
0
:
return
None
,
None
# the port of all pservers, needed by both trainer and pserver
...
...
@@ -158,9 +160,6 @@ def dist_transpile():
trainers
=
int
(
os
.
getenv
(
"PADDLE_TRAINERS"
))
# the IP of the local machine, needed by pserver only
current_endpoint
=
os
.
getenv
(
"PADDLE_CURRENT_IP"
,
""
)
+
":"
+
port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
"0"
))
# the role, should be either PSERVER or TRAINER
training_role
=
os
.
getenv
(
"PADDLE_TRAINING_ROLE"
)
...
...
@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
iters
=
0
start_time
=
time
.
time
()
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
args
.
profile
and
pass_id
==
0
and
batch_id
==
5
:
profiler
.
start_profiler
(
"All"
)
elif
args
.
profile
and
pass_id
==
0
and
batch_id
==
10
:
profiler
.
stop_profiler
(
"total"
,
"/tmp/profile_%d"
%
trainer_id
)
if
iters
==
args
.
skip_batch_num
:
start_time
=
time
.
time
()
num_samples
=
0
...
...
@@ -334,7 +338,11 @@ def print_arguments(args):
def
main
():
args
=
parse_args
()
print_arguments
(
args
)
nccl_id_var
,
num_trainers
,
trainer_id
=
None
,
1
,
0
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var
,
num_trainers
,
trainer_id
=
(
None
,
1
,
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
"-1"
)))
if
args
.
use_cprof
:
pr
=
cProfile
.
Profile
()
...
...
@@ -348,7 +356,7 @@ def main():
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
if
args
.
update_method
==
"pserver"
:
train_prog
,
startup_prog
=
dist_transpile
()
train_prog
,
startup_prog
=
dist_transpile
(
trainer_id
)
if
not
train_prog
:
raise
Exception
(
"Must configure correct environments to run dist train."
)
...
...
@@ -364,7 +372,7 @@ def main():
train_args
.
append
(
fluid
.
default_startup_program
())
if
args
.
update_method
==
"nccl2"
:
nccl_id_var
,
num_trainers
,
trainer_id
=
append_nccl2_prepare
()
nccl_id_var
,
num_trainers
,
trainer_id
=
append_nccl2_prepare
(
trainer_id
)
if
args
.
gpus
==
1
:
# NOTE: parallel executor uses profiler internally
if
args
.
use_nvprof
and
args
.
device
==
'GPU'
:
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
35d5b1b9
...
...
@@ -38,6 +38,7 @@ struct EventList;
static
int64_t
profiler_lister_id
=
0
;
static
bool
should_send_profile_state
=
false
;
std
::
mutex
profiler_mu
;
// The profiler state, the initial value is ProfilerState::kDisabled
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
...
...
@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"ProfilerState::kDisabled"
);
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
state
==
g_state
)
{
return
;
}
...
...
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
}
else
if
(
g_state
==
ProfilerState
::
kAll
)
{
place
=
"All"
;
}
else
{
PADDLE_THROW
(
"Invalid profiler state"
);
PADDLE_THROW
(
"Invalid profiler state"
,
g_state
);
}
std
::
cout
<<
"Place: "
<<
place
<<
std
::
endl
;
...
...
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
,
nullptr
);
...
...
@@ -466,7 +470,7 @@ void SetProfileListener() {
std
::
mt19937
rng
;
rng
.
seed
(
std
::
random_device
()());
std
::
uniform_int_distribution
<
std
::
mt19937
::
result_type
>
dist6
(
1
,
std
::
numeric_limits
<
std
::
mt19937
::
result_type
>::
max
());
1
,
std
::
numeric_limits
<
int
>::
max
());
profiler_lister_id
=
dist6
(
rng
);
}
int64_t
ListenerId
()
{
return
profiler_lister_id
;
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
35d5b1b9
...
...
@@ -495,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"enable_profiler"
,
platform
::
EnableProfiler
);
m
.
def
(
"disable_profiler"
,
platform
::
DisableProfiler
);
m
.
def
(
"is_profiler_enabled"
,
platform
::
IsProfileEnabled
);
m
.
def
(
"reset_profiler"
,
platform
::
ResetProfiler
);
// -- python binds for parallel executor.
...
...
python/paddle/fluid/profiler.py
浏览文件 @
35d5b1b9
...
...
@@ -16,7 +16,10 @@ import core
from
contextlib
import
contextmanager
import
os
__all__
=
[
'cuda_profiler'
,
'reset_profiler'
,
'profiler'
]
__all__
=
[
'cuda_profiler'
,
'reset_profiler'
,
'profiler'
,
'start_profiler'
,
'stop_profiler'
]
NVPROF_CONFIG
=
[
"gpustarttimestamp"
,
...
...
@@ -72,20 +75,31 @@ def reset_profiler():
core
.
reset_profiler
()
@
contextmanager
def
profiler
(
state
,
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU programs. By default, it records the CPU and GPU operator kernels;
if you want to profile other programs, you can refer to the profiling tutorial
to add more records.
def
start_profiler
(
state
):
"""Enable the profiler.
Args:
state (string) : The profiling state, which should be 'CPU', 'GPU'
or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
GPU as well. 'All' also generates timeline.
"""
if
core
.
is_profiler_enabled
():
return
if
state
not
in
[
'CPU'
,
'GPU'
,
"All"
]:
raise
ValueError
(
"The state must be 'CPU' or 'GPU' or 'All'."
)
if
state
==
"GPU"
:
prof_state
=
core
.
ProfilerState
.
kCUDA
elif
state
==
"CPU"
:
prof_state
=
core
.
ProfilerState
.
kCPU
else
:
prof_state
=
core
.
ProfilerState
.
kAll
core
.
enable_profiler
(
prof_state
)
def
stop_profiler
(
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""Stop the profiler.
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
results will be sorted by this flag. This flag should be one
...
...
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
profile_path (string) : If state == 'All', it will write a profile
proto output file.
"""
if
state
not
in
[
'CPU'
,
'GPU'
,
"All"
]:
raise
ValueError
(
"The state must be 'CPU' or 'GPU' or 'All'."
)
if
state
==
"GPU"
:
prof_state
=
core
.
ProfilerState
.
kCUDA
elif
state
==
"CPU"
:
prof_state
=
core
.
ProfilerState
.
kCPU
else
:
prof_state
=
core
.
ProfilerState
.
kAll
core
.
enable_profiler
(
prof_state
)
yield
if
not
core
.
is_profiler_enabled
():
return
sorted_key
=
'default'
if
sorted_key
is
None
else
sorted_key
if
sorted_key
not
in
[
'default'
,
'calls'
,
'total'
,
'max'
,
'min'
,
'ave'
]:
raise
ValueError
(
"The sorted_key must be None or in 'calls', 'total', "
...
...
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core
.
disable_profiler
(
key_map
[
sorted_key
],
profile_path
)
@
contextmanager
def
profiler
(
state
,
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU programs. By default, it records the CPU and GPU operator kernels;
if you want to profile other programs, you can refer to the profiling tutorial
to add more records.
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
results will be sorted by this flag. This flag should be one
of 'calls', 'total', 'max', 'min' or 'ave'.
The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile
proto output file.
"""
start_profiler
(
state
)
yield
stop_profiler
(
sorted_key
,
profile_path
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录