Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
35d5b1b9
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
35d5b1b9
编写于
5月 30, 2018
作者:
X
Xin Pan
提交者:
GitHub
5月 30, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #11036 from panyx0718/dist_timeline
better profiler and benchmark
上级
32d50864
f14e579c
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
87 addition
and
38 deletion
+87
-38
benchmark/fluid/fluid_benchmark.py
benchmark/fluid/fluid_benchmark.py
+20
-12
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+6
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-0
python/paddle/fluid/profiler.py
python/paddle/fluid/profiler.py
+60
-24
未找到文件。
benchmark/fluid/fluid_benchmark.py
浏览文件 @
35d5b1b9
...
@@ -98,6 +98,8 @@ def parse_args():
...
@@ -98,6 +98,8 @@ def parse_args():
'--use_fake_data'
,
'--use_fake_data'
,
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'If set ommit the actual read data operators.'
)
help
=
'If set ommit the actual read data operators.'
)
parser
.
add_argument
(
'--profile'
,
action
=
'store_true'
,
help
=
'If set, profile a few steps.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--update_method'
,
'--update_method'
,
type
=
str
,
type
=
str
,
...
@@ -108,8 +110,8 @@ def parse_args():
...
@@ -108,8 +110,8 @@ def parse_args():
return
args
return
args
def
append_nccl2_prepare
():
def
append_nccl2_prepare
(
trainer_id
):
if
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
None
)
!=
None
:
if
trainer_id
>=
0
:
# append gen_nccl_id at the end of startup program
# append gen_nccl_id at the end of startup program
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
port
=
os
.
getenv
(
"PADDLE_PSERVER_PORT"
)
port
=
os
.
getenv
(
"PADDLE_PSERVER_PORT"
)
...
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
...
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
})
})
return
nccl_id_var
,
num_trainers
,
trainer_id
return
nccl_id_var
,
num_trainers
,
trainer_id
else
:
else
:
raise
Exception
(
raise
Exception
(
"must set positive PADDLE_TRAINER_ID env variables for "
"must set PADDLE_TRAINER_ID env variables for
dist train."
)
"nccl-based
dist train."
)
def
dist_transpile
():
def
dist_transpile
(
trainer_id
):
if
"PADDLE_TRAINING_ROLE"
not
in
os
.
environ
:
if
trainer_id
<
0
:
return
None
,
None
return
None
,
None
# the port of all pservers, needed by both trainer and pserver
# the port of all pservers, needed by both trainer and pserver
...
@@ -158,9 +160,6 @@ def dist_transpile():
...
@@ -158,9 +160,6 @@ def dist_transpile():
trainers
=
int
(
os
.
getenv
(
"PADDLE_TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"PADDLE_TRAINERS"
))
# the IP of the local machine, needed by pserver only
# the IP of the local machine, needed by pserver only
current_endpoint
=
os
.
getenv
(
"PADDLE_CURRENT_IP"
,
""
)
+
":"
+
port
current_endpoint
=
os
.
getenv
(
"PADDLE_CURRENT_IP"
,
""
)
+
":"
+
port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
"0"
))
# the role, should be either PSERVER or TRAINER
# the role, should be either PSERVER or TRAINER
training_role
=
os
.
getenv
(
"PADDLE_TRAINING_ROLE"
)
training_role
=
os
.
getenv
(
"PADDLE_TRAINING_ROLE"
)
...
@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
...
@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
iters
=
0
iters
=
0
start_time
=
time
.
time
()
start_time
=
time
.
time
()
for
batch_id
,
data
in
enumerate
(
train_reader
()):
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
args
.
profile
and
pass_id
==
0
and
batch_id
==
5
:
profiler
.
start_profiler
(
"All"
)
elif
args
.
profile
and
pass_id
==
0
and
batch_id
==
10
:
profiler
.
stop_profiler
(
"total"
,
"/tmp/profile_%d"
%
trainer_id
)
if
iters
==
args
.
skip_batch_num
:
if
iters
==
args
.
skip_batch_num
:
start_time
=
time
.
time
()
start_time
=
time
.
time
()
num_samples
=
0
num_samples
=
0
...
@@ -334,7 +338,11 @@ def print_arguments(args):
...
@@ -334,7 +338,11 @@ def print_arguments(args):
def
main
():
def
main
():
args
=
parse_args
()
args
=
parse_args
()
print_arguments
(
args
)
print_arguments
(
args
)
nccl_id_var
,
num_trainers
,
trainer_id
=
None
,
1
,
0
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var
,
num_trainers
,
trainer_id
=
(
None
,
1
,
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
,
"-1"
)))
if
args
.
use_cprof
:
if
args
.
use_cprof
:
pr
=
cProfile
.
Profile
()
pr
=
cProfile
.
Profile
()
...
@@ -348,7 +356,7 @@ def main():
...
@@ -348,7 +356,7 @@ def main():
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
if
args
.
update_method
==
"pserver"
:
if
args
.
update_method
==
"pserver"
:
train_prog
,
startup_prog
=
dist_transpile
()
train_prog
,
startup_prog
=
dist_transpile
(
trainer_id
)
if
not
train_prog
:
if
not
train_prog
:
raise
Exception
(
raise
Exception
(
"Must configure correct environments to run dist train."
)
"Must configure correct environments to run dist train."
)
...
@@ -364,7 +372,7 @@ def main():
...
@@ -364,7 +372,7 @@ def main():
train_args
.
append
(
fluid
.
default_startup_program
())
train_args
.
append
(
fluid
.
default_startup_program
())
if
args
.
update_method
==
"nccl2"
:
if
args
.
update_method
==
"nccl2"
:
nccl_id_var
,
num_trainers
,
trainer_id
=
append_nccl2_prepare
()
nccl_id_var
,
num_trainers
,
trainer_id
=
append_nccl2_prepare
(
trainer_id
)
if
args
.
gpus
==
1
:
if
args
.
gpus
==
1
:
# NOTE: parallel executor use profiler interanlly
# NOTE: parallel executor use profiler interanlly
if
args
.
use_nvprof
and
args
.
device
==
'GPU'
:
if
args
.
use_nvprof
and
args
.
device
==
'GPU'
:
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
35d5b1b9
...
@@ -38,6 +38,7 @@ struct EventList;
...
@@ -38,6 +38,7 @@ struct EventList;
static
int64_t
profiler_lister_id
=
0
;
static
int64_t
profiler_lister_id
=
0
;
static
bool
should_send_profile_state
=
false
;
static
bool
should_send_profile_state
=
false
;
std
::
mutex
profiler_mu
;
// The profiler state, the initial value is ProfilerState::kDisabled
// The profiler state, the initial value is ProfilerState::kDisabled
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
...
@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
...
@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"Can't enbale profling, since the input state is "
,
"ProfilerState::kDisabled"
);
"ProfilerState::kDisabled"
);
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
state
==
g_state
)
{
if
(
state
==
g_state
)
{
return
;
return
;
}
}
...
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
...
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
}
else
if
(
g_state
==
ProfilerState
::
kAll
)
{
}
else
if
(
g_state
==
ProfilerState
::
kAll
)
{
place
=
"All"
;
place
=
"All"
;
}
else
{
}
else
{
PADDLE_THROW
(
"Invalid profiler state"
);
PADDLE_THROW
(
"Invalid profiler state"
,
g_state
);
}
}
std
::
cout
<<
"Place: "
<<
place
<<
std
::
endl
;
std
::
cout
<<
"Place: "
<<
place
<<
std
::
endl
;
...
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
...
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void
DisableProfiler
(
EventSortingKey
sorted_key
,
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
)
{
const
std
::
string
&
profile_path
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// Mark the profiling stop.
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
,
nullptr
);
Mark
(
"_stop_profiler_"
,
nullptr
);
...
@@ -466,7 +470,7 @@ void SetProfileListener() {
...
@@ -466,7 +470,7 @@ void SetProfileListener() {
std
::
mt19937
rng
;
std
::
mt19937
rng
;
rng
.
seed
(
std
::
random_device
()());
rng
.
seed
(
std
::
random_device
()());
std
::
uniform_int_distribution
<
std
::
mt19937
::
result_type
>
dist6
(
std
::
uniform_int_distribution
<
std
::
mt19937
::
result_type
>
dist6
(
1
,
std
::
numeric_limits
<
std
::
mt19937
::
result_type
>::
max
());
1
,
std
::
numeric_limits
<
int
>::
max
());
profiler_lister_id
=
dist6
(
rng
);
profiler_lister_id
=
dist6
(
rng
);
}
}
int64_t
ListenerId
()
{
return
profiler_lister_id
;
}
int64_t
ListenerId
()
{
return
profiler_lister_id
;
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
35d5b1b9
...
@@ -495,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -495,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"enable_profiler"
,
platform
::
EnableProfiler
);
m
.
def
(
"enable_profiler"
,
platform
::
EnableProfiler
);
m
.
def
(
"disable_profiler"
,
platform
::
DisableProfiler
);
m
.
def
(
"disable_profiler"
,
platform
::
DisableProfiler
);
m
.
def
(
"is_profiler_enabled"
,
platform
::
IsProfileEnabled
);
m
.
def
(
"reset_profiler"
,
platform
::
ResetProfiler
);
m
.
def
(
"reset_profiler"
,
platform
::
ResetProfiler
);
// -- python binds for parallel executor.
// -- python binds for parallel executor.
...
...
python/paddle/fluid/profiler.py
浏览文件 @
35d5b1b9
...
@@ -16,7 +16,10 @@ import core
...
@@ -16,7 +16,10 @@ import core
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
import
os
import
os
__all__
=
[
'cuda_profiler'
,
'reset_profiler'
,
'profiler'
]
__all__
=
[
'cuda_profiler'
,
'reset_profiler'
,
'profiler'
,
'start_profiler'
,
'stop_profiler'
]
NVPROF_CONFIG
=
[
NVPROF_CONFIG
=
[
"gpustarttimestamp"
,
"gpustarttimestamp"
,
...
@@ -72,20 +75,31 @@ def reset_profiler():
...
@@ -72,20 +75,31 @@ def reset_profiler():
core
.
reset_profiler
()
core
.
reset_profiler
()
@
contextmanager
def
start_profiler
(
state
):
def
profiler
(
state
,
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""Enable the profiler.
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
Args:
and GPU program. By defalut, it records the CPU and GPU operator kernels,
state (string) : The profiling state, which should be 'CPU', 'GPU'
if you want to profile other program, you can refer the profiling tutorial
or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
to add more records.
GPU as well. 'All' also generates timeline.
"""
if
core
.
is_profiler_enabled
():
return
if
state
not
in
[
'CPU'
,
'GPU'
,
"All"
]:
raise
ValueError
(
"The state must be 'CPU' or 'GPU' or 'All'."
)
if
state
==
"GPU"
:
prof_state
=
core
.
ProfilerState
.
kCUDA
elif
state
==
"CPU"
:
prof_state
=
core
.
ProfilerState
.
kCPU
else
:
prof_state
=
core
.
ProfilerState
.
kAll
core
.
enable_profiler
(
prof_state
)
def
stop_profiler
(
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""Stop the profiler.
Args:
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
in the order of first end time of events. Otherwise, the profiling
results will be sorted by the this flag. This flag should be one
results will be sorted by the this flag. This flag should be one
...
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
...
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
profile_path (string) : If state == 'All', it will write a profile
profile_path (string) : If state == 'All', it will write a profile
proto output file.
proto output file.
"""
"""
if
state
not
in
[
'CPU'
,
'GPU'
,
"All"
]:
if
not
core
.
is_profiler_enabled
():
raise
ValueError
(
"The state must be 'CPU' or 'GPU' or 'All'."
)
return
if
state
==
"GPU"
:
prof_state
=
core
.
ProfilerState
.
kCUDA
elif
state
==
"CPU"
:
prof_state
=
core
.
ProfilerState
.
kCPU
else
:
prof_state
=
core
.
ProfilerState
.
kAll
core
.
enable_profiler
(
prof_state
)
yield
sorted_key
=
'default'
if
sorted_key
is
None
else
sorted_key
sorted_key
=
'default'
if
sorted_key
is
None
else
sorted_key
if
sorted_key
not
in
[
'default'
,
'calls'
,
'total'
,
'max'
,
'min'
,
'ave'
]:
if
sorted_key
not
in
[
'default'
,
'calls'
,
'total'
,
'max'
,
'min'
,
'ave'
]:
raise
ValueError
(
"The sorted_key must be None or in 'calls', 'total', "
raise
ValueError
(
"The sorted_key must be None or in 'calls', 'total', "
...
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
...
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
# TODO(qingqing) : redirect C++ ostream to Python stream.
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
# with core.ostream_redirect(stdout=True, stderr=True):
core
.
disable_profiler
(
key_map
[
sorted_key
],
profile_path
)
core
.
disable_profiler
(
key_map
[
sorted_key
],
profile_path
)
@
contextmanager
def
profiler
(
state
,
sorted_key
=
None
,
profile_path
=
'/tmp/profile'
):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial
to add more records.
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
results will be sorted by the this flag. This flag should be one
of 'calls', 'total', 'max', 'min' or 'ave'.
The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
profile_path (string) : If state == 'All', it will write a profile
proto output file.
"""
start_profiler
(
state
)
yield
stop_profiler
(
sorted_key
,
profile_path
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录