Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
76d8b14b
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
76d8b14b
编写于
5月 03, 2018
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add timeline support for distributed training
上级
0595f23e
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
149 addition
and
56 deletion
+149
-56
benchmark/cluster/vgg16/vgg16_fluid.py
benchmark/cluster/vgg16/vgg16_fluid.py
+22
-6
cmake/external/grpc.cmake
cmake/external/grpc.cmake
+1
-1
paddle/fluid/operators/detail/send_recv.proto
paddle/fluid/operators/detail/send_recv.proto
+4
-0
paddle/fluid/operators/detail/sendrecvop_utils.cc
paddle/fluid/operators/detail/sendrecvop_utils.cc
+8
-0
paddle/fluid/operators/detail/variable_response.cc
paddle/fluid/operators/detail/variable_response.cc
+21
-1
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+5
-3
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+26
-9
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+8
-0
tools/timeline.py
tools/timeline.py
+54
-36
未找到文件。
benchmark/cluster/vgg16/vgg16_fluid.py
浏览文件 @
76d8b14b
...
...
@@ -80,6 +80,8 @@ parser.add_argument(
type
=
str
,
default
=
""
,
help
=
"Comma-separated list of hostname:port pairs"
)
parser
.
add_argument
(
"--profile"
,
action
=
'store_true'
,
help
=
"If set, profile a few steps."
)
# Flags for defining the tf.train.Server
parser
.
add_argument
(
...
...
@@ -183,8 +185,8 @@ def main():
start_time
=
time
.
time
()
num_samples
=
0
train_pass_acc
.
reset
()
for
batch_id
,
data
in
enumerate
(
train_reader
()):
ts
=
time
.
time
()
def
run_step
(
batch_id
,
data
):
img_data
=
np
.
array
(
map
(
lambda
x
:
x
[
0
].
reshape
(
data_shape
),
data
)).
astype
(
"float32"
)
...
...
@@ -196,14 +198,28 @@ def main():
feed
=
{
"pixel"
:
img_data
,
"label"
:
y_data
},
fetch_list
=
[
avg_cost
,
batch_acc
,
batch_size
])
return
loss
,
acc
,
b_size
if
args
.
profile
and
args
.
task_index
==
0
:
# warmup.
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>
5
:
break
run_step
(
batch_id
,
data
)
with
profiler
.
profiler
(
'All'
,
'total'
,
'/tmp/profile_vgg'
):
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>
5
:
break
run_step
(
batch_id
,
data
)
for
batch_id
,
data
in
enumerate
(
train_reader
()):
ts
=
time
.
time
()
loss
,
acc
,
b_size
=
run_step
(
batch_id
,
data
)
iters
+=
1
num_samples
+=
len
(
data
)
train_pass_acc
.
add
(
value
=
acc
,
weight
=
b_size
)
print
(
"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f img/s "
%
(
args
.
task_index
,
pass_id
,
iters
,
loss
,
acc
,
len
(
data
)
/
(
time
.
time
()
-
ts
))
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
"Speed = %.2f img/s"
%
(
pass_id
,
iters
,
loss
,
acc
,
len
(
data
)
/
(
time
.
time
()
-
ts
))
)
# The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed
=
time
.
time
()
-
start_time
...
...
cmake/external/grpc.cmake
浏览文件 @
76d8b14b
...
...
@@ -33,7 +33,7 @@ ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
GIT_REPOSITORY
"https://github.com/grpc/grpc.git"
GIT_TAG
"v1.
10
.x"
GIT_TAG
"v1.
8
.x"
PREFIX
${
GRPC_SOURCES_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
paddle/fluid/operators/detail/send_recv.proto
浏览文件 @
76d8b14b
...
...
@@ -69,6 +69,10 @@ message VariableMessage {
bytes
rows
=
9
;
// Look up table block execution output variable name.
string
out_varname
=
10
;
// If true, the ps server will start profiling, the ps
// server stops profiling and generates a profile to /tmp/profile_ps_*
// when profile switches from true to false.
bool
profile
=
11
;
}
message
VoidMessage
{}
paddle/fluid/operators/detail/sendrecvop_utils.cc
浏览文件 @
76d8b14b
...
...
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void
*
payload
=
nullptr
;
size_t
payload_size
;
ProtoEncodeHelper
e
(
static_cast
<
char
*>
(
buf
),
1024
);
// Note: normally the profiler is enabled in 1 trainer, hence only
// 1 trainer returns true for ShouldSendProfileState(). It tells PS
// servers the trainer's profiling state so that PS can follow the
// trainer.
if
(
platform
::
ShouldSendProfileState
())
{
e
.
WriteBool
(
VarMsg
::
kProfileFieldNumber
,
platform
::
IsProfileEnabled
());
}
e
.
WriteString
(
VarMsg
::
kVarnameFieldNumber
,
name
);
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
e
.
WriteUint64
(
VarMsg
::
kTypeFieldNumber
,
0
);
...
...
paddle/fluid/operators/detail/variable_response.cc
浏览文件 @
76d8b14b
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
...
...
@@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) {
meta_
.
set_out_varname
(
temp
);
break
;
}
case
sendrecv
::
VariableMessage
::
kProfileFieldNumber
:
{
bool
profiling
;
if
(
!
input
.
ReadRaw
(
reinterpret_cast
<
void
*>
(
&
profiling
),
1
))
{
return
tag
;
}
meta_
.
set_profile
(
profiling
);
int64_t
lisner_id
=
platform
::
ListenerId
();
if
(
lisner_id
<=
0
)
{
break
;
}
if
(
profiling
&&
!
platform
::
IsProfileEnabled
())
{
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kCPU
);
}
else
if
(
!
profiling
&&
platform
::
IsProfileEnabled
())
{
// TODO(panyx0718): Should we allow to customize file dir.
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kDefault
,
string
::
Sprintf
(
"/tmp/profile_ps_%lld"
,
lisner_id
));
}
break
;
}
default:
{
// Unknown tag, return unknown error.
return
-
1
;
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
76d8b14b
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
void
ListenAndServOp
::
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
{
// Mark this as PS that it should decide profiling by listening from trainer.
platform
::
SetProfileLisener
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
Scope
&
recv_scope
=
scope
.
NewScope
();
...
...
@@ -328,9 +331,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_
->
WaitServerReady
();
// Write to a file of server selected port for python use.
std
::
string
file_path
=
string
::
Sprintf
(
"/tmp/paddle.%d.selected_port"
,
static_cast
<
int
>
(
::
getpid
()));
std
::
string
file_path
=
string
::
Sprintf
(
"/tmp/paddle.%d.selected_port"
,
static_cast
<
int
>
(
::
getpid
()));
SavePort
(
file_path
);
if
(
sync_mode
)
{
RunSyncLoop
(
&
executor
,
program
,
&
recv_scope
,
prefetch_block
);
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
76d8b14b
...
...
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <iomanip>
#include <limits>
#include <map>
#include <mutex> // NOLINT
#include <random>
#include <string>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
...
...
@@ -33,6 +36,9 @@ namespace platform {
struct
EventList
;
static
int64_t
profiler_lister_id
=
0
;
static
bool
should_send_profile_state
=
false
;
// The profiler state, the initial value is ProfilerState::kDisabled
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
// The thread local event list only can be accessed by the specific thread
...
...
@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"ProfilerState::kDisabled"
);
PADDLE_ENFORCE
(
g_state
==
ProfilerState
::
kDisabled
,
"The profiling state should be disabled when calling "
,
"EnableProfiler."
);
g_state
=
state
;
if
(
g_state
==
ProfilerState
::
kAll
)
{
GetDeviceTracer
()
->
Enable
();
if
(
state
==
g_state
)
{
return
;
}
g_state
=
state
;
should_send_profile_state
=
true
;
GetDeviceTracer
()
->
Enable
();
#ifdef PADDLE_WITH_CUDA
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
// Generate some dummy events first to reduce the startup overhead.
...
...
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
)
{
PADDLE_ENFORCE
(
g_state
!=
ProfilerState
::
kDisabled
,
"Can't disable profiling, since it's not starting."
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
,
nullptr
);
...
...
@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
ParseEvents
(
all_events
,
sorted_key
);
ResetProfiler
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
g_state
==
ProfilerState
::
kAll
&&
tracer
&&
tracer
->
IsEnabled
())
{
if
(
tracer
->
IsEnabled
())
{
tracer
->
Disable
();
tracer
->
GenProfile
(
profile_path
);
}
g_state
=
ProfilerState
::
kDisabled
;
should_send_profile_state
=
true
;
}
bool
IsProfileEnabled
()
{
return
g_state
!=
ProfilerState
::
kDisabled
;
}
bool
ShouldSendProfileState
()
{
return
should_send_profile_state
;
}
void
SetProfileLisener
()
{
std
::
mt19937
rng
;
rng
.
seed
(
std
::
random_device
()());
std
::
uniform_int_distribution
<
std
::
mt19937
::
result_type
>
dist6
(
1
,
std
::
numeric_limits
<
int64_t
>::
max
());
profiler_lister_id
=
dist6
(
rng
);
}
int64_t
ListenerId
()
{
return
profiler_lister_id
;
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/profiler.h
浏览文件 @
76d8b14b
...
...
@@ -114,5 +114,13 @@ void ResetProfiler();
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
);
// Test if the profiler is currently enabled.
bool
IsProfileEnabled
();
// Whether the trainer should send profiling state to PS.
bool
ShouldSendProfileState
();
// Mark current process as PS by assigning a lister id.
void
SetProfileLisener
();
int64_t
ListenerId
();
}
// namespace platform
}
// namespace paddle
tools/timeline.py
浏览文件 @
76d8b14b
...
...
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
'--profile_path'
,
type
=
str
,
default
=
''
,
help
=
'Input profile file name.'
)
'--profile_path'
,
type
=
str
,
default
=
''
,
help
=
'Input profile file name. If there are multiple file, the format '
'should be trainer1=file1,trainer2=file2,ps=file3'
)
parser
.
add_argument
(
'--timeline_path'
,
type
=
str
,
default
=
''
,
help
=
'Output timeline file name.'
)
args
=
parser
.
parse_args
()
...
...
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):
class
Timeline
(
object
):
def
__init__
(
self
,
profile_
pb
):
self
.
_profile_
pb
=
profile_pb
def
__init__
(
self
,
profile_
dict
):
self
.
_profile_
dict
=
profile_dict
self
.
_pid
=
0
self
.
_devices
=
dict
()
self
.
_chrome_trace
=
_ChromeTraceFormatter
()
...
...
@@ -120,35 +124,37 @@ class Timeline(object):
return
cur_pid
def
_allocate_pids
(
self
):
for
event
in
self
.
_profile_pb
.
events
:
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
if
(
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
event
.
device_id
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"cpu:block:%d"
%
(
event
.
device_id
),
pid
)
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
if
(
event
.
device_id
,
"GPUKernel"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"gpu:%d"
%
(
event
.
device_id
),
pid
)
for
k
,
profile_pb
in
self
.
_profile_dict
.
iteritems
():
for
event
in
profile_pb
.
events
:
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
if
(
k
,
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
k
,
event
.
device_id
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"%s:cpu:block:%d"
%
(
k
,
event
.
device_id
),
pid
)
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
if
(
k
,
event
.
device_id
,
"GPUKernel"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
k
,
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"%s:gpu:%d"
%
(
k
,
event
.
device_id
),
pid
)
def
_allocate_events
(
self
):
for
event
in
self
.
_profile_pb
.
events
:
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
type
=
"CPU"
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
type
=
"GPUKernel"
pid
=
self
.
_devices
[(
event
.
device_id
,
type
)]
args
=
{
'name'
:
event
.
name
}
if
event
.
memcopy
.
bytes
>
0
:
args
=
{
'mem_bytes'
:
event
.
memcopy
.
bytes
}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self
.
_chrome_trace
.
emit_region
(
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
for
k
,
profile_pb
in
self
.
_profile_dict
.
iteritems
():
for
event
in
profile_pb
.
events
:
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
type
=
"CPU"
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
type
=
"GPUKernel"
pid
=
self
.
_devices
[(
k
,
event
.
device_id
,
type
)]
args
=
{
'name'
:
event
.
name
}
if
event
.
memcopy
.
bytes
>
0
:
args
=
{
'mem_bytes'
:
event
.
memcopy
.
bytes
}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self
.
_chrome_trace
.
emit_region
(
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
def
generate_chrome_trace
(
self
):
self
.
_allocate_pids
()
...
...
@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline'
if
args
.
timeline_path
:
timeline_path
=
args
.
timeline_path
with
open
(
profile_path
,
'r'
)
as
f
:
profile_s
=
f
.
read
()
profile_pb
=
profiler_pb2
.
Profile
()
profile_pb
.
ParseFromString
(
profile_s
)
tl
=
Timeline
(
profile_pb
)
profile_paths
=
profile_path
.
split
(
','
)
profile_dict
=
dict
()
if
len
(
profile_path
)
==
1
:
with
open
(
profile_path
,
'r'
)
as
f
:
profile_s
=
f
.
read
()
profile_pb
=
profiler_pb2
.
Profile
()
profile_pb
.
ParseFromString
(
profile_s
)
profile_dict
[
'trainer'
]
=
profile_pb
else
:
for
profile_path
in
profile_paths
:
k
,
v
=
profile_path
.
split
(
'='
)
with
open
(
v
,
'r'
)
as
f
:
profile_s
=
f
.
read
()
profile_pb
=
profiler_pb2
.
Profile
()
profile_pb
.
ParseFromString
(
profile_s
)
profile_dict
[
k
]
=
profile_pb
tl
=
Timeline
(
profile_dict
)
with
open
(
timeline_path
,
'w'
)
as
f
:
f
.
write
(
tl
.
generate_chrome_trace
())
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录