Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f76a32df
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f76a32df
编写于
10月 14, 2019
作者:
T
Thunderbrook
提交者:
GitHub
10月 14, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
dump fix dov vec file num (#20539)
* support dump multi file test=develop * dump fix num file test=develop
上级
bf6470c7
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
42 addition
and
13 deletion
+42
-13
paddle/fluid/framework/dist_multi_trainer.cc
paddle/fluid/framework/dist_multi_trainer.cc
+24
-10
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+5
-3
paddle/fluid/framework/trainer_desc.proto
paddle/fluid/framework/trainer_desc.proto
+3
-0
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
...e/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+1
-0
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
...ncubate/fleet/parameter_server/pslib/optimizer_factory.py
+1
-0
python/paddle/fluid/trainer_desc.py
python/paddle/fluid/trainer_desc.py
+6
-0
python/paddle/fluid/trainer_factory.py
python/paddle/fluid/trainer_factory.py
+2
-0
未找到文件。
paddle/fluid/framework/dist_multi_trainer.cc
浏览文件 @
f76a32df
...
@@ -41,6 +41,8 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
...
@@ -41,6 +41,8 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
}
}
}
}
mpi_rank_
=
trainer_desc
.
mpi_rank
()
/
2
;
mpi_rank_
=
trainer_desc
.
mpi_rank
()
/
2
;
mpi_size_
=
trainer_desc
.
mpi_size
()
/
2
;
dump_file_num_
=
trainer_desc
.
dump_file_num
();
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
dataset
->
GetReaders
();
dataset
->
GetReaders
();
...
@@ -68,20 +70,25 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
...
@@ -68,20 +70,25 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
SetDebug
(
trainer_desc
.
debug
());
SetDebug
(
trainer_desc
.
debug
());
}
}
void
DistMultiTrainer
::
DumpWork
()
{
void
DistMultiTrainer
::
DumpWork
(
int
tid
)
{
#ifdef _LINUX
#ifdef _LINUX
int
err_no
=
0
;
std
::
string
path
=
string
::
format_string
(
"%s/part-%03d-%05d"
,
dump_fields_path_
.
c_str
(),
mpi_rank_
,
tid
);
std
::
shared_ptr
<
FILE
>
fp
=
fs_open_write
(
path
,
&
err_no
,
dump_converter_
);
while
(
1
)
{
while
(
1
)
{
std
::
string
out_str
;
std
::
string
out_str
;
if
(
!
queue_
->
Get
(
out_str
))
{
if
(
!
queue_
->
Get
(
out_str
))
{
break
;
break
;
}
}
size_t
write_count
=
size_t
write_count
=
fwrite_unlocked
(
out_str
.
data
(),
1
,
out_str
.
length
(),
fp
_
.
get
());
fwrite_unlocked
(
out_str
.
data
(),
1
,
out_str
.
length
(),
fp
.
get
());
if
(
write_count
!=
out_str
.
length
())
{
if
(
write_count
!=
out_str
.
length
())
{
VLOG
(
3
)
<<
"dump text failed"
;
VLOG
(
3
)
<<
"dump text failed"
;
continue
;
continue
;
}
}
write_count
=
fwrite_unlocked
(
"
\n
"
,
1
,
1
,
fp
_
.
get
());
write_count
=
fwrite_unlocked
(
"
\n
"
,
1
,
1
,
fp
.
get
());
if
(
write_count
!=
1
)
{
if
(
write_count
!=
1
)
{
VLOG
(
3
)
<<
"dump text failed"
;
VLOG
(
3
)
<<
"dump text failed"
;
continue
;
continue
;
...
@@ -92,20 +99,27 @@ void DistMultiTrainer::DumpWork() {
...
@@ -92,20 +99,27 @@ void DistMultiTrainer::DumpWork() {
void
DistMultiTrainer
::
InitDumpEnv
()
{
void
DistMultiTrainer
::
InitDumpEnv
()
{
queue_
=
paddle
::
framework
::
MakeChannel
<
std
::
string
>
();
queue_
=
paddle
::
framework
::
MakeChannel
<
std
::
string
>
();
int
err_no
=
0
;
std
::
string
path
=
string
::
format_string
(
"%s/part-%03d"
,
dump_fields_path_
.
c_str
(),
mpi_rank_
);
fp_
=
fs_open_write
(
path
,
&
err_no
,
dump_converter_
);
for
(
int
i
=
0
;
i
<
thread_num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
thread_num_
;
++
i
)
{
workers_
[
i
]
->
SetChannelWriter
(
queue_
.
get
());
workers_
[
i
]
->
SetChannelWriter
(
queue_
.
get
());
}
}
dump_thread_
=
std
::
thread
(
&
DistMultiTrainer
::
DumpWork
,
this
);
dump_thread_num_
=
1
;
if
(
dump_file_num_
>
mpi_size_
)
{
dump_thread_num_
=
dump_file_num_
/
mpi_size_
;
if
(
dump_file_num_
%
mpi_size_
>
mpi_rank_
)
{
dump_thread_num_
+=
1
;
}
}
for
(
int
i
=
0
;
i
<
dump_thread_num_
;
i
++
)
{
dump_thread_
.
push_back
(
std
::
thread
(
std
::
bind
(
&
DistMultiTrainer
::
DumpWork
,
this
,
i
)));
}
}
}
void
DistMultiTrainer
::
FinalizeDumpEnv
()
{
void
DistMultiTrainer
::
FinalizeDumpEnv
()
{
queue_
->
Close
();
queue_
->
Close
();
dump_thread_
.
join
();
for
(
auto
&
th
:
dump_thread_
)
{
th
.
join
();
}
queue_
.
reset
();
queue_
.
reset
();
}
}
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
f76a32df
...
@@ -93,13 +93,13 @@ class DistMultiTrainer : public MultiTrainer {
...
@@ -93,13 +93,13 @@ class DistMultiTrainer : public MultiTrainer {
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
virtual
void
FinalizeDumpEnv
();
virtual
void
FinalizeDumpEnv
();
virtual
void
InitDumpEnv
();
virtual
void
InitDumpEnv
();
virtual
void
DumpWork
();
virtual
void
DumpWork
(
int
tid
);
virtual
Scope
*
GetWorkerScope
(
int
thread_id
)
{
return
root_scope_
;
}
virtual
Scope
*
GetWorkerScope
(
int
thread_id
)
{
return
root_scope_
;
}
protected:
protected:
std
::
shared_ptr
<
paddle
::
framework
::
PullDenseWorker
>
pull_dense_worker_
;
std
::
shared_ptr
<
paddle
::
framework
::
PullDenseWorker
>
pull_dense_worker_
;
std
::
thread
dump_thread_
;
std
::
vector
<
std
::
thread
>
dump_thread_
;
std
::
shared_ptr
<
FILE
>
fp
_
;
int
dump_thread_num
_
;
std
::
shared_ptr
<
paddle
::
framework
::
ChannelObject
<
std
::
string
>>
queue_
;
std
::
shared_ptr
<
paddle
::
framework
::
ChannelObject
<
std
::
string
>>
queue_
;
bool
need_dump_field_
;
bool
need_dump_field_
;
...
@@ -107,6 +107,8 @@ class DistMultiTrainer : public MultiTrainer {
...
@@ -107,6 +107,8 @@ class DistMultiTrainer : public MultiTrainer {
std
::
string
dump_converter_
;
std
::
string
dump_converter_
;
std
::
vector
<
std
::
string
>
dump_fields_
;
std
::
vector
<
std
::
string
>
dump_fields_
;
int
mpi_rank_
;
int
mpi_rank_
;
int
mpi_size_
;
int
dump_file_num_
;
};
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
paddle/fluid/framework/trainer_desc.proto
浏览文件 @
f76a32df
...
@@ -40,6 +40,9 @@ message TrainerDesc {
...
@@ -40,6 +40,9 @@ message TrainerDesc {
repeated
string
dump_fields
=
13
;
repeated
string
dump_fields
=
13
;
optional
string
dump_converter
=
14
;
optional
string
dump_converter
=
14
;
optional
int32
mpi_size
=
16
[
default
=
-
1
];
optional
int32
dump_file_num
=
17
[
default
=
16
];
// device worker parameters
// device worker parameters
optional
HogwildWorkerParameter
hogwild_param
=
101
;
optional
HogwildWorkerParameter
hogwild_param
=
101
;
optional
DownpourWorkerParameter
downpour_param
=
103
;
optional
DownpourWorkerParameter
downpour_param
=
103
;
...
...
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
浏览文件 @
f76a32df
...
@@ -588,6 +588,7 @@ class DownpourOptimizer(DistributedOptimizer):
...
@@ -588,6 +588,7 @@ class DownpourOptimizer(DistributedOptimizer):
no_grad_set
,
no_grad_set
,
self
.
_strategy
)
self
.
_strategy
)
opt_info
[
"mpi_rank"
]
=
fleet
.
_role_maker
.
_get_rank
()
opt_info
[
"mpi_rank"
]
=
fleet
.
_role_maker
.
_get_rank
()
opt_info
[
"mpi_size"
]
=
fleet
.
_role_maker
.
_get_size
()
fleet
.
_set_opt_info
(
opt_info
)
fleet
.
_set_opt_info
(
opt_info
)
programs
=
[
loss
.
block
.
program
for
loss
in
losses
]
programs
=
[
loss
.
block
.
program
for
loss
in
losses
]
...
...
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
浏览文件 @
f76a32df
...
@@ -251,6 +251,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
...
@@ -251,6 +251,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
opt_info
[
"dump_slot"
]
=
False
opt_info
[
"dump_slot"
]
=
False
opt_info
[
"dump_converter"
]
=
""
opt_info
[
"dump_converter"
]
=
""
opt_info
[
"dump_fields"
]
=
strategy
.
get
(
"dump_fields"
,
[])
opt_info
[
"dump_fields"
]
=
strategy
.
get
(
"dump_fields"
,
[])
opt_info
[
"dump_file_num"
]
=
strategy
.
get
(
"dump_file_num"
,
16
)
opt_info
[
"dump_fields_path"
]
=
strategy
.
get
(
"dump_fields_path"
,
""
)
opt_info
[
"dump_fields_path"
]
=
strategy
.
get
(
"dump_fields_path"
,
""
)
if
server
.
_server
.
downpour_server_param
.
downpour_table_param
[
if
server
.
_server
.
downpour_server_param
.
downpour_table_param
[
0
].
accessor
.
accessor_class
==
"DownpourCtrAccessor"
:
0
].
accessor
.
accessor_class
==
"DownpourCtrAccessor"
:
...
...
python/paddle/fluid/trainer_desc.py
浏览文件 @
f76a32df
...
@@ -84,6 +84,9 @@ class TrainerDesc(object):
...
@@ -84,6 +84,9 @@ class TrainerDesc(object):
def
_set_mpi_rank
(
self
,
mpi_rank
):
def
_set_mpi_rank
(
self
,
mpi_rank
):
self
.
proto_desc
.
mpi_rank
=
mpi_rank
self
.
proto_desc
.
mpi_rank
=
mpi_rank
def
_set_mpi_size
(
self
,
mpi_size
):
self
.
proto_desc
.
mpi_size
=
mpi_size
def
_set_dump_fields
(
self
,
dump_fields
):
def
_set_dump_fields
(
self
,
dump_fields
):
for
field
in
dump_fields
:
for
field
in
dump_fields
:
self
.
proto_desc
.
dump_fields
.
append
(
field
)
self
.
proto_desc
.
dump_fields
.
append
(
field
)
...
@@ -91,6 +94,9 @@ class TrainerDesc(object):
...
@@ -91,6 +94,9 @@ class TrainerDesc(object):
def
_set_dump_fields_path
(
self
,
path
):
def
_set_dump_fields_path
(
self
,
path
):
self
.
proto_desc
.
dump_fields_path
=
path
self
.
proto_desc
.
dump_fields_path
=
path
def
_set_dump_file_num
(
self
,
dump_file_num
):
self
.
proto_desc
.
dump_file_num
=
dump_file_num
def
_set_dump_converter
(
self
,
converter
):
def
_set_dump_converter
(
self
,
converter
):
self
.
proto_desc
.
dump_converter
=
converter
self
.
proto_desc
.
dump_converter
=
converter
...
...
python/paddle/fluid/trainer_factory.py
浏览文件 @
f76a32df
...
@@ -47,8 +47,10 @@ class TrainerFactory(object):
...
@@ -47,8 +47,10 @@ class TrainerFactory(object):
trainer
.
_set_scale_datanorm
(
opt_info
[
"scale_datanorm"
])
trainer
.
_set_scale_datanorm
(
opt_info
[
"scale_datanorm"
])
trainer
.
_set_dump_slot
(
opt_info
[
"dump_slot"
])
trainer
.
_set_dump_slot
(
opt_info
[
"dump_slot"
])
trainer
.
_set_mpi_rank
(
opt_info
[
"mpi_rank"
])
trainer
.
_set_mpi_rank
(
opt_info
[
"mpi_rank"
])
trainer
.
_set_mpi_size
(
opt_info
[
"mpi_size"
])
trainer
.
_set_dump_fields
(
opt_info
[
"dump_fields"
])
trainer
.
_set_dump_fields
(
opt_info
[
"dump_fields"
])
trainer
.
_set_dump_fields_path
(
opt_info
[
"dump_fields_path"
])
trainer
.
_set_dump_fields_path
(
opt_info
[
"dump_fields_path"
])
trainer
.
_set_dump_file_num
(
opt_info
[
"dump_file_num"
])
trainer
.
_set_dump_converter
(
opt_info
[
"dump_converter"
])
trainer
.
_set_dump_converter
(
opt_info
[
"dump_converter"
])
trainer
.
_set_adjust_ins_weight
(
opt_info
[
"adjust_ins_weight"
])
trainer
.
_set_adjust_ins_weight
(
opt_info
[
"adjust_ins_weight"
])
trainer
.
_set_device_worker
(
device_worker
)
trainer
.
_set_device_worker
(
device_worker
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录