PaddlePaddle / PaddleHub
Commit a14bbd43 (unverified)
Authored Dec 23, 2019 by Bin Long; committed by GitHub on Dec 23, 2019
Merge pull request #269 from houj04/multimachine
add multimachine support (mpi) for autofinetune.
Parents: 5f7e926d, 504f6d95
Showing 3 changed files with 182 additions and 17 deletions (+182, -17):
paddlehub/autofinetune/autoft.py      (+30, -5)
paddlehub/autofinetune/mpi_helper.py  (+115, -0)
paddlehub/commands/autofinetune.py    (+37, -12)
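In short, the commit wires mpi4py into the auto-fine-tune search loop: rank 0 broadcasts each population of candidate hyperparameters, every rank evaluates a contiguous slice of it, and the per-rank results are gathered back to rank 0 for feedback. Because the new mpi_helper.py ends in a self-test under __main__ (shown below), the MPI plumbing can be smoke-tested on its own before launching a real search, assuming mpi4py and an MPI runtime such as OpenMPI are installed on every node:

    mpirun -np 2 python paddlehub/autofinetune/mpi_helper.py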
paddlehub/autofinetune/autoft.py

@@ -26,6 +26,7 @@ from tb_paddle import SummaryWriter
 from paddlehub.common.logger import logger
 from paddlehub.common.utils import mkdir
 from paddlehub.autofinetune.evaluator import REWARD_SUM, TMP_HOME
+from paddlehub.autofinetune.mpi_helper import MPIHelper
 
 if six.PY3:
     INF = math.inf
@@ -75,6 +76,12 @@ class BaseTuningStrategy(object):
                 logdir=self._output_dir + '/visualization/pop_{}'.format(i))
             self.writer_pop_trails.append(writer_pop_trail)
 
+        # for parallel on mpi
+        self.mpi = MPIHelper()
+        if self.mpi.multi_machine:
+            print("Autofinetune multimachine mode: running on {}".format(
+                self.mpi.gather(self.mpi.name)))
+
     @property
     def thread(self):
         return self._num_thread
@@ -177,16 +184,22 @@ class BaseTuningStrategy(object):
         solutions_modeldirs = {}
         mkdir(output_dir)
 
-        for idx, solution in enumerate(solutions):
+        solutions = self.mpi.bcast(solutions)
+
+        # split solutions to "solutions for me"
+        range_start, range_end = self.mpi.split_range(len(solutions))
+        my_solutions = solutions[range_start:range_end]
+
+        for idx, solution in enumerate(my_solutions):
             cuda = self.is_cuda_free["free"][0]
             modeldir = output_dir + "/model-" + str(idx) + "/"
             log_file = output_dir + "/log-" + str(idx) + ".info"
             params_cudas_dirs.append([solution, cuda, modeldir, log_file])
-            solutions_modeldirs[tuple(solution)] = modeldir
+            solutions_modeldirs[tuple(solution)] = (modeldir, self.mpi.rank)
             self.is_cuda_free["free"].remove(cuda)
             self.is_cuda_free["busy"].append(cuda)
-            if len(params_cudas_dirs) == self.thread or idx == len(
-                    solutions) - 1:
+            if len(params_cudas_dirs) == self.thread or idx == len(
+                    my_solutions) - 1:
                 tp = ThreadPool(len(params_cudas_dirs))
                 solution_results += tp.map(self.evaluator.run,
                                            params_cudas_dirs)
@@ -198,13 +211,25 @@ class BaseTuningStrategy(object):
                     self.is_cuda_free["busy"].remove(param_cuda[1])
                 params_cudas_dirs = []
 
-        self.feedback(solutions, solution_results)
+        all_solution_results = self.mpi.gather(solution_results)
+        if self.mpi.rank == 0:
+            # only rank 0 needs to feedback
+            all_solution_results = [y for x in all_solution_results for y in x]
+            self.feedback(solutions, all_solution_results)
 
         # remove the tmp.txt which records the eval results for trials
         tmp_file = os.path.join(TMP_HOME, "tmp.txt")
         if os.path.exists(tmp_file):
             os.remove(tmp_file)
-        return solutions_modeldirs
+
+        # collect all solutions_modeldirs
+        collected_solutions_modeldirs = self.mpi.allgather(solutions_modeldirs)
+        return_dict = {}
+        for i in collected_solutions_modeldirs:
+            return_dict.update(i)
+
+        return return_dict
 
 
 class HAZero(BaseTuningStrategy):
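Read together, the three hunks above turn the evaluation loop into a broadcast/split/gather pipeline. Here is a minimal sketch of that pattern using only the MPIHelper methods this commit introduces; `candidates` and `evaluate` are placeholders standing in for the population and for `self.evaluator.run`:

    from paddlehub.autofinetune.mpi_helper import MPIHelper

    mpi = MPIHelper()
    candidates = mpi.bcast(candidates)             # every rank adopts rank 0's population
    start, end = mpi.split_range(len(candidates))  # contiguous slice owned by this rank
    my_results = [evaluate(c) for c in candidates[start:end]]
    all_results = mpi.gather(my_results)           # on rank 0: one list per rank
    if mpi.rank == 0:
        flat = [r for per_rank in all_results for r in per_rank]

The flattening on the last line mirrors the nested comprehension in the final hunk: `gather` hands rank 0 a list of per-rank result lists (and `[my_results]` in the single-process fallback), so it must be flattened before `self.feedback` sees it.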
paddlehub/autofinetune/mpi_helper.py (new file, mode 100755)

#!/usr/bin/env python
# -*- coding: utf-8 -*-


class MPIHelper(object):
    def __init__(self):
        try:
            from mpi4py import MPI
        except ImportError:
            # local run: mpi4py is not available, fall back to a single process
            self._size = 1
            self._rank = 0
            self._multi_machine = False
            import socket
            self._name = socket.gethostname()
        else:
            # in mpi environment
            self._comm = MPI.COMM_WORLD
            self._size = self._comm.Get_size()
            self._rank = self._comm.Get_rank()
            self._name = MPI.Get_processor_name()
            self._multi_machine = self._size > 1

    @property
    def multi_machine(self):
        return self._multi_machine

    @property
    def rank(self):
        return self._rank

    @property
    def size(self):
        return self._size

    @property
    def name(self):
        return self._name

    def bcast(self, data):
        if self._multi_machine:
            # call real bcast
            return self._comm.bcast(data, root=0)
        else:
            # do nothing
            return data

    def gather(self, data):
        if self._multi_machine:
            # call real gather
            return self._comm.gather(data, root=0)
        else:
            # do nothing
            return [data]

    def allgather(self, data):
        if self._multi_machine:
            # call real allgather
            return self._comm.allgather(data)
        else:
            # do nothing
            return [data]

    # calculate the slice of an array that this rank should process
    def split_range(self, array_length):
        if self._size == 1:
            return 0, array_length
        # floor division, so the returned bounds stay valid slice indices
        average_count = array_length // self._size
        if array_length % self._size == 0:
            return average_count * self._rank, average_count * (self._rank + 1)
        else:
            # spread the remainder over the lowest-numbered ranks
            if self._rank < array_length % self._size:
                return (average_count + 1) * self._rank, \
                       (average_count + 1) * (self._rank + 1)
            else:
                start = (average_count + 1) * (array_length % self._size) \
                        + average_count * (self._rank - array_length % self._size)
                return start, start + average_count


if __name__ == "__main__":
    mpi = MPIHelper()
    print("Hello world from process {} of {} at {}.".format(
        mpi.rank, mpi.size, mpi.name))

    all_node_names = mpi.gather(mpi.name)
    print("all node names using gather: {}".format(all_node_names))

    all_node_names = mpi.allgather(mpi.name)
    print("all node names using allgather: {}".format(all_node_names))

    if mpi.rank == 0:
        data = list(range(10))
    else:
        data = None
    data = mpi.bcast(data)
    print("after bcast, process {} has data {}".format(mpi.rank, data))

    data = [i + mpi.rank for i in data]
    print("after modify, process {} has data {}".format(mpi.rank, data))

    new_data = mpi.gather(data)
    print("after gather, process {} has data {}".format(mpi.rank, new_data))

    # test for split_range: every rank reports its slice bounds
    for i in range(12):
        length = i + mpi.size  # length should be >= mpi.size
        start, end = mpi.split_range(length)
        split_result = mpi.gather([start, end])
        print("length {}, split_result {}".format(length, split_result))
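The remainder handling in split_range gives each of the first `array_length % size` ranks one extra element, so the per-rank slices tile the array exactly. As a quick check of the arithmetic, restated here as a free function for illustration (not part of the commit):

    # same arithmetic as MPIHelper.split_range, inlined for a single-process check
    def split_range(rank, size, n):
        avg, rem = n // size, n % size
        if rank < rem:
            return (avg + 1) * rank, (avg + 1) * (rank + 1)
        start = (avg + 1) * rem + avg * (rank - rem)
        return start, start + avg

    print([split_range(r, 3, 10) for r in range(3)])  # [(0, 4), (4, 7), (7, 10)]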
paddlehub/commands/autofinetune.py

@@ -188,37 +188,62 @@ class AutoFineTuneCommand(BaseCommand):
             run_round_cnt = run_round_cnt + 1
 
         print("PaddleHub Autofinetune ends.")
+
+        best_hparams_origin = autoft.get_best_hparams()
+        best_hparams_origin = autoft.mpi.bcast(best_hparams_origin)
+
         with open(autoft._output_dir + "/log_file.txt", "w") as f:
-            best_hparams = evaluator.convert_params(autoft.get_best_hparams())
+            best_hparams = evaluator.convert_params(best_hparams_origin)
             print("The final best hyperparameters:")
             f.write("The final best hyperparameters:\n")
             for index, hparam_name in enumerate(autoft.hparams_name_list):
                 print("%s=%s" % (hparam_name, best_hparams[index]))
                 f.write(hparam_name + "\t:\t" + str(best_hparams[index]) + "\n")
+
+            best_hparams_dir, best_hparams_rank = solutions_modeldirs[tuple(
+                best_hparams_origin)]
+
             print("The final best eval score is %s." %
                   autoft.get_best_eval_value())
-            print("The final best model parameters are saved as " +
-                  autoft._output_dir + "/best_model .")
+
+            if autoft.mpi.multi_machine:
+                print("The final best model parameters are saved as " +
+                      autoft._output_dir + "/best_model on rank " +
+                      str(best_hparams_rank) + " .")
+            else:
+                print("The final best model parameters are saved as " +
+                      autoft._output_dir + "/best_model .")
             f.write("The final best eval score is %s.\n" %
                     autoft.get_best_eval_value())
-            f.write(
-                "The final best model parameters are saved as ./best_model .")
             best_model_dir = autoft._output_dir + "/best_model"
-            shutil.copytree(
-                solutions_modeldirs[tuple(autoft.get_best_hparams())],
-                best_model_dir)
-            f.write("\t".join(autoft.hparams_name_list) +
-                    "\tsaved_params_dir\n")
+            if autoft.mpi.rank == best_hparams_rank:
+                shutil.copytree(best_hparams_dir, best_model_dir)
+
+            if autoft.mpi.multi_machine:
+                f.write("The final best model parameters are saved as ./best_model on rank " \
+                        + str(best_hparams_rank) + " .")
+                f.write("\t".join(autoft.hparams_name_list) +
+                        "\tsaved_params_dir\trank\n")
+            else:
+                f.write(
+                    "The final best model parameters are saved as ./best_model .")
+                f.write("\t".join(autoft.hparams_name_list) +
+                        "\tsaved_params_dir\n")
             print(
                 "The related information about hyperparameters searched is saved as %s/log_file.txt ."
                 % autoft._output_dir)
             for solution, modeldir in solutions_modeldirs.items():
                 param = evaluator.convert_params(solution)
                 param = [str(p) for p in param]
-                f.write("\t".join(param) + "\t" + modeldir + "\n")
+
+                if autoft.mpi.multi_machine:
+                    f.write("\t".join(param) + "\t" + modeldir[0] + "\t" +
+                            str(modeldir[1]) + "\n")
+                else:
+                    f.write("\t".join(param) + "\t" + modeldir[0] + "\n")
 
         return True
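With these changes, every row that log_file.txt records for a searched solution carries the checkpoint directory and, in multi-machine mode, the rank that produced it, matching the saved_params_dir and rank column headers written above. An illustrative tail of the file for a two-hyperparameter search (the names, values, and paths are invented for the example; columns are tab-separated):

    learning_rate	batch_size	saved_params_dir	rank
    5e-05	32	./output/model-0/	0
    0.0001	64	./output/model-3/	1

Single-machine runs keep the old layout without the trailing rank column.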