Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
91ab2b34
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
8
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
91ab2b34
编写于
12月 19, 2019
作者:
K
Kexin Zhao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
small change
上级
4af577ad
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
2 additions
and
115 deletions
+2
-115
parakeet/models/waveflow/configs/waveflow_ljspeech.yaml
parakeet/models/waveflow/configs/waveflow_ljspeech.yaml
+2
-2
parakeet/models/waveflow/slurm.py
parakeet/models/waveflow/slurm.py
+0
-113
未找到文件。
parakeet/models/waveflow/configs/waveflow_ljspeech.yaml
浏览文件 @
91ab2b34
...
@@ -12,8 +12,8 @@ seed: 1234
...
@@ -12,8 +12,8 @@ seed: 1234
learning_rate
:
0.0002
learning_rate
:
0.0002
batch_size
:
8
batch_size
:
8
test_every
:
2000
test_every
:
2000
save_every
:
5000
save_every
:
10000
max_iterations
:
2000000
max_iterations
:
3000000
sigma
:
1.0
sigma
:
1.0
n_flows
:
8
n_flows
:
8
...
...
parakeet/models/waveflow/slurm.py
已删除
100644 → 0
浏览文件 @
4af577ad
"""
Utility module for restarting training when using SLURM.
"""
import
subprocess
import
os
import
sys
import
shlex
import
re
import
time
def job_info():
    """Query SLURM for the parameters of the job this process runs in.

    Runs `scontrol show job <id>` for the id found in the `SLURM_JOB_ID`
    environment variable and splits the output into `Name=Value` pairs.

    Returns:
        dict<str, str>: parameter names (e.g. "UserId", "RunTime", etc)
            mapped to their values; both are kept as strings.
    """
    # Round-trip through int so that a non-numeric job id fails right here.
    current_job = str(int(os.environ["SLURM_JOB_ID"]))
    raw = subprocess.check_output(["scontrol", "show", "job", current_job])
    listing = raw.decode("utf-8")
    # A name is a run of letters or '/'; a value extends to the next
    # space, tab or newline.
    fields = re.compile("([A-Za-z/]*)=([^ \t\n]*)")
    return dict(fields.findall(listing))
def parse_hours(text):
    """Turn an `HH` or `DD-HH` field into a whole number of hours.

    Args:
        text: str, either hours alone or days and hours joined by a dash.

    Returns:
        int, the total number of hours (each day counts as 24).

    Raises:
        ValueError: if there are more than two dash-separated parts or a
            part is not an integer.
    """
    pieces = text.split("-")
    if len(pieces) > 2:
        raise ValueError("Unexpected hour format (expected HH or "
                         "DD-HH, but got {}).".format(text))
    if len(pieces) == 2:
        days, hours = pieces
        return 24 * int(days) + int(hours)
    return int(pieces[0])
def parse_time(text):
    """Convert a slurm time string to an integer number of seconds.

    Expects time to be of the form:
    "hours:minutes:seconds" or "day-hours:minutes:seconds".

    Args:
        text: str, the time string to convert.

    Returns:
        int, the duration in seconds.

    Raises:
        ValueError: if `text` does not have exactly three colon-separated
            fields, or if any field is not an integer. The message always
            names the offending `text`.
    """
    try:
        # Unpack inside the `try` so that a wrong number of fields is
        # reported with the same descriptive message as a non-numeric field.
        hours, minutes, seconds = text.split(":")
        return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds)
    except ValueError as e:
        raise ValueError("Error parsing time {}. Got error {}.".format(
            text, str(e))) from e
def restart_command():
    """Build an `sbatch` command that enqueues a repeat of the current job.

    The command is assembled from the `scontrol` job description returned by
    `job_info()`, the `SLURM_NTASKS` environment variable and `sys.argv`.
    The script itself is re-launched through
    `srun <python3> -u -m paddle.distributed.launch <sys.argv...>`, passed
    to `sbatch` via `--wrap`.

    Return the command as a list of strings, suitable for passing to
    `subprocess.check_call` or similar functions.

    Returns:
        resume_command: list<str>, command to run to restart job.
        end_time: float or None; the time (seconds since the epoch) the job
            will end, or None if the job has unlimited runtime.
    """
    # Poll until `RunTime` can be parsed, and keep the snapshot that passed
    # the check instead of querying SLURM one more time.
    info = job_info()
    while info["RunTime"] == "INVALID":
        time.sleep(1)
        info = job_info()

    num_tasks = int(os.environ["SLURM_NTASKS"])
    gres, partition = info.get("Gres"), info.get("Partition")
    stderr, stdout = info.get("StdErr"), info.get("StdOut")
    job_name = info.get("JobName")

    command = [
        "sbatch",
        "--job-name={}".format(job_name),
        "--ntasks={}".format(num_tasks),
        # NOTE(review): hard-coded node exclusion; presumably specific to
        # the cluster this was written for -- confirm before reuse.
        "--exclude=asimov-186",
    ]

    if partition:
        command.extend(["--partition", partition])

    if gres and gres != "(null)":
        command.extend(["--gres", gres])
        # The count is only reported, so an unexpected gres layout must not
        # abort the restart.
        try:
            num_gpu = int(gres.split(':')[-1])
        except ValueError:
            print("could not read number of gpu from gres {}".format(gres))
        else:
            print("number of gpu assigned by slurm is {}".format(num_gpu))

    if stderr:
        command.extend(["--error", stderr])

    if stdout:
        command.extend(["--output", stdout])

    python = subprocess.check_output(
        ["/usr/bin/which", "python3"]).decode("utf-8").strip()
    dist_setting = ['-m', 'paddle.distributed.launch']
    wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv

    # Quote every argument: `--wrap` takes one shell command string.
    command.append(
        "--wrap={}".format(" ".join(shlex.quote(arg) for arg in wrap_cmd)))

    time_limit_string = info["TimeLimit"]
    if time_limit_string.lower() == "unlimited":
        print(
            "UNLIMITED detected: restart OFF, infinite learning ON.",
            flush=True)
        return command, None

    time_limit = parse_time(time_limit_string)
    runtime = parse_time(info["RunTime"])
    # Remaining wall time = limit - already elapsed, anchored at "now".
    end_time = time.time() + time_limit - runtime
    return command, end_time
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录