Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
25241e9e
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
1 年多 前同步成功
通知
696
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
25241e9e
编写于
6月 19, 2018
作者:
G
gongweibao
提交者:
GitHub
6月 19, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix paddle env variables. (#11564)
上级
4116b551
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
56 addition
and
50 deletion
+56
-50
benchmark/fluid/fluid_benchmark.py
benchmark/fluid/fluid_benchmark.py
+1
-1
benchmark/fluid/kube_gen_job.py
benchmark/fluid/kube_gen_job.py
+11
-5
doc/fluid/howto/cluster/fluid_cluster_train_cn.md
doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+2
-2
doc/fluid/howto/cluster/fluid_recordio.md
doc/fluid/howto/cluster/fluid_recordio.md
+2
-2
python/paddle/fluid/tests/book/notest_understand_sentiment.py
...on/paddle/fluid/tests/book/notest_understand_sentiment.py
+5
-5
python/paddle/fluid/tests/book/test_fit_a_line.py
python/paddle/fluid/tests/book/test_fit_a_line.py
+5
-5
python/paddle/fluid/tests/book/test_image_classification.py
python/paddle/fluid/tests/book/test_image_classification.py
+5
-5
python/paddle/fluid/tests/book/test_label_semantic_roles.py
python/paddle/fluid/tests/book/test_label_semantic_roles.py
+5
-5
python/paddle/fluid/tests/book/test_machine_translation.py
python/paddle/fluid/tests/book/test_machine_translation.py
+5
-5
python/paddle/fluid/tests/book/test_recognize_digits.py
python/paddle/fluid/tests/book/test_recognize_digits.py
+5
-5
python/paddle/fluid/tests/book/test_recommender_system.py
python/paddle/fluid/tests/book/test_recommender_system.py
+5
-5
python/paddle/fluid/tests/book/test_word2vec.py
python/paddle/fluid/tests/book/test_word2vec.py
+5
-5
未找到文件。
benchmark/fluid/fluid_benchmark.py
浏览文件 @
25241e9e
...
...
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return
train_program
,
fluid
.
default_startup_program
()
else
:
raise
ValueError
(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
'
PADDLE_
TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
...
...
benchmark/fluid/kube_gen_job.py
浏览文件 @
25241e9e
...
...
@@ -108,10 +108,10 @@ def gen_job():
tn_container
[
"ports"
][
0
][
"containerPort"
]
=
spreadport
envs
.
append
({
"name"
:
"PADDLE_JOB_NAME"
,
"value"
:
args
.
jobname
})
envs
.
append
({
"name"
:
"TRAINERS"
,
"value"
:
str
(
args
.
trainers
)})
envs
.
append
({
"name"
:
"
PADDLE_
TRAINERS"
,
"value"
:
str
(
args
.
trainers
)})
envs
.
append
({
"name"
:
"PSERVERS"
,
"value"
:
str
(
args
.
pservers
)})
envs
.
append
({
"name"
:
"ENTRY"
,
"value"
:
args
.
entry
})
envs
.
append
({
"name"
:
"PADDLE_
INIT
_PORT"
,
"value"
:
str
(
args
.
port
)})
envs
.
append
({
"name"
:
"PADDLE_
PSERVER
_PORT"
,
"value"
:
str
(
args
.
port
)})
envs
.
append
({
"name"
:
"PADDLE_PSERVER_PORT"
,
"value"
:
str
(
args
.
port
)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
...
...
@@ -167,16 +167,22 @@ def gen_job():
tn_container
[
"volumeMounts"
]
=
volumeMounts
ps_container
[
"env"
]
=
envs
ps_container
[
"env"
].
append
({
"name"
:
"TRAINING_ROLE"
,
"value"
:
"PSERVER"
})
ps_container
[
"env"
].
append
({
"name"
:
"PADDLE_TRAINING_ROLE"
,
"value"
:
"PSERVER"
})
tn_container
[
"env"
]
=
envs
if
args
.
disttype
==
"pserver"
:
tn_container
[
"env"
].
append
({
"name"
:
"TRAINING_ROLE"
,
"name"
:
"
PADDLE_
TRAINING_ROLE"
,
"value"
:
"TRAINER"
})
elif
args
.
disttype
==
"nccl2"
or
args
.
disttype
==
"local"
:
# NCCL2 have no training role, set to plain WORKER
tn_container
[
"env"
].
append
({
"name"
:
"TRAINING_ROLE"
,
"value"
:
"WORKER"
})
tn_container
[
"env"
].
append
({
"name"
:
"PADDLE_TRAINING_ROLE"
,
"value"
:
"WORKER"
})
os
.
mkdir
(
args
.
jobname
)
if
args
.
disttype
==
"pserver"
:
...
...
doc/fluid/howto/cluster/fluid_cluster_train_cn.md
浏览文件 @
25241e9e
...
...
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server:
```
bash
PADDLE_
INIT_PORT
=
6174
PADDLE_INIT_PSERVERS
=
192.168.1.2
TRAINERS
=
2
POD_IP
=
192.168.1.2
PADDLE_INIT_TRAINER_ID
=
1
TRAINING_ROLE
=
PSERVER python test_fit_a_line.py
PADDLE_
PSERVER_PORT
=
6174
PADDLE_PSERVER_IPS
=
192.168.1.2
PADDLE_TRAINERS
=
2
PADDLE_CURRENT_IP
=
192.168.1.2
PADDLE_TRAINER_ID
=
1
PADDLE_
TRAINING_ROLE
=
PSERVER python test_fit_a_line.py
```
执行命令后请等待出现提示:
```Server listening on 192.168.1.2:6174 ```
, 表示Parameter Server已经正常启动。
第三步,启动Trainer:
```
bash
PADDLE_
INIT_PORT
=
6174
PADDLE_INIT_PSERVERS
=
192.168.1.3
TRAINERS
=
2
POD_IP
=
192.168.1.3
PADDLE_INIT_TRAINER_ID
=
1
TRAINING_ROLE
=
TRAINER python test_fit_a_line.py
PADDLE_
PSERVER_PORT
=
6174
PADDLE_PSERVER_IPS
=
192.168.1.3
PADDLE_TRAINERS
=
2
PADDLE_CURRENT_IP
=
192.168.1.3
PADDLE_TRAINER_ID
=
1
PADDLE_
TRAINING_ROLE
=
TRAINER python test_fit_a_line.py
```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
...
...
doc/fluid/howto/cluster/fluid_recordio.md
浏览文件 @
25241e9e
...
...
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list
.
append
(
f
)
return
ret_list
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
data_file
=
fluid
.
layers
.
io
.
open_files
(
filenames
=
gen_train_list
(
"./mnist-[0-9]*.recordio"
,
2
,
0
),
thread_num
=
1
,
...
...
python/paddle/fluid/tests/book/notest_understand_sentiment.py
浏览文件 @
25241e9e
...
...
@@ -194,16 +194,16 @@ def train(word_dict,
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_fit_a_line.py
浏览文件 @
25241e9e
...
...
@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local):
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_image_classification.py
浏览文件 @
25241e9e
...
...
@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local):
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_label_semantic_roles.py
浏览文件 @
25241e9e
...
...
@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True):
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_machine_translation.py
浏览文件 @
25241e9e
...
...
@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True):
if
is_local
:
train_loop
(
framework
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_recognize_digits.py
浏览文件 @
25241e9e
...
...
@@ -151,16 +151,16 @@ def train(nn_type,
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_recommender_system.py
浏览文件 @
25241e9e
...
...
@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True):
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
python/paddle/fluid/tests/book/test_word2vec.py
浏览文件 @
25241e9e
...
...
@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
if
is_local
:
train_loop
(
fluid
.
default_main_program
())
else
:
port
=
os
.
getenv
(
"PADDLE_
INIT
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
INIT_PSERVER
S"
)
# ip,ip...
port
=
os
.
getenv
(
"PADDLE_
PSERVER
_PORT"
,
"6174"
)
pserver_ips
=
os
.
getenv
(
"PADDLE_
PSERVER_IP
S"
)
# ip,ip...
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
))
trainers
=
int
(
os
.
getenv
(
"
PADDLE_
TRAINERS"
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_
INIT_
TRAINER_ID"
))
training_role
=
os
.
getenv
(
"TRAINING_ROLE"
,
"TRAINER"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
training_role
=
os
.
getenv
(
"
PADDLE_
TRAINING_ROLE"
,
"TRAINER"
)
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录