未验证 提交 25241e9e 编写于 作者: G gongweibao 提交者: GitHub

Fix paddle env variables. (#11564)

上级 4116b551
...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args): ...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return train_program, fluid.default_startup_program() return train_program, fluid.default_startup_program()
else: else:
raise ValueError( raise ValueError(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
) )
......
...@@ -108,10 +108,10 @@ def gen_job(): ...@@ -108,10 +108,10 @@ def gen_job():
tn_container["ports"][0]["containerPort"] = spreadport tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "TRAINERS", "value": str(args.trainers)}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster. # this settings before you run on your own cluster.
...@@ -167,16 +167,22 @@ def gen_job(): ...@@ -167,16 +167,22 @@ def gen_job():
tn_container["volumeMounts"] = volumeMounts tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs ps_container["env"] = envs
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs tn_container["env"] = envs
if args.disttype == "pserver": if args.disttype == "pserver":
tn_container["env"].append({ tn_container["env"].append({
"name": "TRAINING_ROLE", "name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER" "value": "TRAINER"
}) })
elif args.disttype == "nccl2" or args.disttype == "local": elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER # NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname) os.mkdir(args.jobname)
if args.disttype == "pserver": if args.disttype == "pserver":
......
...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book ...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server: 第二步,启动Parameter Server:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
``` ```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer: 第三步,启动Trainer:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
``` ```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
......
...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list.append(f) ret_list.append(f)
return ret_list return ret_list
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
data_file = fluid.layers.io.open_files( data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
thread_num=1, thread_num=1,
......
...@@ -194,16 +194,16 @@ def train(word_dict, ...@@ -194,16 +194,16 @@ def train(word_dict,
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local): ...@@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -200,16 +200,16 @@ def train_main(use_cuda, is_sparse, is_local=True):
if is_local: if is_local:
train_loop(framework.default_main_program()) train_loop(framework.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -151,16 +151,16 @@ def train(nn_type, ...@@ -151,16 +151,16 @@ def train(nn_type,
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True): ...@@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
...@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
if is_local: if is_local:
train_loop(fluid.default_main_program()) train_loop(fluid.default_main_program())
else: else:
port = os.getenv("PADDLE_INIT_PORT", "6174") port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
eplist = [] eplist = []
for ip in pserver_ips.split(","): for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port])) eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port... pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册