PaddlePaddle / PaddleFL
Commit c28289ce (unverified)

Authored Feb 03, 2020 by Qinghe JING; committed via GitHub on Feb 03, 2020.

Merge branch 'master' into secagg

Parents: dff47636, 48fac0af
Showing 25 changed files with 57 additions and 41 deletions (+57 −41).
- contrib/data_safety_training/image_classification/server/receiver.py (+0 −0)
- docs/source/md/quick_start.md (+5 −1)
- paddle_fl/core/scheduler/agent_master.py (+0 −1)
- paddle_fl/core/trainer/diffiehellman/diffiehellman.py (+0 −1)
- paddle_fl/examples/ctr_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/ctr_demo/fl_server.py (+2 −2)
- paddle_fl/examples/ctr_demo/fl_trainer.py (+5 −5)
- paddle_fl/examples/dpsgd_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/dpsgd_demo/fl_server.py (+2 −2)
- paddle_fl/examples/dpsgd_demo/fl_trainer.py (+1 −1)
- paddle_fl/examples/dpsgd_demo/run.sh (+2 −0)
- paddle_fl/examples/femnist_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/femnist_demo/fl_server.py (+2 −2)
- paddle_fl/examples/femnist_demo/fl_trainer.py (+2 −2)
- paddle_fl/examples/femnist_demo/run.sh (+2 −0)
- paddle_fl/examples/gru4rec_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/gru4rec_demo/fl_server.py (+2 −2)
- paddle_fl/examples/gru4rec_demo/fl_trainer.py (+1 −1)
- paddle_fl/examples/secagg_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/secagg_demo/fl_server.py (+2 −2)
- paddle_fl/examples/secagg_demo/fl_trainer.py (+3 −2)
- paddle_fl/examples/submitter_demo/conf.txt (+2 −1)
- paddle_fl/examples/submitter_demo/kill.sh (+3 −1)
- paddle_fl/examples/submitter_demo/scheduler_client.py (+6 −5)
- paddle_fl/examples/submitter_demo/train_program.py (+5 −5)
contrib/data_safety_training/image_classification/server/receive.py → contrib/data_safety_training/image_classification/server/receiver.py (file moved)
docs/source/md/quick_start.md

```diff
@@ -67,7 +67,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
@@ -94,6 +95,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.start()
@@ -122,6 +124,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
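Taken together, the three hunks wire every role to a fixed scheduler port. A minimal sketch of the updated quick-start scheduler script, assembled from the first hunk (assuming only that paddle_fl is installed; endpoints as in the docs):

```python
# Sketch of the updated quick-start scheduler, assembled from the hunk above.
from paddle_fl.core.scheduler.agent_master import FLScheduler

worker_num = 2
server_num = 1
# New in this change: the scheduler port is passed explicitly, and trainers
# and servers point at it via job._scheduler_ep = "127.0.0.1:9091".
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(worker_num)
scheduler.init_env()
print("init env done.")
```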
paddle_fl/core/scheduler/agent_master.py

```diff
@@ -140,4 +140,3 @@ class FLScheduler(object):
             if len(finish_training_dict) == len(worker_dict):
                 all_finish_training = True
             time.sleep(5)
-            loop += 1
```
paddle_fl/core/trainer/diffiehellman/diffiehellman.py

```diff
@@ -44,7 +44,6 @@ try:
 except (AttributeError, ImportError):
     rng = os.urandom
-
 class DiffieHellman:
     """
     Implements the Diffie-Hellman key exchange protocol.
```
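The hunk shows only the except branch of a try block that selects a randomness source. A sketch of the surrounding pattern, assuming the try side (not visible in this diff) prefers an OpenSSL-backed generator:

```python
import os

# Hypothetical reconstruction of the selection pattern around the hunk above:
# prefer ssl.RAND_bytes when available, otherwise fall back to os.urandom.
try:
    from ssl import RAND_bytes
    rng = RAND_bytes
except (AttributeError, ImportError):
    rng = os.urandom

key_material = rng(32)  # 32 random bytes from whichever source was chosen
```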
paddle_fl/examples/ctr_demo/fl_scheduler.py

```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/ctr_demo/fl_server.py

```diff
@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 print("connect")
```
paddle_fl/examples/ctr_demo/fl_trainer.py

```diff
@@ -19,22 +19,22 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
 print(trainer._scheduler_ep, trainer._current_ep)
 output_folder = "fl_model"
-step_i = 0
+epoch_id = 0
 while not trainer.stop():
-    print("batch %d start train" % (step_i))
+    print("batch %d start train" % (epoch_id))
     train_step = 0
     for data in reader():
         trainer.run(feed=data, fetch=[])
         train_step += 1
         if train_step == trainer._step:
             break
-    step_i += 1
-    if step_i % 100 == 0:
+    epoch_id += 1
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
```
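The unchanged context above also documents the demos' endpoint convention; a sketch of just that convention, with names as in the diff:

```python
# Endpoint convention used by the demo trainers: trainer i listens on
# 127.0.0.1:(9000 + i) and reports to the scheduler on port 9091.
import sys

trainer_id = int(sys.argv[1])  # launched as "python fl_trainer.py <id>"
scheduler_ep = "127.0.0.1:9091"
current_ep = "127.0.0.1:{}".format(9000 + trainer_id)  # 9000, 9001, ...
```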
paddle_fl/examples/dpsgd_demo/fl_scheduler.py

```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+#Define number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/dpsgd_demo/fl_server.py

```diff
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/dpsgd_demo/fl_trainer.py

```diff
@@ -13,7 +13,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform scheduler IP address to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
```
paddle_fl/examples/dpsgd_demo/run.sh

```diff
+unset http_proxy
+unset https_proxy
 python fl_master.py
 sleep 2
 python -u fl_scheduler.py > scheduler.log &
```
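The added unset lines presumably keep the demo's loopback traffic (scheduler on 127.0.0.1:9091, trainers on 9000 and up) from being routed through an HTTP proxy. The same hygiene done from Python, as a sketch:

```python
# Sketch: drop proxy settings in-process instead of in run.sh, so that
# connections to 127.0.0.1 endpoints are made directly.
import os

for var in ("http_proxy", "https_proxy"):
    os.environ.pop(var, None)
```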
paddle_fl/examples/femnist_demo/fl_scheduler.py

```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/femnist_demo/fl_server.py

```diff
@@ -7,7 +7,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/femnist_demo/fl_trainer.py

```diff
@@ -14,7 +14,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 print(job._target_names)
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
@@ -40,7 +40,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
 epoch_id = 0
 step = 0
 epoch = 3000
-count_by_step = True
+count_by_step = False
 if count_by_step:
     output_folder = "model_node%d" % trainer_id
 else:
```
paddle_fl/examples/femnist_demo/run.sh

```diff
+unset http_proxy
+unset https_proxy
 #killall python
 python fl_master.py
 sleep 2
```
paddle_fl/examples/gru4rec_demo/fl_scheduler.py

```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/gru4rec_demo/fl_server.py

```diff
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/gru4rec_demo/fl_trainer.py

```diff
@@ -14,7 +14,7 @@ train_file_dir = "mid_data/node4/%d/" % trainer_id
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
```
paddle_fl/examples/secagg_demo/fl_scheduler.py

```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/secagg_demo/fl_server.py

```diff
@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 print("connect")
```
paddle_fl/examples/secagg_demo/fl_trainer.py

```diff
@@ -28,7 +28,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.trainer_id = trainer_id
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
@@ -75,6 +75,7 @@ while not trainer.stop():
     if step_i % 100 == 0:
         print("Epoch: {0}, step: {1}, accuracy: {2}".format(epoch_id, step_i, accuracy[0]))
+        print(step_i)
     avg_loss_val, acc_val = train_test(train_test_program=test_program,
                                        train_test_reader=test_reader,
                                        train_test_feed=feeder)
@@ -82,5 +83,5 @@ while not trainer.stop():
     if epoch_id > 40:
         break
-    if step_i % 100 == 0:
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
```
paddle_fl/examples/submitter_demo/conf.txt

```diff
@@ -2,7 +2,8 @@
 task_name=test_fl_job_submit_jingqinghe
 hdfs_output=/user/feed/mlarch/sequence_generator/dongdaxiang/job_44
 train_cmd=python dist_trainer.py
-monitor_cmd=python system_monitor_app.py 10 100
+#monitor_cmd=python system_monitor_app.py 10 100
+monitor_cmd=
 #train_cmd=python test_hadoop.py
 hdfs_path=afs://xingtian.afs.baidu.com:9902
```
paddle_fl/examples/submitter_demo/kill.sh

```diff
-/home/jingqinghe/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
+unset http_proxy
+unset https_proxy
+/home/jingqinghe/tools/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
```
paddle_fl/examples/submitter_demo/scheduler_client.py

```diff
@@ -18,7 +18,8 @@ print(random_port)
 current_ip = socket.gethostbyname(socket.gethostname())
 endpoints = "{}:{}".format(current_ip, random_port)
 #start a web server for remote endpoints to download their config
-os.system("python -m SimpleHTTPServer 8080 &")
+#os.system("python -m SimpleHTTPServer 8080 &")
+os.system("python -m http.server 8080 &")
 if os.path.exists("job_config"):
     os.system("rm -rf job_config")
 if os.path.exists("package"):
```
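The visible hunk swaps Python 2's SimpleHTTPServer module for its Python 3 replacement, http.server, keeping the shell-out. A hypothetical in-process alternative (not what the commit does) would be:

```python
# Hypothetical alternative to os.system("python -m http.server 8080 &"):
# serve the current directory from a daemon thread instead of a subprocess.
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler

def serve_config_dir(port=8080):
    httpd = HTTPServer(("", port), SimpleHTTPRequestHandler)
    threading.Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd
```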
paddle_fl/examples/submitter_demo/train_program.py

```diff
@@ -89,15 +89,15 @@ else:
 trainer.start()
 print(trainer._scheduler_ep, trainer._current_ep)
 output_folder = "fl_model"
-step_i = 0
+epoch_id = 0
 while not trainer.stop():
     print("batch %d start train" % (step_i))
-    train_step = 0
+    step_i = 0
     for data in reader():
         trainer.run(feed=data, fetch=[])
-        train_step += 1
+        step_i += 1
         if train_step == trainer._step:
             break
-    step_i += 1
-    if step_i % 100 == 0:
+    epoch_id += 1
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
```
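Note that the new side still compares train_step even though this hunk renames that counter to step_i, which looks like a leftover. A sketch of the loop with the comparison renamed to match (an assumption about the intent, not part of the commit):

```python
# Sketch: the renamed loop with the leftover train_step reference aligned
# to the new step_i counter (trainer, reader, output_folder as in the file).
epoch_id = 0
while not trainer.stop():
    step_i = 0
    for data in reader():
        trainer.run(feed=data, fetch=[])
        step_i += 1
        if step_i == trainer._step:  # was: train_step == trainer._step
            break
    epoch_id += 1
    if epoch_id % 5 == 0:
        trainer.save_inference_program(output_folder)
```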