PaddlePaddle / PaddleFL
Commit c28289ce (unverified)

Merge branch 'master' into secagg

Authored by Qinghe JING on Feb 03, 2020; committed by GitHub on Feb 03, 2020.
Parents: dff47636, 48fac0af
Showing 25 changed files with 57 additions and 41 deletions (+57 −41).
Files changed:

- contrib/data_safety_training/image_classification/server/receiver.py (+0 −0)
- docs/source/md/quick_start.md (+5 −1)
- paddle_fl/core/scheduler/agent_master.py (+0 −1)
- paddle_fl/core/trainer/diffiehellman/diffiehellman.py (+0 −1)
- paddle_fl/examples/ctr_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/ctr_demo/fl_server.py (+2 −2)
- paddle_fl/examples/ctr_demo/fl_trainer.py (+5 −5)
- paddle_fl/examples/dpsgd_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/dpsgd_demo/fl_server.py (+2 −2)
- paddle_fl/examples/dpsgd_demo/fl_trainer.py (+1 −1)
- paddle_fl/examples/dpsgd_demo/run.sh (+2 −0)
- paddle_fl/examples/femnist_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/femnist_demo/fl_server.py (+2 −2)
- paddle_fl/examples/femnist_demo/fl_trainer.py (+2 −2)
- paddle_fl/examples/femnist_demo/run.sh (+2 −0)
- paddle_fl/examples/gru4rec_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/gru4rec_demo/fl_server.py (+2 −2)
- paddle_fl/examples/gru4rec_demo/fl_trainer.py (+1 −1)
- paddle_fl/examples/secagg_demo/fl_scheduler.py (+2 −1)
- paddle_fl/examples/secagg_demo/fl_server.py (+2 −2)
- paddle_fl/examples/secagg_demo/fl_trainer.py (+3 −2)
- paddle_fl/examples/submitter_demo/conf.txt (+2 −1)
- paddle_fl/examples/submitter_demo/kill.sh (+3 −1)
- paddle_fl/examples/submitter_demo/scheduler_client.py (+6 −5)
- paddle_fl/examples/submitter_demo/train_program.py (+5 −5)
contrib/data_safety_training/image_classification/server/receive.py → contrib/data_safety_training/image_classification/server/receiver.py

File moved.
docs/source/md/quick_start.md
````diff
@@ -67,7 +67,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
@@ -94,6 +95,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.start()
@@ -122,6 +124,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 ```
````
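Taken together, the quick-start changes pin all three roles to one scheduler endpoint: the scheduler listens on a port passed to its constructor, and each trainer and server points its job's `_scheduler_ep` at the same address. A minimal sketch of the scheduler process, assembled only from the calls shown in the diff above (9091 is the docs' example port):

```python
# Minimal scheduler process, using the API shown in the quick_start.md diff.
from paddle_fl.core.scheduler.agent_master import FLScheduler

worker_num = 2  # trainers expected to join
server_num = 1  # parameter servers expected to join

# The port given here must match the "_scheduler_ep" every trainer and
# server sets, e.g. "127.0.0.1:9091".
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(worker_num)
scheduler.init_env()
print("init env done.")
```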
paddle_fl/core/scheduler/agent_master.py
```diff
@@ -140,4 +140,3 @@ class FLScheduler(object):
             if len(finish_training_dict) == len(worker_dict):
                 all_finish_training = True
             time.sleep(5)
-            loop += 1
```
paddle_fl/core/trainer/diffiehellman/diffiehellman.py
```diff
@@ -44,7 +44,6 @@ try:
 except (AttributeError, ImportError):
     rng = os.urandom

-
 class DiffieHellman:
     """
     Implements the Diffie-Hellman key exchange protocol.
```
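The kept context selects the module's randomness source at import time. A sketch of the full fallback pattern, assuming the try body prefers OpenSSL's CSPRNG (only the `except` clause and the `os.urandom` fallback are visible in the diff; the try body is an assumption):

```python
import os

try:
    import ssl  # may be unavailable on minimal builds -> ImportError
    # ssl.RAND_bytes is absent on some versions -> AttributeError
    rng = ssl.RAND_bytes
except (AttributeError, ImportError):
    rng = os.urandom  # fall back to the OS entropy source

key_material = rng(32)  # either callable returns n random bytes
```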
paddle_fl/examples/ctr_demo/fl_scheduler.py
```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/ctr_demo/fl_server.py
```diff
@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 print("connect")
```
paddle_fl/examples/ctr_demo/fl_trainer.py
```diff
@@ -19,22 +19,22 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
 print(trainer._scheduler_ep, trainer._current_ep)
 output_folder = "fl_model"
-step_i = 0
+epoch_id = 0
 while not trainer.stop():
-    print("batch %d start train" % (step_i))
+    print("batch %d start train" % (epoch_id))
     train_step = 0
     for data in reader():
         trainer.run(feed=data, fetch=[])
         train_step += 1
         if train_step == trainer._step:
             break
-    step_i += 1
-    if step_i % 100 == 0:
+    epoch_id += 1
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
```
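After this change the outer counter tracks passes over the reader rather than raw batches, and a model snapshot is written every fifth pass. A standalone sketch of the same control flow, with stub objects standing in for the PaddleFL trainer and reader so it runs anywhere (StubTrainer and the stub reader are placeholders, not PaddleFL APIs):

```python
# Standalone sketch of the revised loop; the stubs only exist so the
# control flow can execute outside PaddleFL.
class StubTrainer:
    _step = 10  # batches each worker runs per round

    def stop(self):
        return False

    def run(self, feed, fetch):
        pass

    def save_inference_program(self, folder):
        print("saved to", folder)

def reader():
    for i in range(100):
        yield {"x": i}

trainer = StubTrainer()
output_folder = "fl_model"
epoch_id = 0
while not trainer.stop():
    print("batch %d start train" % (epoch_id))  # mirrors the diff's message
    train_step = 0
    for data in reader():
        trainer.run(feed=data, fetch=[])
        train_step += 1
        if train_step == trainer._step:  # stop after this round's quota
            break
    epoch_id += 1
    if epoch_id % 5 == 0:  # checkpoint every 5 passes
        trainer.save_inference_program(output_folder)
    if epoch_id >= 5:  # stub exit so the sketch terminates
        break
```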
paddle_fl/examples/dpsgd_demo/fl_scheduler.py
```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+#Define number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/dpsgd_demo/fl_server.py
```diff
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/dpsgd_demo/fl_trainer.py
```diff
@@ -13,7 +13,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform scheduler IP address to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
```
paddle_fl/examples/dpsgd_demo/run.sh
```diff
+unset http_proxy
+unset https_proxy
 python fl_master.py
 sleep 2
 python -u fl_scheduler.py > scheduler.log &
```
paddle_fl/examples/femnist_demo/fl_scheduler.py
```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/femnist_demo/fl_server.py
```diff
@@ -7,7 +7,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/femnist_demo/fl_trainer.py
```diff
@@ -14,7 +14,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 print(job._target_names)
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
@@ -40,7 +40,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
 epoch_id = 0
 step = 0
 epoch = 3000
-count_by_step = True
+count_by_step = False
 if count_by_step:
     output_folder = "model_node%d" % trainer_id
 else:
```
paddle_fl/examples/femnist_demo/run.sh
```diff
+unset http_proxy
+unset https_proxy
 #killall python
 python fl_master.py
 sleep 2
```
paddle_fl/examples/gru4rec_demo/fl_scheduler.py
```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/gru4rec_demo/fl_server.py
```diff
@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
```
paddle_fl/examples/gru4rec_demo/fl_trainer.py
```diff
@@ -14,7 +14,7 @@ train_file_dir = "mid_data/node4/%d/" % trainer_id
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
```
paddle_fl/examples/secagg_demo/fl_scheduler.py
```diff
@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
```
paddle_fl/examples/secagg_demo/fl_server.py
```diff
@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 print("connect")
```
paddle_fl/examples/secagg_demo/fl_trainer.py
```diff
@@ -28,7 +28,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.trainer_id = trainer_id
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
@@ -75,6 +75,7 @@ while not trainer.stop():
     if step_i % 100 == 0:
         print("Epoch: {0}, step: {1}, accuracy: {2}".format(epoch_id, step_i, accuracy[0]))
+        print(step_i)
     avg_loss_val, acc_val = train_test(
         train_test_program=test_program,
         train_test_reader=test_reader,
         train_test_feed=feeder)
@@ -82,5 +83,5 @@ while not trainer.stop():
     if epoch_id > 40:
         break
-    if step_i % 100 == 0:
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
```
paddle_fl/examples/submitter_demo/conf.txt
```diff
@@ -2,7 +2,8 @@
 task_name=test_fl_job_submit_jingqinghe
 hdfs_output=/user/feed/mlarch/sequence_generator/dongdaxiang/job_44
 train_cmd=python dist_trainer.py
-monitor_cmd=python system_monitor_app.py 10 100
+#monitor_cmd=python system_monitor_app.py 10 100
+monitor_cmd=
 #train_cmd=python test_hadoop.py
 hdfs_path=afs://xingtian.afs.baidu.com:9902
```
paddle_fl/examples/submitter_demo/kill.sh
```diff
-/home/jingqinghe/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
+unset http_proxy
+unset https_proxy
+/home/jingqinghe/tools/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
```
paddle_fl/examples/submitter_demo/scheduler_client.py
```diff
@@ -18,7 +18,8 @@ print(random_port)
 current_ip = socket.gethostbyname(socket.gethostname())
 endpoints = "{}:{}".format(current_ip, random_port)
 #start a web server for remote endpoints to download their config
-os.system("python -m SimpleHTTPServer 8080 &")
+#os.system("python -m SimpleHTTPServer 8080 &")
+os.system("python -m http.server 8080 &")
 if os.path.exists("job_config"):
     os.system("rm -rf job_config")
 if os.path.exists("package"):
@@ -120,10 +121,10 @@ print(ip_list)
 #allocate the role of each endpoint and their ids
 ip_role = {}
 for i in range(len(ip_list)):
-  if i < int(default_dict["server_nodes"]):
-    ip_role[ip_list[i]] = 'server%d' % i
+    if i < int(default_dict["server_nodes"]):
+        ip_role[ip_list[i]] = 'server%d' % i
     else:
-    ip_role[ip_list[i]] = 'trainer%d' % (i - int(default_dict["server_nodes"]))
+        ip_role[ip_list[i]] = 'trainer%d' % (i - int(default_dict["server_nodes"]))
 print(ip_role)

 def job_generate():
@@ -179,7 +180,7 @@ while not all_job_sent:
 message = zmq_socket.recv()
 group = message.split("\t")
 if group[0] == "GET_FL_JOB":
-  download_job.append(group[1])
+    download_job.append(group[1])
     zmq_socket.send(ip_role[group[1]])
 else:
     zmq_socket.send("WAIT\t0")
```
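The web-server swap tracks Python 3's rename of `SimpleHTTPServer` to `http.server`. A version-agnostic sketch that launches whichever module matches the running interpreter, rather than hard-coding either name:

```python
import subprocess
import sys

# Python 2 served static files via "python -m SimpleHTTPServer";
# Python 3 renamed the module to http.server.
module = "http.server" if sys.version_info[0] >= 3 else "SimpleHTTPServer"

# Serve the current directory on port 8080 in the background, as
# scheduler_client.py does, but with a handle to stop it later.
server = subprocess.Popen([sys.executable, "-m", module, "8080"])
# ... let remote endpoints fetch their config, then: server.terminate()
```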
paddle_fl/examples/submitter_demo/train_program.py
```diff
@@ -89,15 +89,15 @@ else:
     trainer.start()
     print(trainer._scheduler_ep, trainer._current_ep)
     output_folder = "fl_model"
     step_i = 0
+    epoch_id = 0
     while not trainer.stop():
         print("batch %d start train" % (step_i))
-        train_step = 0
+        step_i = 0
         for data in reader():
             trainer.run(feed=data, fetch=[])
-            train_step += 1
+            step_i += 1
             if train_step == trainer._step:
                 break
-        step_i += 1
-        if step_i % 100 == 0:
+        epoch_id += 1
+        if epoch_id % 5 == 0:
             trainer.save_inference_program(output_folder)
```