PaddlePaddle / PaddleFL: commit fd36a922
Unverified commit fd36a922
Authored on Jan 30, 2020 by Dong Daxiang; committed via GitHub on Jan 30, 2020

Merge pull request #37 from qjing666/easy_use

Make examples more easy to follow

Parents: 67fa4a19, 72383c36

28 changed files with 91 additions and 62 deletions (+91 -62)
Files changed:

docs/requirements.txt                                    +1  -0
docs/source/md/quick_start.md                            +5  -1
paddle_fl/core/scheduler/agent_master.py                 +0  -4
paddle_fl/core/trainer/diffiehellman/diffiehellman.py    +8  -10
paddle_fl/core/trainer/fl_trainer.py                     +7  -7
paddle_fl/examples/ctr_demo/fl_scheduler.py              +2  -1
paddle_fl/examples/ctr_demo/fl_server.py                 +2  -2
paddle_fl/examples/ctr_demo/fl_trainer.py                +5  -5
paddle_fl/examples/dpsgd_demo/fl_scheduler.py            +2  -1
paddle_fl/examples/dpsgd_demo/fl_server.py               +2  -2
paddle_fl/examples/dpsgd_demo/fl_trainer.py              +1  -1
paddle_fl/examples/dpsgd_demo/run.sh                     +2  -0
paddle_fl/examples/femnist_demo/fl_scheduler.py          +2  -1
paddle_fl/examples/femnist_demo/fl_server.py             +2  -2
paddle_fl/examples/femnist_demo/fl_trainer.py            +2  -2
paddle_fl/examples/femnist_demo/run.sh                   +2  -0
paddle_fl/examples/gru4rec_demo/fl_scheduler.py          +2  -1
paddle_fl/examples/gru4rec_demo/fl_server.py             +2  -2
paddle_fl/examples/gru4rec_demo/fl_trainer.py            +1  -1
paddle_fl/examples/secagg_demo/fl_scheduler.py           +10 -0
paddle_fl/examples/secagg_demo/fl_server.py              +3  -0
paddle_fl/examples/secagg_demo/fl_trainer.py             +4  -1
paddle_fl/examples/secagg_demo/run.sh                    +6  -4
paddle_fl/examples/submitter_demo/conf.txt               +2  -1
paddle_fl/examples/submitter_demo/kill.sh                +3  -1
paddle_fl/examples/submitter_demo/scheduler_client.py    +6  -5
paddle_fl/examples/submitter_demo/train_program.py       +5  -5
paddle_fl/version.py                                     +2  -2
docs/requirements.txt  (view file @ fd36a922)

@@ -3,3 +3,4 @@ mistune
 sphinx_rtd_theme
 paddlepaddle>=1.6
+zmq
docs/source/md/quick_start.md  (view file @ fd36a922)

@@ -67,7 +67,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")

@@ -94,6 +95,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.start()

@@ -122,6 +124,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
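Taken together, the quick-start changes pin every role to a scheduler endpoint on port 9091. As a sanity check, the scheduler side of the updated quick start now reads as below; this sketch only assembles lines already shown in this commit (the final start_fl_training() call follows the new secagg_demo scheduler, which uses the same pattern):

```python
# Scheduler script for the updated quick start, assembled from this diff.
from paddle_fl.core.scheduler.agent_master import FLScheduler

worker_num = 2
server_num = 1
# Define the number of workers/servers and the port the scheduler listens on
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(worker_num)  # sample every worker each round
scheduler.init_env()
print("init env done.")
scheduler.start_fl_training()  # coordinate training rounds until done
```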
paddle_fl/core/scheduler/agent_master.py  (view file @ fd36a922)

@@ -104,10 +104,7 @@ class FLScheduler(object):
     def start_fl_training(self):
         # loop until training is done
-        loop = 0
         while True:
-            if loop <= 1:
-                print(loop)
             random.shuffle(self.fl_workers)
             worker_dict = {}
             for worker in self.fl_workers[:self.sample_worker_num]:

@@ -143,4 +140,3 @@ class FLScheduler(object):
             if len(finish_training_dict) == len(worker_dict):
                 all_finish_training = True
             time.sleep(5)
-            loop += 1
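With the debug counter gone, the remaining loop body makes the sampling policy easy to see: each round the scheduler shuffles its registered workers and takes the first sample_worker_num of them. A standalone sketch of that pattern, with made-up endpoints:

```python
import random

# Hypothetical registered workers; in FLScheduler these live in self.fl_workers.
fl_workers = ["127.0.0.1:9000", "127.0.0.1:9001", "127.0.0.1:9002", "127.0.0.1:9003"]
sample_worker_num = 2

# One round of selection: shuffle, then slice off the sample.
random.shuffle(fl_workers)
selected = fl_workers[:sample_worker_num]
print("workers sampled this round:", selected)
```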
paddle_fl/core/trainer/diffiehellman/diffiehellman.py  (view file @ fd36a922)

@@ -42,10 +42,8 @@ try:
     from ssl import RAND_bytes
     rng = RAND_bytes
 except (AttributeError, ImportError):
-    #python2
     rng = os.urandom
-    #raise RNGError

 class DiffieHellman:
     """
     Implements the Diffie-Hellman key exchange protocol.

@@ -115,13 +113,13 @@ class DiffieHellman:
         self.shared_secret = pow(other_public_key, self.private_key, self.prime)
-        #python2
-        #length = self.shared_secret.bit_length() // 8 + 1
-        #shared_secret_as_bytes = ('%%0%dx' % (length << 1) % self.shared_secret).decode('hex')[-length:]
-        #python3
-        shared_secret_as_bytes = self.shared_secret.to_bytes(self.shared_secret.bit_length() // 8 + 1, byteorder='big')
+        try:
+            #python3
+            shared_secret_as_bytes = self.shared_secret.to_bytes(self.shared_secret.bit_length() // 8 + 1, byteorder='big')
+        except:
+            #python2
+            length = self.shared_secret.bit_length() // 8 + 1
+            shared_secret_as_bytes = ('%%0%dx' % (length << 1) % self.shared_secret).decode('hex')[-length:]
         _h = sha256()
         _h.update(bytes(shared_secret_as_bytes))
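The key-exchange math behind this class is ordinary modular exponentiation; the second hunk only changes how the resulting integer is serialized, trying the Python 3 int.to_bytes path first and falling back to the hex-string trick on Python 2. A toy end-to-end sketch of the exchange-and-serialize idea (tiny numbers purely for illustration; the real class uses an RFC 3526 MODP prime and cryptographically random private keys):

```python
import hashlib

# Toy DH parameters; never use sizes like this in practice.
prime, generator = 23, 5
alice_priv, bob_priv = 6, 15

alice_pub = pow(generator, alice_priv, prime)
bob_pub = pow(generator, bob_priv, prime)

# Both sides compute pow(other_public_key, own_private_key, prime) and agree.
shared_secret = pow(bob_pub, alice_priv, prime)
assert shared_secret == pow(alice_pub, bob_priv, prime)

# The Python 3 branch from the diff: big-endian bytes, then hash to a key.
secret_bytes = shared_secret.to_bytes(shared_secret.bit_length() // 8 + 1, byteorder='big')
print(hashlib.sha256(secret_bytes).hexdigest())
```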
paddle_fl/core/trainer/fl_trainer.py  (view file @ fd36a922)

@@ -16,6 +16,7 @@ import logging
 from paddle_fl.core.scheduler.agent_master import FLWorkerAgent
 import numpy
 import hmac
+import hashlib
 from .diffiehellman.diffiehellman import DiffieHellman

 class FLTrainerFactory(object):

@@ -89,12 +90,12 @@ class FLTrainer(object):
         # TODO(guru4elephant): add connection with master
         if self.cur_step != 0:
             while not self.agent.finish_training():
-                print('wait others finish')
+                self._logger.debug("Wait others finish")
                 continue
         while not self.agent.can_join_training():
-            print("wait permit")
+            self._logger.debug("Wait permit")
             continue
-        print("ready to train")
+        self._logger.debug("Ready to train")
         return False

@@ -123,7 +124,6 @@ class FedAvgTrainer(FLTrainer):
         self.exe.run(self._recv_program)
         epoch = 0
         for i in range(num_epoch):
-            print(epoch)
             for data in reader():
                 self.exe.run(self._main_program,
                              feed=feeder.feed(data),

@@ -190,6 +190,8 @@ class SecAggTrainer(FLTrainer):
         self._step_id = s

     def start(self):
+        self.agent = FLWorkerAgent(self._scheduler_ep, self._current_ep)
+        self.agent.connect_scheduler()
         self.exe = fluid.Executor(fluid.CPUPlace())
         self.exe.run(self._startup_program)
         self.cur_step = 0

@@ -219,7 +221,7 @@ class SecAggTrainer(FLTrainer):
         self._logger.debug("begin to run send program")
         noise = 0.0
         scale = pow(10.0, 5)
-        digestmod = "SHA256"
+        digestmod = hashlib.sha256
         # 1. load priv key and other's pub key
         dh = DiffieHellman(group=15, key_length=256)
         dh.load_private_key(self._key_dir + str(self._trainer_id) + "_priv_key.txt")

@@ -245,5 +247,3 @@ class SecAggTrainer(FLTrainer):
         self.cur_step += 1
         return loss
-    def stop(self):
-        return False
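The digestmod edit is another Python 2/3 portability fix: hmac.new accepts the string "SHA256" only on newer Python 3 releases, while passing the hashlib.sha256 constructor works on both lines. A minimal sketch of the portable form (key and message are placeholders, not what SecAggTrainer actually feeds in):

```python
import hashlib
import hmac

key = b"derived-shared-secret"  # placeholder; SecAggTrainer derives this via DH
msg = b"step-0"                 # placeholder payload

# Portable: hand hmac a hash constructor instead of the name "SHA256".
mac = hmac.new(key, msg, digestmod=hashlib.sha256)
print(mac.hexdigest())
```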
paddle_fl/examples/ctr_demo/fl_scheduler.py  (view file @ fd36a922)

@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 2
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(worker_num)
 scheduler.init_env()
 print("init env done.")
paddle_fl/examples/ctr_demo/fl_server.py  (view file @ fd36a922)

@@ -21,8 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
 print("connect")
paddle_fl/examples/ctr_demo/fl_trainer.py  (view file @ fd36a922)

@@ -19,22 +19,22 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
 print(trainer._scheduler_ep, trainer._current_ep)
 output_folder = "fl_model"
-step_i = 0
+epoch_id = 0
 while not trainer.stop():
-    print("batch %d start train" % (step_i))
+    print("batch %d start train" % (epoch_id))
     train_step = 0
     for data in reader():
         trainer.run(feed=data, fetch=[])
         train_step += 1
         if train_step == trainer._step:
             break
-    step_i += 1
-    if step_i % 100 == 0:
+    epoch_id += 1
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
paddle_fl/examples/dpsgd_demo/fl_scheduler.py  (view file @ fd36a922)

@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+#Define number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
paddle_fl/examples/dpsgd_demo/fl_server.py  (view file @ fd36a922)

@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
paddle_fl/examples/dpsgd_demo/fl_trainer.py  (view file @ fd36a922)

@@ -13,7 +13,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform scheduler IP address to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
paddle_fl/examples/dpsgd_demo/run.sh  (view file @ fd36a922)

+unset http_proxy
+unset https_proxy
 python fl_master.py
 sleep 2
 python -u fl_scheduler.py > scheduler.log &
paddle_fl/examples/femnist_demo/fl_scheduler.py  (view file @ fd36a922)

@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
paddle_fl/examples/femnist_demo/fl_server.py  (view file @ fd36a922)

@@ -7,7 +7,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
paddle_fl/examples/femnist_demo/fl_trainer.py  (view file @ fd36a922)

@@ -14,7 +14,7 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 print(job._target_names)
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)

@@ -40,7 +40,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
 epoch_id = 0
 step = 0
 epoch = 3000
-count_by_step = True
+count_by_step = False
 if count_by_step:
     output_folder = "model_node%d" % trainer_id
 else:
paddle_fl/examples/femnist_demo/run.sh  (view file @ fd36a922)

+unset http_proxy
+unset https_proxy
 #killall python
 python fl_master.py
 sleep 2
paddle_fl/examples/gru4rec_demo/fl_scheduler.py  (view file @ fd36a922)

@@ -2,7 +2,8 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
 worker_num = 4
 server_num = 1
-scheduler = FLScheduler(worker_num, server_num)
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
 scheduler.set_sample_worker_num(4)
 scheduler.init_env()
 print("init env done.")
paddle_fl/examples/gru4rec_demo/fl_server.py  (view file @ fd36a922)

@@ -21,7 +21,7 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
-server._current_ep = "127.0.0.1:8181"
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
paddle_fl/examples/gru4rec_demo/fl_trainer.py  (view file @ fd36a922)

@@ -14,7 +14,7 @@ train_file_dir = "mid_data/node4/%d/" % trainer_id
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
-job._scheduler_ep = "127.0.0.1:9091"
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.start()
paddle_fl/examples/secagg_demo/fl_scheduler.py  (new file, mode 100644; view file @ fd36a922)

+from paddle_fl.core.scheduler.agent_master import FLScheduler
+
+worker_num = 2
+server_num = 1
+# Define the number of worker/server and the port for scheduler
+scheduler = FLScheduler(worker_num, server_num, port=9091)
+scheduler.set_sample_worker_num(worker_num)
+scheduler.init_env()
+print("init env done.")
+scheduler.start_fl_training()
paddle_fl/examples/secagg_demo/fl_server.py  (view file @ fd36a922)

@@ -21,5 +21,8 @@ server_id = 0
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_server_job(job_path, server_id)
+job._scheduler_ep = "127.0.0.1:9091"  # IP address for scheduler
 server.set_server_job(job)
+server._current_ep = "127.0.0.1:8181"  # IP address for server
 server.start()
+print("connect")
paddle_fl/examples/secagg_demo/fl_trainer.py  (view file @ fd36a922)

@@ -28,8 +28,10 @@ trainer_id = int(sys.argv[1]) # trainer id for each guest
 job_path = "fl_job_config"
 job = FLRunTimeJob()
 job.load_trainer_job(job_path, trainer_id)
+job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
 trainer = FLTrainerFactory().create_fl_trainer(job)
 trainer.trainer_id = trainer_id
+trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
 trainer.trainer_num = trainer_num
 trainer.key_dir = "./keys/"
 trainer.start()

@@ -73,6 +75,7 @@ while not trainer.stop():
     if step_i % 100 == 0:
+        print("Epoch: {0}, step: {1}, accuracy: {2}".format(epoch_id, step_i, accuracy[0]))
         print(step_i)
         avg_loss_val, acc_val = train_test(train_test_program=test_program, train_test_reader=test_reader, train_test_feed=feeder)

@@ -80,5 +83,5 @@ while not trainer.stop():
     if epoch_id > 40:
         break
-    if step_i % 100 == 0:
+    if epoch_id % 5 == 0:
         trainer.save_inference_program(output_folder)
paddle_fl/examples/secagg_demo/run.sh  (view file @ fd36a922)

@@ -5,10 +5,12 @@ if [ ! -d log ];then
 mkdir log
 fi
-python3 fl_master.py
+python fl_master.py
 sleep 2
-python3 -u fl_server.py > log/server0.log &
+python -u fl_server.py > log/server0.log &
 sleep 2
-python3 -u fl_trainer.py 0 > log/trainer0.log &
+python -u fl_scheduler.py > log/scheduler.log &
 sleep 2
-python3 -u fl_trainer.py 1 > log/trainer1.log &
+python -u fl_trainer.py 0 > log/trainer0.log &
+sleep 2
+python -u fl_trainer.py 1 > log/trainer1.log &
paddle_fl/examples/submitter_demo/conf.txt  (view file @ fd36a922)

@@ -2,7 +2,8 @@
 task_name=test_fl_job_submit_jingqinghe
 hdfs_output=/user/feed/mlarch/sequence_generator/dongdaxiang/job_44
 train_cmd=python dist_trainer.py
-monitor_cmd=python system_monitor_app.py 10 100
+#monitor_cmd=python system_monitor_app.py 10 100
+monitor_cmd=
 #train_cmd=python test_hadoop.py
 hdfs_path=afs://xingtian.afs.baidu.com:9902
paddle_fl/examples/submitter_demo/kill.sh  (view file @ fd36a922)

-/home/jingqinghe/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
+unset http_proxy
+unset https_proxy
+/home/jingqinghe/tools/mpi_feed4/smart_client/bin/qdel $1".yq01-hpc-lvliang01-smart-master.dmop.baidu.com"
paddle_fl/examples/submitter_demo/scheduler_client.py  (view file @ fd36a922)

@@ -18,7 +18,8 @@ print(random_port)
 current_ip = socket.gethostbyname(socket.gethostname())
 endpoints = "{}:{}".format(current_ip, random_port)
 #start a web server for remote endpoints to download their config
-os.system("python -m SimpleHTTPServer 8080 &")
+#os.system("python -m SimpleHTTPServer 8080 &")
+os.system("python -m http.server 8080 &")
 if os.path.exists("job_config"):
     os.system("rm -rf job_config")
 if os.path.exists("package"):

@@ -120,10 +121,10 @@ print(ip_list)
 #allocate the role of each endpoint and their ids
 ip_role = {}
 for i in range(len(ip_list)):
-    if i < int(default_dict["server_nodes"]):
-        ip_role[ip_list[i]] = 'server%d' % i
+    if i < int(default_dict["server_nodes"]):
+        ip_role[ip_list[i]] = 'server%d' % i
     else:
-        ip_role[ip_list[i]] = 'trainer%d' % (i - int(default_dict["server_nodes"]))
+        ip_role[ip_list[i]] = 'trainer%d' % (i - int(default_dict["server_nodes"]))
 print(ip_role)

 def job_generate():

@@ -179,7 +180,7 @@ while not all_job_sent:
     message = zmq_socket.recv()
     group = message.split("\t")
     if group[0] == "GET_FL_JOB":
-        download_job.append(group[1])
+        download_job.append(group[1])
         zmq_socket.send(ip_role[group[1]])
     else:
         zmq_socket.send("WAIT\t0")

(The paired removed/added lines that look identical above appear to differ only in whitespace; this commit also re-indented these files.)
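SimpleHTTPServer was renamed to http.server in Python 3, so the old os.system line breaks under a Python 3 interpreter. A version-agnostic way to start the same background config server (the subprocess approach and port are illustrative choices, not what scheduler_client.py does):

```python
import subprocess
import sys

# Pick the module name that exists for the running interpreter.
module = "http.server" if sys.version_info[0] >= 3 else "SimpleHTTPServer"

# Serve the current directory on port 8080, mirroring
# os.system("python -m http.server 8080 &") from the diff.
server = subprocess.Popen([sys.executable, "-m", module, "8080"])
print("config download server pid:", server.pid)
# ... let remote endpoints fetch their job config, then shut down ...
server.terminate()
```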
paddle_fl/examples/submitter_demo/train_program.py  (view file @ fd36a922)

@@ -89,15 +89,15 @@ else:
     trainer.start()
     print(trainer._scheduler_ep, trainer._current_ep)
     output_folder = "fl_model"
-    step_i = 0
+    epoch_id = 0
     while not trainer.stop():
         print("batch %d start train" % (step_i))
         train_step = 0
+        step_i = 0
         for data in reader():
             trainer.run(feed=data, fetch=[])
             train_step += 1
+            step_i += 1
             if train_step == trainer._step:
                 break
-        step_i += 1
-        if step_i % 100 == 0:
+        epoch_id += 1
+        if epoch_id % 5 == 0:
             trainer.save_inference_program(output_folder)
paddle_fl/version.py  (view file @ fd36a922)

@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ PaddleFL version string """
-fl_version = "0.1.6"
-module_proto_version = "0.1.6"
+fl_version = "0.1.7"
+module_proto_version = "0.1.7"