Commit a6caa651 authored by: qjing666

fix code style

Parent d4e75537
......@@ -12,14 +12,14 @@ import math
import msgpack
def data_generater(samples,r):
    # data generator
def data_generater(samples, r):
    # data generator
def train_data():
for item in samples:
sample = msgpack.loads(r.get(str(item)))
conv = sample[0]
label = sample[1]
yield conv,label
yield conv, label
return train_data
......@@ -67,7 +67,7 @@ class ResNet():
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act = "softmax")
act="softmax")
else:
for block in range(len(depth)):
for i in range(depth[block]):
......@@ -87,7 +87,7 @@ class ResNet():
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act = "softmax")
act="softmax")
return out
def conv_bn_layer(self,
......@@ -123,8 +123,6 @@ class ResNet():
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, is_first, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1 or is_first == True:
......@@ -181,31 +179,33 @@ class ResNet():
input, num_filters, stride, is_first, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
# local redis config
redis_host = "127.0.0.1"
redis_port = 6379
redis_password = ""
r = redis.StrictRedis(host=redis_host, port=redis_port, password=redis_password)
r = redis.StrictRedis(
host=redis_host, port=redis_port, password=redis_password)
# reader generation
reader = fluid.layers.py_reader(capacity=64,
shapes=[(-1,64, 8, 8), (-1,1)],
dtypes=['float32', 'int64'])
reader = fluid.layers.py_reader(
capacity=64, shapes=[(-1, 64, 8, 8), (-1, 1)],
dtypes=['float32', 'int64'])
samples = r.keys()
train_data = data_generater(samples,r)
train_data = data_generater(samples, r)
reader.decorate_paddle_reader(paddle.batch(
paddle.reader.shuffle(
train_data, buf_size=5000),
batch_size=64))
reader.decorate_paddle_reader(
paddle.batch(
paddle.reader.shuffle(
train_data, buf_size=5000), batch_size=64))
conv1,label = fluid.layers.read_file(reader)
conv1, label = fluid.layers.read_file(reader)
# train program
place = fluid.CUDAPlace(0)
model = ResNet(layers=50)
predicts = model.net(conv1,10)
predicts = model.net(conv1, 10)
cost = fluid.layers.cross_entropy(input=predicts, label=label)
accuracy = fluid.layers.accuracy(input=predicts, label=label)
loss = fluid.layers.mean(cost)
......@@ -222,18 +222,20 @@ step = 0
train_start = time.time()
# start training
for pass_id in range(EPOCH_NUM):
reader.start()
try:
while True:
start_time = time.time()
loss_value,acc_value = exe.run(fetch_list=[loss.name,accuracy.name])
step += 1
if step % 10 == 0:
print("epoch: "+ str(pass_id)+"step: "+str(step)+"loss: "+ str(loss_value)+"acc: "+str(acc_value))
end_time = time.time()
total_time += (end_time - start_time)
except fluid.core.EOFException:
reader.reset()
reader.start()
try:
while True:
start_time = time.time()
loss_value, acc_value = exe.run(
fetch_list=[loss.name, accuracy.name])
step += 1
if step % 10 == 0:
print("epoch: " + str(pass_id) + "step: " + str(step) +
"loss: " + str(loss_value) + "acc: " + str(acc_value))
end_time = time.time()
total_time += (end_time - start_time)
except fluid.core.EOFException:
reader.reset()
train_end = time.time()
print("total time: %d" % (train_end - train_start))
print("computation time: %d" % total_time)
......@@ -5,10 +5,12 @@ import paddle.fluid as fluid
import numpy
import sys
import redis
import time
import time
from paddle.fluid import layers
from paddle.fluid.param_attr import ParamAttr
import msgpack
def conv_bn_layer(input,
num_filters,
filter_size,
......@@ -16,30 +18,30 @@ def conv_bn_layer(input,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def load_conf(conf_file, local_dict):
......@@ -51,6 +53,7 @@ def load_conf(conf_file, local_dict):
local_dict[group[0]] = group[1]
return local_dict
# redis DB configuration
redis_host = "127.0.0.1"
redis_port = 6379
......@@ -58,26 +61,39 @@ redis_password = ""
start_time = time.time()
# start a redis client and empty the DB
r = redis.StrictRedis(host=redis_host, port=redis_port, password=redis_password)
r = redis.StrictRedis(
host=redis_host, port=redis_port, password=redis_password)
r.flushall()
# encoding program
images = fluid.layers.data(name='images', shape=[3,32,32], dtype='float32')
images = fluid.layers.data(name='images', shape=[3, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
place = fluid.CPUPlace()
conv1 = conv_bn_layer(input=images,num_filters=64,filter_size=7,stride=2,act='relu',name="conv1")
pool = fluid.layers.pool2d(input=conv1,pool_size=3,pool_stride=2,pool_padding=1,pool_type='max')
feeder = fluid.DataFeeder(place=place, feed_list=[images,label])
conv1 = conv_bn_layer(
input=images,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
pool = fluid.layers.pool2d(
input=conv1, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
pretrained_model = 'ResNet50_pretrained'
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
# load pretrained model and prepare data
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, main_program=fluid.default_main_program(),
predicate=if_exist)
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(
exe,
pretrained_model,
main_program=fluid.default_main_program(),
predicate=if_exist)
train_data = paddle.dataset.cifar.train10()
step = 0
......@@ -86,11 +102,13 @@ step = 0
for data in train_data():
pre_data = []
pre_data.append(data)
res = exe.run(program=fluid.default_main_program(),feed=feeder.feed(pre_data), fetch_list=[pool.name])
sample = [res[0][0].tolist(),data[1]]
res = exe.run(program=fluid.default_main_program(),
feed=feeder.feed(pre_data),
fetch_list=[pool.name])
sample = [res[0][0].tolist(), data[1]]
step += 1
file = msgpack.dumps(sample)
r.set(step,file)
r.set(step, file)
if step % 100 == 0:
print(numpy.array(sample[0]).shape)
print("%dstart" % step)
......@@ -99,6 +117,4 @@ files = r.keys()
print("upload file numbers: %d" % len(files))
end_time = time.time()
total_time = end_time - start_time
print("total time: %d"% total_time)
print("total time: %d" % total_time)
......@@ -2,7 +2,7 @@ import zmq
import socket
import msgpack
import os
mission_dict = {"mission": "image classification", "image_size": [3,32,32]}
mission_dict = {"mission": "image classification", "image_size": [3, 32, 32]}
#send request
context = zmq.Context()
zmq_socket = context.socket(zmq.REQ)
......
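The client above is truncated after the socket setup; the REQ side of such a handshake serializes the mission description, sends it, and blocks for the scheduler's reply. A hedged sketch of the remaining steps (the endpoint is illustrative, not taken from the source):

import msgpack
import zmq

mission_dict = {"mission": "image classification", "image_size": [3, 32, 32]}

context = zmq.Context()
zmq_socket = context.socket(zmq.REQ)
zmq_socket.connect("tcp://127.0.0.1:60001")   # illustrative endpoint
zmq_socket.send(msgpack.dumps(mission_dict))  # REQ sockets alternate strictly:
reply = zmq_socket.recv()                     # one send, then one recv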
......@@ -4,16 +4,22 @@ from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory
import math
class Model(object):
def __init__(self):
pass
def lr_network(self):
self.inputs = fluid.layers.data(name='img', shape=[1, 28, 28], dtype="float32")
self.label = fluid.layers.data(name='label', shape=[1],dtype='int64')
self.predict = fluid.layers.fc(input=self.inputs, size=10, act='softmax')
self.sum_cost = fluid.layers.cross_entropy(input=self.predict, label=self.label)
self.accuracy = fluid.layers.accuracy(input=self.predict, label=self.label)
self.inputs = fluid.layers.data(
name='img', shape=[1, 28, 28], dtype="float32")
self.label = fluid.layers.data(name='label', shape=[1], dtype='int64')
self.predict = fluid.layers.fc(input=self.inputs,
size=10,
act='softmax')
self.sum_cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label)
self.accuracy = fluid.layers.accuracy(
input=self.predict, label=self.label)
self.loss = fluid.layers.mean(self.sum_cost)
self.startup_program = fluid.default_startup_program()
......@@ -23,7 +29,7 @@ model.lr_network()
STEP_EPSILON = 0.1
DELTA = 0.00001
SIGMA = math.sqrt(2.0 * math.log(1.25/DELTA)) / STEP_EPSILON
SIGMA = math.sqrt(2.0 * math.log(1.25 / DELTA)) / STEP_EPSILON
CLIP = 4.0
batch_size = 64
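The SIGMA line is the Gaussian-mechanism noise scale sigma = sqrt(2 * ln(1.25 / delta)) / epsilon for per-step parameters (epsilon, delta). With the values above:

import math

STEP_EPSILON = 0.1
DELTA = 0.00001
SIGMA = math.sqrt(2.0 * math.log(1.25 / DELTA)) / STEP_EPSILON
print(SIGMA)  # ~48.5: tight per-step (0.1, 1e-5) guarantees need heavy noise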
......@@ -33,7 +39,8 @@ job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[model.inputs.name, model.label.name], [model.loss.name, model.accuracy.name])
[model.inputs.name, model.label.name],
[model.loss.name, model.accuracy.name])
build_strategy = FLStrategyFactory()
build_strategy.dpsgd = True
......
......@@ -3,7 +3,7 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
worker_num = 4
server_num = 1
# Define the number of workers/servers and the port for the scheduler
scheduler = FLScheduler(worker_num,server_num,port=9091)
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(4)
scheduler.init_env()
print("init env done.")
......
......@@ -21,7 +21,7 @@ server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
server.set_server_job(job)
server._current_ep = "127.0.0.1:8181" # IP address for server
server._current_ep = "127.0.0.1:8181" # IP address for server
server.start()
......@@ -7,44 +7,51 @@ import paddle.fluid as fluid
import logging
import math
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.basicConfig(
filename="test.log",
filemode="w",
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.DEBUG)
trainer_id = int(sys.argv[1]) # trainer id for each guest
trainer_id = int(sys.argv[1]) # trainer id for each guest
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091" # Inform scheduler IP address to trainer
job._scheduler_ep = "127.0.0.1:9091" # Inform scheduler IP address to trainer
trainer = FLTrainerFactory().create_fl_trainer(job)
trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
trainer.start()
test_program = trainer._main_program.clone(for_test=True)
train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500),
batch_size=64)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=64)
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=64)
test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=64)
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
def train_test(train_test_program, train_test_feed, train_test_reader):
acc_set = []
for test_data in train_test_reader():
acc_np = trainer.exe.run(
program=train_test_program,
feed=train_test_feed.feed(test_data),
fetch_list=["accuracy_0.tmp_0"])
acc_np = trainer.exe.run(program=train_test_program,
feed=train_test_feed.feed(test_data),
fetch_list=["accuracy_0.tmp_0"])
acc_set.append(float(acc_np[0]))
acc_val_mean = numpy.array(acc_set).mean()
return acc_val_mean
def compute_privacy_budget(sample_ratio, epsilon, step, delta):
E = 2 * epsilon * math.sqrt(step * sample_ratio)
print("({0}, {1})-DP".format(E, delta))
output_folder = "model_node%d" % trainer_id
epoch_id = 0
step = 0
......@@ -64,7 +71,8 @@ while not trainer.stop():
train_test_feed=feeder)
print("Test with epoch %d, accuracy: %s" % (epoch_id, acc_val))
compute_privacy_budget(sample_ratio=0.001, epsilon=0.1, step=step, delta=0.00001)
compute_privacy_budget(
sample_ratio=0.001, epsilon=0.1, step=step, delta=0.00001)
save_dir = (output_folder + "/epoch_%d") % epoch_id
trainer.save_inference_program(output_folder)
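compute_privacy_budget above reports the accumulated guarantee E = 2 * epsilon * sqrt(step * sample_ratio), the composition-style bound this DP-SGD example tracks; the budget grows with the square root of the step count. For example:

import math

def compute_privacy_budget(sample_ratio, epsilon, step, delta):
    E = 2 * epsilon * math.sqrt(step * sample_ratio)
    print("({0}, {1})-DP".format(E, delta))

compute_privacy_budget(sample_ratio=0.001, epsilon=0.1, step=10000, delta=0.00001)
# prints roughly (0.632, 1e-05)-DP after 10,000 steps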
......@@ -9,14 +9,31 @@ class Model(object):
pass
def cnn(self):
self.inputs = fluid.layers.data(name='img', shape=[1, 28, 28], dtype="float32")
self.label = fluid.layers.data(name='label', shape=[1],dtype='int64')
self.conv_pool_1 = fluid.nets.simple_img_conv_pool(input=self.inputs,num_filters=20,filter_size=5,pool_size=2,pool_stride=2,act='relu')
self.conv_pool_2 = fluid.nets.simple_img_conv_pool(input=self.conv_pool_1,num_filters=50,filter_size=5,pool_size=2,pool_stride=2,act='relu')
        self.predict = fluid.layers.fc(input=self.conv_pool_2, size=62, act='softmax')
self.cost = fluid.layers.cross_entropy(input=self.predict, label=self.label)
self.accuracy = fluid.layers.accuracy(input=self.predict, label=self.label)
self.inputs = fluid.layers.data(
name='img', shape=[1, 28, 28], dtype="float32")
self.label = fluid.layers.data(name='label', shape=[1], dtype='int64')
self.conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=self.inputs,
num_filters=20,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
self.conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=self.conv_pool_1,
num_filters=50,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
        self.predict = fluid.layers.fc(input=self.conv_pool_2,
                                       size=62,
                                       act='softmax')
self.cost = fluid.layers.cross_entropy(
input=self.predict, label=self.label)
self.accuracy = fluid.layers.accuracy(
input=self.predict, label=self.label)
self.loss = fluid.layers.mean(self.cost)
self.startup_program = fluid.default_startup_program()
......@@ -30,8 +47,8 @@ job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[model.inputs.name, model.label.name], [model.loss.name, model.accuracy.name])
[model.inputs.name, model.label.name],
[model.loss.name, model.accuracy.name])
build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
......
......@@ -3,7 +3,7 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
worker_num = 4
server_num = 1
# Define the number of workers/servers and the port for the scheduler
scheduler = FLScheduler(worker_num,server_num,port=9091)
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(4)
scheduler.init_env()
print("init env done.")
......
......@@ -7,7 +7,7 @@ server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
server.set_server_job(job)
server._current_ep = "127.0.0.1:8181" # IP address for server
server._current_ep = "127.0.0.1:8181" # IP address for server
server.start()
......@@ -8,16 +8,21 @@ import paddle.fluid as fluid
import logging
import math
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.basicConfig(
filename="test.log",
filemode="w",
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.DEBUG)
trainer_id = int(sys.argv[1]) # trainer id for each guest
trainer_id = int(sys.argv[1]) # trainer id for each guest
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
print(job._target_names)
trainer = FLTrainerFactory().create_fl_trainer(job)
trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
trainer.start()
print(trainer._step)
test_program = trainer._main_program.clone(for_test=True)
......@@ -26,26 +31,26 @@ img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
def train_test(train_test_program, train_test_feed, train_test_reader):
acc_set = []
for test_data in train_test_reader():
acc_np = trainer.exe.run(
program=train_test_program,
feed=train_test_feed.feed(test_data),
fetch_list=["accuracy_0.tmp_0"])
acc_set.append(float(acc_np[0]))
acc_val_mean = numpy.array(acc_set).mean()
return acc_val_mean
acc_set = []
for test_data in train_test_reader():
acc_np = trainer.exe.run(program=train_test_program,
feed=train_test_feed.feed(test_data),
fetch_list=["accuracy_0.tmp_0"])
acc_set.append(float(acc_np[0]))
acc_val_mean = numpy.array(acc_set).mean()
return acc_val_mean
epoch_id = 0
step = 0
epoch = 3000
count_by_step = False
if count_by_step:
output_folder = "model_node%d" % trainer_id
else:
output_folder = "model_node%d_epoch" % trainer_id
output_folder = "model_node%d" % trainer_id
else:
output_folder = "model_node%d_epoch" % trainer_id
while not trainer.stop():
count = 0
......@@ -55,24 +60,35 @@ while not trainer.stop():
print("epoch %d start train" % (epoch_id))
#train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
train_reader = paddle.batch(
paddle.reader.shuffle(paddle_fl.dataset.femnist.train(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step), buf_size=500),
paddle.reader.shuffle(
paddle_fl.dataset.femnist.train(
trainer_id,
inner_step=trainer._step,
batch_size=64,
count_by_step=count_by_step),
buf_size=500),
batch_size=64)
test_reader = paddle.batch(
paddle_fl.dataset.femnist.test(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step), batch_size=64)
paddle_fl.dataset.femnist.test(
trainer_id,
inner_step=trainer._step,
batch_size=64,
count_by_step=count_by_step),
batch_size=64)
if count_by_step:
for step_id, data in enumerate(train_reader()):
for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
count += 1
print(count)
if count % trainer._step == 0:
if count % trainer._step == 0:
break
# print("acc:%.3f" % (acc[0]))
else:
trainer.run_with_epoch(train_reader,feeder,fetch=["accuracy_0.tmp_0"],num_epoch=1)
trainer.run_with_epoch(
train_reader, feeder, fetch=["accuracy_0.tmp_0"], num_epoch=1)
acc_val = train_test(
train_test_program=test_program,
......@@ -80,6 +96,6 @@ while not trainer.stop():
train_test_feed=feeder)
print("Test with epoch %d, accuracy: %s" % (epoch_id, acc_val))
if trainer_id == 0:
if trainer_id == 0:
save_dir = (output_folder + "/epoch_%d") % epoch_id
trainer.save_inference_program(output_folder)
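The trainer above supports two round definitions: with count_by_step it leaves the reader loop after trainer._step minibatches, otherwise run_with_epoch consumes the reader once per round. A condensed sketch of that control flow, reusing the names defined above:

if count_by_step:
    # step-counted round: stop after trainer._step minibatches
    for step_id, data in enumerate(train_reader()):
        acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
        if (step_id + 1) % trainer._step == 0:
            break
else:
    # epoch-counted round: one full pass over the local shard
    trainer.run_with_epoch(
        train_reader, feeder, fetch=["accuracy_0.tmp_0"], num_epoch=1)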
......@@ -3,6 +3,7 @@ import paddle_fl as fl
from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory
class Model(object):
def __init__(self):
pass
......@@ -34,7 +35,8 @@ class Model(object):
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
low=init_low_bound,
high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
......@@ -45,12 +47,13 @@ class Model(object):
learning_rate=gru_lr_x))
self.fc = fluid.layers.fc(input=gru_h0,
size=vocab_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
size=vocab_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound,
high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(
input=self.fc, label=self.dst_wordseq)
self.acc = fluid.layers.accuracy(
......@@ -59,7 +62,6 @@ class Model(object):
self.startup_program = fluid.default_startup_program()
model = Model()
model.gru4rec_network()
......@@ -69,7 +71,8 @@ job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[model.src_wordseq.name, model.dst_wordseq.name], [model.loss.name, model.acc.name])
[model.src_wordseq.name, model.dst_wordseq.name],
[model.loss.name, model.acc.name])
build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
......
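A note on the GRU4Rec network above: fc0 projects the embedding to hid_size * 3 because fluid.layers.dynamic_gru expects its input pre-multiplied for the three gate computations (update gate, reset gate, candidate state). A minimal sketch of that pairing, with illustrative sizes:

import paddle.fluid as fluid

hid_size = 100
# dynamic_gru consumes a sequence of width 3 * hid_size and emits hid_size.
src = fluid.layers.data(name='src', shape=[128], dtype='float32', lod_level=1)
fc0 = fluid.layers.fc(input=src, size=hid_size * 3)
gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size)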
......@@ -3,7 +3,7 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
worker_num = 4
server_num = 1
# Define the number of workers/servers and the port for the scheduler
scheduler = FLScheduler(worker_num,server_num,port=9091)
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(4)
scheduler.init_env()
print("init env done.")
......
......@@ -21,7 +21,7 @@ server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
server.set_server_job(job)
server._current_ep = "127.0.0.1:8181" # IP address for server
server._current_ep = "127.0.0.1:8181" # IP address for server
server.start()
......@@ -6,21 +6,26 @@ import numpy as np
import sys
import os
import logging
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.basicConfig(
filename="test.log",
filemode="w",
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.DEBUG)
trainer_id = int(sys.argv[1]) # trainer id for each guest
trainer_id = int(sys.argv[1]) # trainer id for each guest
place = fluid.CPUPlace()
train_file_dir = "mid_data/node4/%d/" % trainer_id
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
trainer = FLTrainerFactory().create_fl_trainer(job)
trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
trainer.start()
r = Gru4rec_Reader()
train_reader = r.reader(train_file_dir, place, batch_size = 125)
train_reader = r.reader(train_file_dir, place, batch_size=125)
output_folder = "model_node4"
step_i = 0
......@@ -30,8 +35,7 @@ while not trainer.stop():
train_step = 0
for data in train_reader():
#print(np.array(data['src_wordseq']))
ret_avg_cost = trainer.run(feed=data,
fetch=["mean_0.tmp_0"])
ret_avg_cost = trainer.run(feed=data, fetch=["mean_0.tmp_0"])
train_step += 1
if train_step == trainer._step:
break
......
......@@ -5,6 +5,7 @@ import paddle_fl as fl
from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory
def parse_args():
parser = argparse.ArgumentParser(description="master")
parser.add_argument(
......@@ -12,7 +13,7 @@ def parse_args():
type=int,
default=2,
help='number of trainer(default: 2)')
return parser.parse_args()
......@@ -25,7 +26,8 @@ class Model(object):
self.fc1 = fluid.layers.fc(input=self.concat, size=256, act='relu')
self.fc2 = fluid.layers.fc(input=self.fc1, size=128, act='relu')
self.predict = fluid.layers.fc(input=self.fc2, size=2, act='softmax')
self.sum_cost = fluid.layers.cross_entropy(input=self.predict, label=label)
self.sum_cost = fluid.layers.cross_entropy(
input=self.predict, label=label)
self.accuracy = fluid.layers.accuracy(input=self.predict, label=label)
self.loss = fluid.layers.reduce_mean(self.sum_cost)
self.startup_program = fluid.default_startup_program()
......@@ -47,8 +49,8 @@ optimizer = fluid.optimizer.SGD(learning_rate=0.1)
job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[x.name for x in inputs], [model.predict.name])
job_generator.set_infer_feed_and_target_names([x.name for x in inputs],
[model.predict.name])
build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
......@@ -57,7 +59,8 @@ strategy = build_strategy.create_fl_strategy()
# endpoints will be collected through the cluster
# in this example, we suppose endpoints have been collected
server_service_ip = os.environ['FL_SERVER_SERVICE_HOST'] + ":" + os.environ['FL_SERVER_SERVICE_PORT_FL_SERVER']
server_service_ip = os.environ['FL_SERVER_SERVICE_HOST'] + ":" + os.environ[
'FL_SERVER_SERVICE_PORT_FL_SERVER']
service_endpoints = [server_service_ip]
pod_endpoints = ["0.0.0.0:8181"]
output = "fl_job_config"
......@@ -68,4 +71,8 @@ num_trainer = args.trainer_num
# fl_job_config will be dispatched to workers
job_generator.generate_fl_job_for_k8s(
strategy, server_pod_endpoints=pod_endpoints,server_service_endpoints=service_endpoints, worker_num=2, output=output)
strategy,
server_pod_endpoints=pod_endpoints,
server_service_endpoints=service_endpoints,
worker_num=2,
output=output)
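The endpoint strings above are assembled from the environment variables Kubernetes injects for each service (FOO_SERVICE_HOST and FOO_SERVICE_PORT_<port-name>). A small hedged helper showing the same composition; the variable names follow the services this example assumes:

import os

def service_endpoint(host_var, port_var):
    # Compose "host:port" from the env vars Kubernetes injects per service.
    return os.environ[host_var] + ":" + os.environ[port_var]

server_service_ip = service_endpoint('FL_SERVER_SERVICE_HOST',
                                     'FL_SERVER_SERVICE_PORT_FL_SERVER')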
import argparse
from paddle_fl.core.scheduler.agent_master import FLScheduler
def parse_args():
parser = argparse.ArgumentParser(description="scheduler")
parser.add_argument(
......@@ -11,12 +12,13 @@ def parse_args():
return parser.parse_args()
args = parse_args()
num_trainer = args.trainer_num
worker_num = num_trainer
server_num = 1
# Define the number of workers/servers and the port for the scheduler
scheduler = FLScheduler(worker_num,server_num,port=9091)
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(worker_num)
scheduler.init_env()
print("init env done.")
......
......@@ -23,10 +23,12 @@ server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ['FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER']# IP address for scheduler
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ[
'FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER'] # IP address for scheduler
#job._endpoints = os.environ['POD_IP'] + ":" + os.environ['FL_SERVER_SERVICE_PORT_FL_SERVER'] # IP address for server
server.set_server_job(job)
server._current_ep = os.environ['FL_SERVER_SERVICE_HOST'] + ":" + os.environ['FL_SERVER_SERVICE_PORT_FL_SERVER'] # IP address for server
print(job._scheduler_ep,server._current_ep)
server._current_ep = os.environ['FL_SERVER_SERVICE_HOST'] + ":" + os.environ[
'FL_SERVER_SERVICE_PORT_FL_SERVER'] # IP address for server
print(job._scheduler_ep, server._current_ep)
server.start()
print("connect")
......@@ -5,7 +5,12 @@ import sys
import os
import logging
import time
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.basicConfig(
filename="test.log",
filemode="w",
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.DEBUG)
def reader():
......@@ -16,15 +21,18 @@ def reader():
data_dict["label"] = np.random.randint(2, size=(1, 1)).astype('int64')
yield data_dict
trainer_id = int(sys.argv[1]) # trainer id for each guest
trainer_id = int(sys.argv[1]) # trainer id for each guest
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
#job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ['FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER']
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ[
'FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER']
trainer = FLTrainerFactory().create_fl_trainer(job)
#trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
trainer._current_ep = os.environ['TRAINER0_SERVICE_HOST'] + ":" + os.environ['TRAINER0_SERVICE_PORT_TRAINER0']
trainer._current_ep = os.environ['TRAINER0_SERVICE_HOST'] + ":" + os.environ[
'TRAINER0_SERVICE_PORT_TRAINER0']
trainer.start()
print(trainer._scheduler_ep, trainer._current_ep)
output_folder = "fl_model"
......@@ -40,4 +48,3 @@ while not trainer.stop():
epoch_id += 1
if epoch_id % 5 == 0:
trainer.save_inference_program(output_folder)
......@@ -5,7 +5,12 @@ import sys
import os
import logging
import time
logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.basicConfig(
filename="test.log",
filemode="w",
format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
datefmt="%d-%M-%Y %H:%M:%S",
level=logging.DEBUG)
def reader():
......@@ -16,15 +21,18 @@ def reader():
data_dict["label"] = np.random.randint(2, size=(1, 1)).astype('int64')
yield data_dict
trainer_id = int(sys.argv[1]) # trainer id for each guest
trainer_id = int(sys.argv[1]) # trainer id for each guest
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
#job._scheduler_ep = "127.0.0.1:9091" # Inform the scheduler IP to trainer
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ['FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER']
job._scheduler_ep = os.environ['FL_SCHEDULER_SERVICE_HOST'] + ":" + os.environ[
'FL_SCHEDULER_SERVICE_PORT_FL_SCHEDULER']
trainer = FLTrainerFactory().create_fl_trainer(job)
#trainer._current_ep = "127.0.0.1:{}".format(9000+trainer_id)
trainer._current_ep = os.environ['TRAINER1_SERVICE_HOST'] + ":" + os.environ['TRAINER1_SERVICE_PORT_TRAINER1']
trainer._current_ep = os.environ['TRAINER1_SERVICE_HOST'] + ":" + os.environ[
'TRAINER1_SERVICE_PORT_TRAINER1']
trainer.start()
print(trainer._scheduler_ep, trainer._current_ep)
output_folder = "fl_model"
......@@ -40,4 +48,3 @@ while not trainer.stop():
epoch_id += 1
if epoch_id % 5 == 0:
trainer.save_inference_program(output_folder)
......@@ -3,6 +3,7 @@ import paddle_fl as fl
from paddle_fl.core.master.job_generator import JobGenerator
from paddle_fl.core.strategy.fl_strategy_base import FLStrategyFactory
class Model(object):
def __init__(self):
pass
......@@ -14,12 +15,17 @@ class Model(object):
param_attrs = fluid.ParamAttr(
name="fc_0.w_0",
initializer=fluid.initializer.ConstantInitializer(0.0))
self.predict = fluid.layers.fc(input=inputs, size=10, act='softmax', param_attr=param_attrs)
self.sum_cost = fluid.layers.cross_entropy(input=self.predict, label=label)
self.predict = fluid.layers.fc(input=inputs,
size=10,
act='softmax',
param_attr=param_attrs)
self.sum_cost = fluid.layers.cross_entropy(
input=self.predict, label=label)
self.loss = fluid.layers.mean(self.sum_cost)
self.accuracy = fluid.layers.accuracy(input=self.predict, label=label)
self.startup_program = fluid.default_startup_program()
inputs = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
......@@ -31,15 +37,16 @@ optimizer = fluid.optimizer.SGD(learning_rate=0.01)
job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[inputs.name, label.name], [model.loss.name])
job_generator.set_infer_feed_and_target_names([inputs.name, label.name],
[model.loss.name])
build_strategy = FLStrategyFactory()
#build_strategy.fed_avg = True
build_strategy.sec_agg = True
param_name_list = []
param_name_list.append("fc_0.w_0.opti.trainer_") # need trainer_id when running
param_name_list.append(
"fc_0.w_0.opti.trainer_") # need trainer_id when running
param_name_list.append("fc_0.b_0.opti.trainer_")
build_strategy.param_name_list = param_name_list
......
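The trailing underscore in each param_name_list entry is deliberate: per the inline comment, the trainer id is appended at run time so that every trainer masks its own optimizer outputs in secure aggregation. A hedged illustration of the resulting names (the suffixing itself happens inside PaddleFL):

trainer_id = 0  # supplied on the command line when the trainer starts
masked = [name + str(trainer_id)
          for name in ["fc_0.w_0.opti.trainer_", "fc_0.b_0.opti.trainer_"]]
print(masked)  # ['fc_0.w_0.opti.trainer_0', 'fc_0.b_0.opti.trainer_0']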
......@@ -3,7 +3,7 @@ from paddle_fl.core.scheduler.agent_master import FLScheduler
worker_num = 2
server_num = 1
scheduler = FLScheduler(worker_num,server_num,port=9091)
scheduler = FLScheduler(worker_num, server_num, port=9091)
scheduler.set_sample_worker_num(worker_num)
scheduler.init_env()
print("init env done.")
......
......@@ -21,8 +21,8 @@ server_id = 0
job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_server_job(job_path, server_id)
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
job._scheduler_ep = "127.0.0.1:9091" # IP address for scheduler
server.set_server_job(job)
server._current_ep = "127.0.0.1:8181" # IP address for server
server._current_ep = "127.0.0.1:8181" # IP address for server
server.start()
print("connect")
2438748580808349511143047663636683775879288034436941526695550498623461587527621346172907651006831789701999970929529915459467532662545948308044143788306668377086821294492459623439935894424167712515718436351900091777957477710004777078638317806960364609629258387413979203403741893205419691425902518810085451041187685334971769087054033027561974230347468587825700834108657561999305482311897914109364221430821533207693682979777541616125499682380618775029176238891407643926372043660610226672413497764635239787000143341827693941253721638947580506197728500367325524850325027980531066702962726949006217630290236644410746181942256812170056772600756232506116738493114591218127323741133163913140583529684827066023347088796194846253682954154504336640429027403657831470993825621749318372332546269811820953216261135662418531598954663771775691648448615131802158937156803324423733802071166119966224716088242291968098450309032800335049617861465
\ No newline at end of file
2438748580808349511143047663636683775879288034436941526695550498623461587527621346172907651006831789701999970929529915459467532662545948308044143788306668377086821294492459623439935894424167712515718436351900091777957477710004777078638317806960364609629258387413979203403741893205419691425902518810085451041187685334971769087054033027561974230347468587825700834108657561999305482311897914109364221430821533207693682979777541616125499682380618775029176238891407643926372043660610226672413497764635239787000143341827693941253721638947580506197728500367325524850325027980531066702962726949006217630290236644410746181942256812170056772600756232506116738493114591218127323741133163913140583529684827066023347088796194846253682954154504336640429027403657831470993825621749318372332546269811820953216261135662418531598954663771775691648448615131802158937156803324423733802071166119966224716088242291968098450309032800335049617861465
2514645349791449916465355128335954929464444612258498884322250411584328344530925790221013632799576102047787468232441470392901627580493383471719532612816534848391408601920539266665550346246343040368608757429591392784807798812848893304745441721044204602414415725300562075953290154457726382683020925406187524758708694161689285261293920782115270550960717687322240202298426363733065475031448888618026711435993053991653780694485897784243346859087028197560255857091562150381619708471192080403868115055265681423866891707972356449212236920210992128063075734725292359699578940877165584175851667803049667020005978253573912096358469050409541237248593428640028573437783747958382446666712845099931003578559688134007238753677011257181086592677636834099341020870502521085827878680362572013623469761170961943916356175726242515624843837354222489536469472365937707615959657846428001149463801728084949088483783942894784080451399167982957006474558
\ No newline at end of file
2514645349791449916465355128335954929464444612258498884322250411584328344530925790221013632799576102047787468232441470392901627580493383471719532612816534848391408601920539266665550346246343040368608757429591392784807798812848893304745441721044204602414415725300562075953290154457726382683020925406187524758708694161689285261293920782115270550960717687322240202298426363733065475031448888618026711435993053991653780694485897784243346859087028197560255857091562150381619708471192080403868115055265681423866891707972356449212236920210992128063075734725292359699578940877165584175851667803049667020005978253573912096358469050409541237248593428640028573437783747958382446666712845099931003578559688134007238753677011257181086592677636834099341020870502521085827878680362572013623469761170961943916356175726242515624843837354222489536469472365937707615959657846428001149463801728084949088483783942894784080451399167982957006474558
......@@ -21,4 +21,3 @@ server=yq01-hpc-lvliang01-smart-master.dmop.baidu.com
python_tar=./python.tar.gz
wheel=./paddlepaddle-0.0.0-cp27-cp27mu-linux_x86_64.whl
import paddle.fluid as fluid
class Model(object):
def __init__(self):
pass
......@@ -9,8 +10,8 @@ class Model(object):
self.fc1 = fluid.layers.fc(input=self.concat, size=256, act='relu')
self.fc2 = fluid.layers.fc(input=self.fc1, size=128, act='relu')
self.predict = fluid.layers.fc(input=self.fc2, size=2, act='softmax')
self.sum_cost = fluid.layers.cross_entropy(input=self.predict, label=label)
self.sum_cost = fluid.layers.cross_entropy(
input=self.predict, label=label)
self.accuracy = fluid.layers.accuracy(input=self.predict, label=label)
self.loss = fluid.layers.reduce_mean(self.sum_cost)
self.startup_program = fluid.default_startup_program()
......@@ -49,6 +49,7 @@ default_dict = {
"wheel": "./paddlepaddle-0.0.0-cp27-cp27mu-linux_x86_64-0.whl"
}
def load_conf(conf_file, local_dict):
with open(conf_file) as fin:
for line in fin:
......@@ -58,6 +59,7 @@ def load_conf(conf_file, local_dict):
local_dict[group[0]] = group[1]
return local_dict
client = HPCClient()
default_dict = load_conf(sys.argv[1], default_dict)
......@@ -94,9 +96,11 @@ all_ips_ready = False
ip_list = []
scheduler = FLScheduler(int(default_dict["worker_nodes"]),
int(default_dict["server_nodes"]),
port=random_port, socket=zmq_socket)
scheduler = FLScheduler(
int(default_dict["worker_nodes"]),
int(default_dict["server_nodes"]),
port=random_port,
socket=zmq_socket)
scheduler.set_sample_worker_num(int(default_dict["worker_nodes"]))
......@@ -121,12 +125,14 @@ print(ip_list)
# allocate a role and id to each endpoint
ip_role = {}
for i in range(len(ip_list)):
if i < int(default_dict["server_nodes"]):
ip_role[ip_list[i]] = 'server%d' % i
else:
ip_role[ip_list[i]] = 'trainer%d' % (i-int(default_dict["server_nodes"]))
if i < int(default_dict["server_nodes"]):
ip_role[ip_list[i]] = 'server%d' % i
else:
ip_role[ip_list[i]] = 'trainer%d' % (
i - int(default_dict["server_nodes"]))
print(ip_role)
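The role assignment above is purely positional: the first server_nodes endpoints become servers and the remainder become trainers. For example, with one server node and two workers (hypothetical IPs):

ip_list = ["10.0.0.1:8181", "10.0.0.2:9000", "10.0.0.3:9000"]  # hypothetical
server_nodes = 1
ip_role = {}
for i, ip in enumerate(ip_list):
    ip_role[ip] = ('server%d' % i if i < server_nodes
                   else 'trainer%d' % (i - server_nodes))
print(ip_role)
# {'10.0.0.1:8181': 'server0', '10.0.0.2:9000': 'trainer0',
#  '10.0.0.3:9000': 'trainer1'}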
def job_generate():
#generate a fl job which is the same as fl_master
inputs = [fluid.layers.data( \
......@@ -146,8 +152,8 @@ def job_generate():
job_generator.set_optimizer(optimizer)
job_generator.set_losses([model.loss])
job_generator.set_startup_program(model.startup_program)
job_generator.set_infer_feed_and_target_names(
[x.name for x in inputs], [model.predict.name])
job_generator.set_infer_feed_and_target_names([x.name for x in inputs],
[model.predict.name])
build_strategy = FLStrategyFactory()
build_strategy.fed_avg = True
......@@ -157,20 +163,24 @@ def job_generate():
# endpoints will be collected through the cluster
# in this example, we suppose endpoints have been collected
server_ip = ["{}".format(ip_list[0])]
output = "job_config"
job_generator.generate_fl_job(
strategy, server_endpoints=server_ip, worker_num=int(default_dict["worker_nodes"]), output=output)
strategy,
server_endpoints=server_ip,
worker_num=int(default_dict["worker_nodes"]),
output=output)
file_list = os.listdir(output)
for file in file_list:
tar = tarfile.open('{}/{}.tar.gz'.format(output,file),'w:gz')
for root,dir,files in os.walk("{}/{}".format(output,file)):
for f in files:
fullpath = os.path.join(root,f)
tar.add(fullpath)
tar = tarfile.open('{}/{}.tar.gz'.format(output, file), 'w:gz')
for root, dir, files in os.walk("{}/{}".format(output, file)):
for f in files:
fullpath = os.path.join(root, f)
tar.add(fullpath)
tar.close()
job_generate()
# send the allocated roles to the remote endpoints
......
......@@ -13,7 +13,6 @@ import sys
import logging
import time
random_port = 60001
scheduler_conf = {}
......@@ -31,8 +30,7 @@ download_url = "{}:8080".format(scheduler_ip[0])
print(download_url)
context = zmq.Context()
zmq_socket = context.socket(zmq.REQ)
zmq_socket.connect(
"tcp://{}".format(scheduler_conf["ENDPOINT"]))
zmq_socket.connect("tcp://{}".format(scheduler_conf["ENDPOINT"]))
zmq_socket.send("ENDPOINT\t{}".format(endpoint))
message = zmq_socket.recv()
print(message)
......@@ -47,7 +45,7 @@ while True:
if group[0] == "WAIT":
continue
else:
os.system("wget {}/job_config/{}.tar.gz".format(download_url,message))
os.system("wget {}/job_config/{}.tar.gz".format(download_url, message))
print(message)
break
......@@ -71,6 +69,7 @@ if 'server' in message:
server._current_ep = endpoint
server.start()
else:
def reader():
for i in range(1000):
data_dict = {}
......@@ -96,7 +95,7 @@ else:
for data in reader():
trainer.run(feed=data, fetch=[])
step_i += 1
if step_i == trainer._step:
if step_i == trainer._step:
break
epoch_id += 1
if epoch_id % 5 == 0:
......