diff --git a/paddle_fl/dataset/femnist.py b/paddle_fl/dataset/femnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..bac01effee1fc09a20c584ea7d436846f885e246
--- /dev/null
+++ b/paddle_fl/dataset/femnist.py
@@ -0,0 +1,70 @@
+import requests
+import os
+import json
+import tarfile
+import random
+url = "https://paddlefl.bj.bcebos.com/leaf/"
+target_path = "femnist_data"
+tar_path = target_path+".tar.gz"
+print(tar_path)
+
+def download(url):
+    r = requests.get(url)
+    with open(tar_path,'wb') as f:
+        f.write(r.content)
+
+def extract(tar_path):
+    tar = tarfile.open(tar_path, "r:gz")
+    file_names = tar.getnames()
+    for file_name in file_names:
+        tar.extract(file_name)
+
+    tar.close()
+
+def train(trainer_id,inner_step,batch_size,count_by_step):
+    if not os.path.exists(target_path):
+        print("Preparing data...")
+        if not os.path.exists(tar_path):
+            download(url+tar_path)
+        extract(tar_path)
+    def train_data():
+        train_file = open("./femnist_data/train/all_data_%d_niid_0_keep_0_train_9.json" % trainer_id,'r')
+        json_train = json.load(train_file)
+        users = json_train["users"]
+        rand = random.randrange(0,len(users)) # random choose a user from each trainer
+        cur_user = users[rand]
+        print('training using '+cur_user)
+        train_images = json_train["user_data"][cur_user]['x']
+        train_labels = json_train["user_data"][cur_user]['y']
+        if count_by_step:
+            for i in xrange(inner_step*batch_size):
+                yield train_images[i%(len(train_images))], train_labels[i%(len(train_images))]
+        else:
+            for i in xrange(len(train_images)):
+                yield train_images[i], train_labels[i]
+
+        train_file.close()
+
+    return train_data
+
+def test(trainer_id,inner_step,batch_size,count_by_step):
+    if not os.path.exists(target_path):
+        print("Preparing data...")
+        if not os.path.exists(tar_path):
+            download(url+tar_path)
+        extract(tar_path)
+    def test_data():
+        test_file = open("./femnist_data/test/all_data_%d_niid_0_keep_0_test_9.json" % trainer_id, 'r')
+        json_test = json.load(test_file)
+        users = json_test["users"]
+        for user in users:
+            test_images = json_test['user_data'][user]['x']
+            test_labels = json_test['user_data'][user]['y']
+            for i in xrange(len(test_images)):
+                yield test_images[i], test_labels[i]
+
+        test_file.close()
+
+    return test_data
+
+
diff --git a/paddle_fl/examples/femnist_demo/fl_trainer.py b/paddle_fl/examples/femnist_demo/fl_trainer.py
index e4944ae692107714c3a8fe0d8817aa916ef766d5..d70ae267e7f2a6df4f5f5dcaa4f601543f6d13c3 100644
--- a/paddle_fl/examples/femnist_demo/fl_trainer.py
+++ b/paddle_fl/examples/femnist_demo/fl_trainer.py
@@ -1,13 +1,12 @@
 from paddle_fl.core.trainer.fl_trainer import FLTrainerFactory
 from paddle_fl.core.master.fl_job import FLRunTimeJob
+import paddle_fl.dataset.femnist
 import numpy
 import sys
 import paddle
 import paddle.fluid as fluid
 import logging
 import math
-import random
-import json
 
 logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
                     datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
@@ -22,36 +21,6 @@
 trainer.start()
 print(trainer._step)
 test_program = trainer._main_program.clone(for_test=True)
-def data_generater(trainer_id,inner_step,batch_size,count_by_step):
-    train_file = open("./femnist_data/train/all_data_%d_niid_0_keep_0_train_9.json" % trainer_id,'r')
-    test_file = open("./femnist_data/test/all_data_%d_niid_0_keep_0_test_9.json" % trainer_id, 'r')
-    json_train = json.load(train_file)
-    json_test = json.load(test_file)
-    users = json_train["users"]
-    rand = random.randrange(0,len(users)) # random choose a user from each trainer
-    cur_user = users[rand]
-    print('training using '+cur_user)
-    def train_data():
-        train_images = json_train["user_data"][cur_user]['x']
-        train_labels = json_train["user_data"][cur_user]['y']
-        if count_by_step:
-            for i in xrange(inner_step*batch_size):
-                yield train_images[i%(len(train_images))], train_labels[i%(len(train_images))]
-        else:
-            for i in xrange(len(train_images)):
-                yield train_images[i], train_labels[i]
-    def test_data():
-        for user in users:
-            test_images = json_test['user_data'][user]['x']
-            test_labels = json_test['user_data'][user]['y']
-            for i in xrange(len(test_images)):
-                yield test_images[i], test_labels[i]
-
-    train_file.close()
-    test_file.close()
-    return train_data, test_data
-
-
 img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
 label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
@@ -67,13 +36,6 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
     acc_val_mean = numpy.array(acc_set).mean()
     return acc_val_mean
 
-
-
-def compute_privacy_budget(sample_ratio, epsilon, step, delta):
-    E = 2 * epsilon * math.sqrt(step * sample_ratio)
-    print("({0}, {1})-DP".format(E, delta))
-
-
 epoch_id = 0
 step = 0
 epoch = 3000
@@ -90,13 +52,15 @@ while not trainer.stop():
     if epoch_id > epoch:
         break
     print("epoch %d start train" % (epoch_id))
-    train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
+    #train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
     train_reader = paddle.batch(
-        paddle.reader.shuffle(train_data, buf_size=500),
+        paddle.reader.shuffle(paddle_fl.dataset.femnist.train(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step), buf_size=500),
         batch_size=64)
     test_reader = paddle.batch(
-        test_data, batch_size=64)
+        paddle_fl.dataset.femnist.test(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step), batch_size=64)
+
+
     if count_by_step:
         for step_id, data in enumerate(train_reader()):
             acc = trainer.run(feeder.feed(data),
                               fetch=["accuracy_0.tmp_0"])
@@ -116,7 +80,6 @@ while not trainer.stop():
                          train_test_feed=feeder)
     print("Test with epoch %d, accuracy: %s" % (epoch_id, acc_val))
 
-    compute_privacy_budget(sample_ratio=0.001, epsilon=0.1, step=step, delta=0.00001)
     if trainer_id == 0:
         save_dir = (output_folder + "/epoch_%d") % epoch_id
         trainer.save_inference_program(output_folder)
diff --git a/paddle_fl/examples/femnist_demo/run.sh b/paddle_fl/examples/femnist_demo/run.sh
index 7f416b4e8b851c2dcd81ab620ca46baa8ea21e3c..4a32be225fe2145630b3ca581525da00faa58930 100644
--- a/paddle_fl/examples/femnist_demo/run.sh
+++ b/paddle_fl/examples/femnist_demo/run.sh
@@ -1,7 +1,5 @@
 #killall python
-#python fl_master.py
-#sleep 2
-python -u fl_server.py >log/server0.log &
+python fl_master.py
 sleep 2
 python -u fl_scheduler.py >scheduler.log &
 sleep 2
diff --git a/setup.py b/setup.py
index 050d63ff684146c0c390bc3f81af6a6791688197..61a97da5605a1117efd0b84fef6d9003291545ce 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ def python_version():
 max_version, mid_version, min_version = python_version()
 
 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'protobuf >= 3.1.0','paddlepaddle >= 1.6'
+    'six >= 1.10.0', 'protobuf >= 3.1.0','paddlepaddle >= 1.6',
 ]
 
 if max_version < 3:
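For context, the change above moves the FEMNIST reader logic out of the demo script into a reusable paddle_fl.dataset.femnist module: its train() and test() functions download and extract femnist_data on first use and return Paddle reader creators. The sketch below shows how those readers are assembled, mirroring the calls in fl_trainer.py; it is a minimal sketch, not part of the change itself. The trainer_id and inner_step values are hypothetical placeholders (in the demo they come from the loaded FL job and trainer._step), and since the module uses xrange it assumes a Python 2 interpreter with paddlepaddle >= 1.6 installed.

    # Minimal usage sketch of paddle_fl.dataset.femnist (assumptions noted above).
    import paddle
    import paddle_fl.dataset.femnist as femnist

    trainer_id = 0      # hypothetical: index of this trainer in the FL job
    inner_step = 10     # hypothetical: normally trainer._step from the loaded job
    count_by_step = True

    # femnist.train()/test() fetch femnist_data if it is missing, then return
    # reader creators yielding (image, label) samples for this trainer's shard.
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            femnist.train(trainer_id, inner_step=inner_step,
                          batch_size=64, count_by_step=count_by_step),
            buf_size=500),
        batch_size=64)
    test_reader = paddle.batch(
        femnist.test(trainer_id, inner_step=inner_step,
                     batch_size=64, count_by_step=count_by_step),
        batch_size=64)

    # Each call to train_reader() yields batches of up to 64 (image, label) pairs.
    first_batch = next(train_reader())
    print(len(first_batch))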