diff --git a/examples/dgi/README.md b/examples/dgi/README.md
index 34481e06d4576c7969fe68c7a7b6a5f5e319d0c9..85c9bb08b986c508feb2c7620ceb2a2f78dfcc3e 100644
--- a/examples/dgi/README.md
+++ b/examples/dgi/README.md
@@ -1,4 +1,4 @@
-# PGL Examples for DGI
+# PGL Examples for Deep Graph Infomax
 [Deep Graph Infomax \(DGI\)](https://arxiv.org/abs/1809.10341) is a general approach for learning node representations within graph-structured data in an unsupervised manner. DGI relies on maximizing mutual information between patch representations and corresponding high-level summaries of graphs---both derived using established graph convolutional network architectures.
diff --git a/examples/distribute_deepwalk/README.md b/examples/distribute_deepwalk/README.md
index 24775aaae8bfbae11c69a68fa6eb49c9863a0ae3..08e58922142d636103e8c62bfafb0bde9ad8f0bd 100644
--- a/examples/distribute_deepwalk/README.md
+++ b/examples/distribute_deepwalk/README.md
@@ -1,5 +1,6 @@
 # PGL Examples for distributed deepwalk
 [Deepwalk](https://arxiv.org/pdf/1403.6652.pdf) is an algorithmic framework for representational learning on graphs. Given any graph, it can learn continuous feature representations for the nodes, which can then be used for various downstream machine learning tasks. Based on PGL, we reproduce distributed deepwalk algorithms and reach the same level of indicators as the paper.
+
 ## Datasets
 The datasets contain two networks: [BlogCatalog](http://socialcomputing.asu.edu/datasets/BlogCatalog3).
 ## Dependencies
@@ -8,7 +9,9 @@ The datasets contain two networks: [BlogCatalog](http://socialcomputing.asu.edu/
 ## How to run
-For examples, train deepwalk in distributed mode on cora dataset.
+We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. ```pgl_deepwalk.cfg``` is the config file for the deepwalk hyperparameters, and ```local_config``` is the config file for the parameter servers. By default, we use 2 pservers and 2 trainers. You can use ```cloud_run.sh``` to start up the parameter servers and model trainers.
+
+For example, to train deepwalk in distributed mode on the BlogCatalog dataset:
 ```sh
 # train deepwalk in distributed mode.
 sh cloud_run.sh
diff --git a/examples/distribute_graphsage/README.md b/examples/distribute_graphsage/README.md
index 7e424d4668955476eecf459ade84603b71e26ebe..0ce196f6417b676f8d1853f14c012bd86d5972ef 100644
--- a/examples/distribute_graphsage/README.md
+++ b/examples/distribute_graphsage/README.md
@@ -55,3 +55,5 @@ python train.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --sample_w
 - samples_1: The max neighbors for the first hop neighbor sampling. (default: 25)
 - samples_2: The max neighbors for the second hop neighbor sampling. (default: 10)
 - hidden_size: The hidden size of the GraphSAGE models.
+
+
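Note on the `samples_1` / `samples_2` flags documented above: they cap the fan-out of the first- and second-hop neighbor sampling. Below is a minimal, standalone Python sketch of that idea; the toy adjacency dict and the `two_hop_sample` helper are illustrative only, not PGL's actual sampler.

```python
import random


def two_hop_sample(adj, seed_nodes, samples_1=25, samples_2=10, rng=random):
    """Toy two-hop neighbor sampling with capped fan-out.

    adj: dict mapping node -> list of neighbor nodes.
    samples_1 / samples_2: max neighbors kept per node at hop 1 / hop 2,
    mirroring the README flags above.
    """
    def sample_neighbors(nodes, k):
        sampled = {}
        for n in nodes:
            neigh = adj.get(n, [])
            # keep at most k neighbors per node
            sampled[n] = neigh if len(neigh) <= k else rng.sample(neigh, k)
        return sampled

    hop1 = sample_neighbors(seed_nodes, samples_1)
    frontier = {v for neigh in hop1.values() for v in neigh}
    hop2 = sample_neighbors(frontier, samples_2)
    return hop1, hop2


if __name__ == "__main__":
    adj = {0: [1, 2, 3], 1: [0, 2], 2: [0, 1, 3], 3: [0, 2]}
    print(two_hop_sample(adj, [0], samples_1=2, samples_2=1))
```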
diff --git a/examples/distribute_graphsage/reader.py b/examples/distribute_graphsage/reader.py
index 9c230cce3c40161881163a0446ab51977ecc9700..6617b6b86fe08facee1915edcd459a8c706c4191 100644
--- a/examples/distribute_graphsage/reader.py
+++ b/examples/distribute_graphsage/reader.py
@@ -89,8 +89,7 @@ def worker(batch_info, graph_wrapper, samples):
             if len(start_nodes) == 0:
                 break
-            subgraph = graph.subgraph(
-                nodes=nodes, eid=eids, edges=[eid2edges[e] for e in eids])
+            subgraph = graph.subgraph(nodes=nodes, eid=eids, edges=[eid2edges[e] for e in eids])
             sub_node_index = subgraph.reindex_from_parrent_nodes(
                 batch_train_samples)
             feed_dict = graph_wrapper.to_feed(subgraph)
@@ -103,7 +102,8 @@ def worker(batch_info, graph_wrapper, samples):
     return work


-def multiprocess_graph_reader(graph_wrapper,
+def multiprocess_graph_reader(
+        graph_wrapper,
                               samples,
                               node_index,
                               batch_size,
@@ -138,7 +138,7 @@ def multiprocess_graph_reader(graph_wrapper,
         reader_pool = []
         for i in range(num_workers):
             reader_pool.append(
-                worker(batch_info[block_size * i:block_size * (i + 1)],
+                worker(batch_info[block_size * i:block_size * (i + 1)],
                        graph_wrapper, samples))
         multi_process_sample = mp_reader.multiprocess_reader(
             reader_pool, use_pipe=True, queue_size=1000)
@@ -146,3 +146,4 @@ def multiprocess_graph_reader(graph_wrapper,
         return paddle.reader.buffered(r, 1000)
     return reader()
+
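For context on the `block_size` slicing touched in the hunk above: `multiprocess_graph_reader` hands each of the `num_workers` readers one contiguous slice of `batch_info`. A self-contained sketch of that partitioning follows; the `split_batches` helper and the ceiling-style block size are assumptions for illustration, since the real computation lives outside this hunk.

```python
def split_batches(batch_info, num_workers):
    """Split batch_info into num_workers contiguous blocks, mirroring the
    batch_info[block_size * i:block_size * (i + 1)] slicing in the reader."""
    # ceiling-style block size so every batch is covered (assumption)
    block_size = int(len(batch_info) / num_workers) + 1
    return [
        batch_info[block_size * i:block_size * (i + 1)]
        for i in range(num_workers)
    ]


if __name__ == "__main__":
    batches = list(range(10))  # stand-in for the real list of batches
    for i, block in enumerate(split_batches(batches, num_workers=3)):
        print("worker", i, "gets", block)
```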
diff --git a/examples/distribute_graphsage/requirements.txt b/examples/distribute_graphsage/requirements.txt
index bfc094c11eb1cebe90f8acbc4c399eb688d9c7cb..7bda67a20635218a8786cfb872cfd2da5b2ddbe1 100644
--- a/examples/distribute_graphsage/requirements.txt
+++ b/examples/distribute_graphsage/requirements.txt
@@ -1,3 +1,4 @@
 scipy
 redis==2.10.6
 redis-py-cluster==1.3.6
+
diff --git a/examples/distribute_graphsage/train.py b/examples/distribute_graphsage/train.py
index cb62acf8c20b1e63126c6ff20687d6e59f597d7f..fa52e3e002b52e14db5ea4e893377476eada41ef 100644
--- a/examples/distribute_graphsage/train.py
+++ b/examples/distribute_graphsage/train.py
@@ -170,9 +170,7 @@ def main(args):
     with fluid.program_guard(train_program, startup_program):
         graph_wrapper = pgl.graph_wrapper.GraphWrapper(
-            "sub_graph",
-            fluid.CPUPlace(),
-            node_feat=[('feats', [None, 602], np.dtype('float32'))])
+            "sub_graph", fluid.CPUPlace(), node_feat=[('feats', [None, 602], np.dtype('float32'))])
         model_loss, model_acc = build_graph_model(
             graph_wrapper,
             num_class=data["num_class"],
diff --git a/examples/line/README.md b/examples/line/README.md
index abbe0d4e28e6ae47e72e6aeec6a7b70f04ae988d..dfda23d599da9e8b4a58e4b637962e4fe9860e36 100644
--- a/examples/line/README.md
+++ b/examples/line/README.md
@@ -36,7 +36,7 @@ For examples, use gpu to train LINE on Flickr dataset.
 # multiclass task example
 python line.py --use_cuda --order first_order --data_path ./data/flickr/ --save_dir ./checkpoints/model/
-python multi_class.py --ckpt_path ./checkpoints/model/model_eopch_20 --percent 0.5
+python multi_class.py --ckpt_path ./checkpoints/model/model_epoch_20 --percent 0.5
 ```
diff --git a/examples/line/line.py b/examples/line/line.py
index a6e17f6d20122f806cdc27e88abdfb1585fc99a2..4c7cc9b5540c2399b376a57fe810d9af3ba0119c 100644
--- a/examples/line/line.py
+++ b/examples/line/line.py
@@ -42,6 +42,16 @@ def make_dir(path):
             raise


+def save_param(dirname, var_name_list):
+    """Save the named global-scope variables as .npy files under dirname."""
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+    for var_name in var_name_list:
+        var = fluid.global_scope().find_var(var_name)
+        var_tensor = var.get_tensor()
+        np.save(os.path.join(dirname, var_name + '.npy'), np.array(var_tensor))
+
+
 def set_seed(seed):
     """Set global random seed.
     """
@@ -153,9 +163,9 @@ def main(args):
         # save parameters in every epoch
         log.info("saving persistables parameters...")
-        fluid.io.save_persistables(exe,
-                                   os.path.join(args.save_dir, "model_epoch_%d"
-                                                % (epoch + 1)), main_program)
+        cur_save_path = os.path.join(args.save_dir,
+                                     "model_epoch_%d" % (epoch + 1))
+        save_param(cur_save_path, ['shared_w'])


 if __name__ == '__main__':
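The `save_param` helper added to `line.py` above pairs with the `load_param` helper added to `multi_class.py` below: each parameter is exchanged as a plain `<save_dir>/<var_name>.npy` file named after the variable. A minimal sketch of that round trip, using an in-memory dict in place of `fluid.global_scope()` (the dict and the paths here are illustrative only):

```python
import os

import numpy as np


def save_param(dirname, params, var_name_list):
    """Write each named array in params to <dirname>/<var_name>.npy."""
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    for var_name in var_name_list:
        np.save(os.path.join(dirname, var_name + '.npy'), params[var_name])


def load_param(dirname, var_name_list):
    """Read each <dirname>/<var_name>.npy back into a dict of arrays."""
    return {
        var_name: np.load(os.path.join(dirname, var_name + '.npy'))
        for var_name in var_name_list
    }


if __name__ == "__main__":
    params = {'shared_w': np.random.rand(5, 3).astype('float32')}
    save_param('/tmp/model_epoch_1', params, ['shared_w'])
    restored = load_param('/tmp/model_epoch_1', ['shared_w'])
    assert np.allclose(params['shared_w'], restored['shared_w'])
```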
""" @@ -200,12 +209,15 @@ def main(args): return False return os.path.exists(os.path.join(args.ckpt_path, var.name)) - fluid.io.load_vars( - exe, args.ckpt_path, main_program=train_prog, predicate=existed_params) + log.info('loading pretrained parameters from npy') + load_param(args.ckpt_path, ['shared_w']) + step = 0 prev_time = time.time() train_model['pyreader'].start() + final_macro_f1 = 0.0 + final_micro_f1 = 0.0 while 1: try: train_loss_val, train_probs_val, train_labels_val, train_topk_val = exe.run( @@ -257,8 +269,13 @@ def main(args): log.info("\t\tStep %d " % step + "Test Loss: %f " % test_loss_val + "Test Macro F1: %f " % test_macro_f1 + "Test Micro F1: %f " % test_micro_f1) + final_macro_f1 = max(test_macro_f1, final_macro_f1) + final_micro_f1 = max(test_micro_f1, final_micro_f1) break + log.info("\nFinal test Macro F1: %f " % final_macro_f1 + + "Final test Micro F1: %f " % final_micro_f1) + if __name__ == '__main__': parser = argparse.ArgumentParser(description='LINE') @@ -268,7 +285,7 @@ if __name__ == '__main__': default='./data/flickr/', help='dataset for training') parser.add_argument("--use_cuda", action='store_true', help="use_cuda") - parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--epochs", type=int, default=5) parser.add_argument("--seed", type=int, default=1667) parser.add_argument( "--lr", type=float, default=0.025, help='learning rate') diff --git a/examples/node2vec/README.md b/examples/node2vec/README.md index 4339c40b70d822bbd1d1b06aae6431e07d888377..f2305f01b4db4963ae9f76d9f185172934e601e9 100644 --- a/examples/node2vec/README.md +++ b/examples/node2vec/README.md @@ -16,7 +16,8 @@ python node2vec.py --use_cuda --dataset BlogCatalog --save_path ./tmp/node2vec_B python multi_class.py --use_cuda --ckpt_path ./tmp/node2vec_BlogCatalog/paddle_model --epoch 1000 # link prediction task example -python node2vec.py --use_cuda --dataset ArXiv --save_path ./tmp/node2vec_ArXiv --offline_learning --epoch 400 +python node2vec.py --use_cuda --dataset ArXiv --save_path +./tmp/node2vec_ArXiv --offline_learning --epoch 10 python link_predict.py --use_cuda --ckpt_path ./tmp/node2vec_ArXiv/paddle_model --epoch 400 ``` diff --git a/examples/strucvec/classify.py b/examples/strucvec/classify.py index daaa87eb8dee6e117ae1c523c089ddcc51af206f..d36bf6a59415c134ca0d9aa5e3e3c64aa502aedc 100644 --- a/examples/strucvec/classify.py +++ b/examples/strucvec/classify.py @@ -1,4 +1,7 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +""" +classify.py +""" +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +14,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import numpy as np import paddle import paddle.fluid as fluid diff --git a/examples/strucvec/sklearn_classify.py b/examples/strucvec/sklearn_classify.py index b2dde24b9af2f5d9e8455a4efb114ea7e51b3d14..07e5c875a225fca6d409611c8d0941a0129c210d 100644 --- a/examples/strucvec/sklearn_classify.py +++ b/examples/strucvec/sklearn_classify.py @@ -1,4 +1,7 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +""" +sklearn_classify.py +""" +# Copyright (c) 2019 PaddlePaddle Authors. 
diff --git a/examples/strucvec/sklearn_classify.py b/examples/strucvec/sklearn_classify.py
index b2dde24b9af2f5d9e8455a4efb114ea7e51b3d14..07e5c875a225fca6d409611c8d0941a0129c210d 100644
--- a/examples/strucvec/sklearn_classify.py
+++ b/examples/strucvec/sklearn_classify.py
@@ -1,4 +1,7 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+"""
+sklearn_classify.py
+"""
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,7 +39,7 @@ def train_lr_l2_model(args, data):
         test_size=0.2,
         random_state=random_num + random_seed)

-    # use the one vs rest to train the lr model with l2
+    # use the one-vs-rest scheme to train the lr model with l2 regularization
     pred_test = []
     for i in range(0, args.num_class):
         y_train_relabel = np.where(y_train == i, 1, 0)
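The comment in the last hunk refers to this script's one-vs-rest training loop: for each class `i`, labels are binarized with `np.where(y_train == i, 1, 0)` and an L2-regularized logistic regression is fit. A self-contained scikit-learn sketch of that scheme on synthetic data (not the project's actual training code):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# synthetic stand-in for the node embeddings and labels
X, y = make_classification(n_samples=300, n_features=16,
                           n_informative=8, n_classes=4)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# one-vs-rest: one L2-regularized logistic regression per class
scores = []
for i in range(4):
    y_train_relabel = np.where(y_train == i, 1, 0)  # current class vs. rest
    clf = LogisticRegression(penalty='l2', solver='liblinear')
    clf.fit(x_train, y_train_relabel)
    scores.append(clf.predict_proba(x_test)[:, 1])  # probability of class i

pred_test = np.argmax(np.stack(scores, axis=1), axis=1)
print("accuracy:", np.mean(pred_test == y_test))
```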