From 0bdc0da99baa6328266c3d43d91ae4bf09a05d05 Mon Sep 17 00:00:00 2001
From: liweibin
Date: Fri, 25 Oct 2019 15:24:02 +0800
Subject: [PATCH] update examples

---
 examples/dgi/README.md                        |  2 +-
 examples/distribute_deepwalk/README.md        |  5 +++-
 examples/distribute_graphsage/README.md       |  2 ++
 examples/distribute_graphsage/reader.py       |  9 ++++----
 .../distribute_graphsage/requirements.txt     |  1 +
 examples/distribute_graphsage/train.py        |  4 +---
 examples/line/README.md                       |  2 +-
 examples/line/line.py                         | 16 ++++++++++---
 examples/line/multi_class.py                  | 23 ++++++++++++++++---
 examples/node2vec/README.md                   |  3 ++-
 examples/strucvec/classify.py                 |  6 +++--
 examples/strucvec/sklearn_classify.py         |  7 ++++--
 12 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/examples/dgi/README.md b/examples/dgi/README.md
index 34481e0..85c9bb0 100644
--- a/examples/dgi/README.md
+++ b/examples/dgi/README.md
@@ -1,4 +1,4 @@
-# PGL Examples for DGI
+# PGL Examples for Deep Graph Infomax
 [Deep Graph Infomax \(DGI\)](https://arxiv.org/abs/1809.10341) is a general approach for learning node representations within graph-structured data in an unsupervised manner. DGI relies on maximizing mutual information between patch representations and corresponding high-level summaries of graphs---both derived using established graph convolutional network architectures.
 
 
diff --git a/examples/distribute_deepwalk/README.md b/examples/distribute_deepwalk/README.md
index 24775aa..08e5892 100644
--- a/examples/distribute_deepwalk/README.md
+++ b/examples/distribute_deepwalk/README.md
@@ -1,5 +1,6 @@
 # PGL Examples for distributed deepwalk
 [Deepwalk](https://arxiv.org/pdf/1403.6652.pdf) is an algorithmic framework for representational learning on graphs. Given any graph, it can learn continuous feature representations for the nodes, which can then be used for various downstream machine learning tasks. Based on PGL, we reproduce distributed deepwalk algorithms and reach the same level of indicators as the paper.
+
 ## Datasets
 The datasets contain two networks: [BlogCatalog](http://socialcomputing.asu.edu/datasets/BlogCatalog3).
 ## Dependencies
@@ -8,7 +9,9 @@ The datasets contain two networks: [BlogCatalog](http://socialcomputing.asu.edu/
 
 ## How to run
 
-For examples, train deepwalk in distributed mode on cora dataset.
+We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. ```pgl_deepwalk.cfg``` is the config file for deepwalk hyperparameters and ```local_config``` is the config file for parameter servers. By default, we have 2 pservers and 2 trainers. You can use ```cloud_run.sh``` to start up the parameter servers and model trainers.
+
+For example, train deepwalk in distributed mode on the BlogCatalog dataset.
 ```sh
 # train deepwalk in distributed mode.
 sh cloud_run.sh
diff --git a/examples/distribute_graphsage/README.md b/examples/distribute_graphsage/README.md
index 7e424d4..0ce196f 100644
--- a/examples/distribute_graphsage/README.md
+++ b/examples/distribute_graphsage/README.md
@@ -55,3 +55,5 @@ python train.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --sample_w
 - samples_1: The max neighbors for the first hop neighbor sampling. (default: 25)
 - samples_2: The max neighbors for the second hop neighbor sampling. (default: 10)
 - hidden_size: The hidden size of the GraphSAGE models.
+
+
diff --git a/examples/distribute_graphsage/reader.py b/examples/distribute_graphsage/reader.py
index 9c230cc..6617b6b 100644
--- a/examples/distribute_graphsage/reader.py
+++ b/examples/distribute_graphsage/reader.py
@@ -89,8 +89,7 @@ def worker(batch_info, graph_wrapper, samples):
                 if len(start_nodes) == 0:
                     break
-            subgraph = graph.subgraph(
-                nodes=nodes, eid=eids, edges=[eid2edges[e] for e in eids])
+            subgraph = graph.subgraph(nodes=nodes, eid=eids, edges=[ eid2edges[e] for e in eids])
             sub_node_index = subgraph.reindex_from_parrent_nodes(
                 batch_train_samples)
             feed_dict = graph_wrapper.to_feed(subgraph)
 
@@ -103,7 +102,8 @@ def worker(batch_info, graph_wrapper, samples):
     return work
 
 
-def multiprocess_graph_reader(graph_wrapper,
+def multiprocess_graph_reader(
+        graph_wrapper,
                               samples,
                               node_index,
                               batch_size,
@@ -138,7 +138,7 @@ def multiprocess_graph_reader(graph_wrapper,
         reader_pool = []
         for i in range(num_workers):
             reader_pool.append(
-                worker(batch_info[block_size * i:block_size * (i + 1)],
+                worker(batch_info[block_size * i:block_size * (i + 1)], 
                        graph_wrapper, samples))
         multi_process_sample = mp_reader.multiprocess_reader(
             reader_pool, use_pipe=True, queue_size=1000)
@@ -146,3 +146,4 @@ def multiprocess_graph_reader(graph_wrapper,
         return paddle.reader.buffered(r, 1000)
 
     return reader()
+
diff --git a/examples/distribute_graphsage/requirements.txt b/examples/distribute_graphsage/requirements.txt
index bfc094c..7bda67a 100644
--- a/examples/distribute_graphsage/requirements.txt
+++ b/examples/distribute_graphsage/requirements.txt
@@ -1,3 +1,4 @@
 scipy
 redis==2.10.6
 redis-py-cluster==1.3.6
+
diff --git a/examples/distribute_graphsage/train.py b/examples/distribute_graphsage/train.py
index cb62acf..fa52e3e 100644
--- a/examples/distribute_graphsage/train.py
+++ b/examples/distribute_graphsage/train.py
@@ -170,9 +170,7 @@ def main(args):
 
     with fluid.program_guard(train_program, startup_program):
         graph_wrapper = pgl.graph_wrapper.GraphWrapper(
-            "sub_graph",
-            fluid.CPUPlace(),
-            node_feat=[('feats', [None, 602], np.dtype('float32'))])
+            "sub_graph", fluid.CPUPlace(), node_feat=[('feats', [None, 602], np.dtype('float32'))])
         model_loss, model_acc = build_graph_model(
             graph_wrapper,
             num_class=data["num_class"],
diff --git a/examples/line/README.md b/examples/line/README.md
index abbe0d4..dfda23d 100644
--- a/examples/line/README.md
+++ b/examples/line/README.md
@@ -36,7 +36,7 @@ For examples, use gpu to train LINE on Flickr dataset.
 # multiclass task example
 python line.py --use_cuda --order first_order --data_path ./data/flickr/ --save_dir ./checkpoints/model/
 
-python multi_class.py --ckpt_path ./checkpoints/model/model_eopch_20 --percent 0.5
+python multi_class.py --ckpt_path ./checkpoints/model/model_epoch_20 --percent 0.5
 
 ```
 
diff --git a/examples/line/line.py b/examples/line/line.py
index a6e17f6..4c7cc9b 100644
--- a/examples/line/line.py
+++ b/examples/line/line.py
@@ -42,6 +42,16 @@ def make_dir(path):
                 raise
 
 
+def save_param(dirname, var_name_list):
+    """Save the named variables from the global scope as .npy files."""
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+    for var_name in var_name_list:
+        var = fluid.global_scope().find_var(var_name)
+        var_tensor = var.get_tensor()
+        np.save(os.path.join(dirname, var_name + '.npy'), np.array(var_tensor))
+
+
 def set_seed(seed):
     """Set global random seed.
     """
@@ -153,9 +163,9 @@ def main(args):
 
         # save parameters in every epoch
         log.info("saving persistables parameters...")
-        fluid.io.save_persistables(exe,
-                                   os.path.join(args.save_dir, "model_epoch_%d"
-                                                % (epoch + 1)), main_program)
+        cur_save_path = os.path.join(args.save_dir,
+                                     "model_epoch_%d" % (epoch + 1))
+        save_param(cur_save_path, ['shared_w'])
 
 
 if __name__ == '__main__':
diff --git a/examples/line/multi_class.py b/examples/line/multi_class.py
index 3f191f2..00e6165 100644
--- a/examples/line/multi_class.py
+++ b/examples/line/multi_class.py
@@ -33,6 +33,15 @@ from pgl.utils.logger import log
 from data_loader import FlickrDataset
 
 
+def load_param(dirname, var_name_list):
+    """Load the saved .npy files back into the named global-scope variables."""
+    for var_name in var_name_list:
+        var = fluid.global_scope().find_var(var_name)
+        var_tensor = var.get_tensor()
+        var_tmp = np.load(os.path.join(dirname, var_name + '.npy'))
+        var_tensor.set(var_tmp, fluid.CPUPlace())
+
+
 def set_seed(seed):
     """Set global random seed.
     """
@@ -200,12 +209,15 @@ def main(args):
             return False
         return os.path.exists(os.path.join(args.ckpt_path, var.name))
 
-    fluid.io.load_vars(
-        exe, args.ckpt_path, main_program=train_prog, predicate=existed_params)
+    log.info('loading pretrained parameters from npy')
+    load_param(args.ckpt_path, ['shared_w'])
+
     step = 0
     prev_time = time.time()
     train_model['pyreader'].start()
 
+    final_macro_f1 = 0.0
+    final_micro_f1 = 0.0
     while 1:
         try:
             train_loss_val, train_probs_val, train_labels_val, train_topk_val = exe.run(
@@ -257,8 +269,13 @@ def main(args):
                 log.info("\t\tStep %d " % step + "Test Loss: %f " % test_loss_val +
                          "Test Macro F1: %f " % test_macro_f1 +
                          "Test Micro F1: %f " % test_micro_f1)
+                final_macro_f1 = max(test_macro_f1, final_macro_f1)
+                final_micro_f1 = max(test_micro_f1, final_micro_f1)
                 break
 
+    log.info("\nFinal test Macro F1: %f " % final_macro_f1 +
+             "Final test Micro F1: %f " % final_micro_f1)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='LINE')
@@ -268,7 +285,7 @@ if __name__ == '__main__':
         default='./data/flickr/',
         help='dataset for training')
     parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
-    parser.add_argument("--epochs", type=int, default=10)
+    parser.add_argument("--epochs", type=int, default=5)
     parser.add_argument("--seed", type=int, default=1667)
     parser.add_argument(
         "--lr", type=float, default=0.025, help='learning rate')
diff --git a/examples/node2vec/README.md b/examples/node2vec/README.md
index 4339c40..f2305f0 100644
--- a/examples/node2vec/README.md
+++ b/examples/node2vec/README.md
@@ -16,7 +16,8 @@ python node2vec.py --use_cuda --dataset BlogCatalog --save_path ./tmp/node2vec_B
 python multi_class.py --use_cuda --ckpt_path ./tmp/node2vec_BlogCatalog/paddle_model --epoch 1000
 
 # link prediction task example
-python node2vec.py --use_cuda --dataset ArXiv --save_path ./tmp/node2vec_ArXiv --offline_learning --epoch 400
+python node2vec.py --use_cuda --dataset ArXiv --save_path \
+./tmp/node2vec_ArXiv --offline_learning --epoch 10
 python link_predict.py --use_cuda --ckpt_path ./tmp/node2vec_ArXiv/paddle_model --epoch 400
 ```
 
diff --git a/examples/strucvec/classify.py b/examples/strucvec/classify.py
index daaa87e..d36bf6a 100644
--- a/examples/strucvec/classify.py
+++ b/examples/strucvec/classify.py
@@ -1,4 +1,7 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+"""
+classify.py
+"""
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +14,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/examples/strucvec/sklearn_classify.py b/examples/strucvec/sklearn_classify.py
index b2dde24..07e5c87 100644
--- a/examples/strucvec/sklearn_classify.py
+++ b/examples/strucvec/sklearn_classify.py
@@ -1,4 +1,7 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+"""
+sklearn_classify.py
+"""
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,7 +39,7 @@ def train_lr_l2_model(args, data):
         test_size=0.2,
         random_state=random_num + random_seed)
 
-    # use the one vs rest to train the lr model with l2
+    # use the one vs rest to train the lr model with l2 
     pred_test = []
     for i in range(0, args.num_class):
         y_train_relabel = np.where(y_train == i, 1, 0)
-- 
GitLab
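
A note on the checkpointing change above: `line.py` now dumps the embedding parameter to plain `.npy` files with `save_param` instead of `fluid.io.save_persistables`, and `multi_class.py` reads it back with `load_param` instead of `fluid.io.load_vars`. The sketch below shows that round-trip in isolation. It is a minimal illustration rather than part of the patch: it assumes PaddlePaddle 1.x (fluid), and the parameter name `embedding_w` and the toy program are hypothetical stand-ins for the `shared_w` embedding used by the LINE example.

```python
# Minimal sketch of the .npy checkpoint pattern introduced above.
# Assumptions: PaddlePaddle 1.x (fluid); 'embedding_w' is a hypothetical
# stand-in for the 'shared_w' parameter of the LINE example.
import os

import numpy as np
import paddle.fluid as fluid


def save_param(dirname, var_name_list):
    """Dump each named variable from the global scope to <dirname>/<name>.npy."""
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    for var_name in var_name_list:
        tensor = fluid.global_scope().find_var(var_name).get_tensor()
        np.save(os.path.join(dirname, var_name + '.npy'), np.array(tensor))


def load_param(dirname, var_name_list, place):
    """Load <dirname>/<name>.npy back into the named global-scope variables."""
    for var_name in var_name_list:
        tensor = fluid.global_scope().find_var(var_name).get_tensor()
        tensor.set(np.load(os.path.join(dirname, var_name + '.npy')), place)


if __name__ == '__main__':
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Toy program with a single trainable parameter standing in for the
    # LINE embedding table.
    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        fluid.layers.create_parameter(
            shape=[10, 8], dtype='float32', name='embedding_w')
    exe.run(startup_prog)

    save_param('./checkpoints/demo', ['embedding_w'])
    load_param('./checkpoints/demo', ['embedding_w'], place)
```

Compared with `save_persistables`, this stores only the named arrays, so the checkpoint directory holds just the node embeddings that `multi_class.py` needs rather than the full persistable state of the training program.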