From 98b071f8fe41860468185255ae29321852b6e17e Mon Sep 17 00:00:00 2001 From: Webbley Date: Tue, 21 Apr 2020 17:11:50 +0800 Subject: [PATCH] update distribute_metapath2vec --- examples/distribute_metapath2vec/README.md | 20 +++++++++++++++----- examples/distribute_metapath2vec/config.yaml | 2 +- examples/distribute_metapath2vec/walker.py | 7 +++++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/examples/distribute_metapath2vec/README.md b/examples/distribute_metapath2vec/README.md index de31aa3..715850e 100644 --- a/examples/distribute_metapath2vec/README.md +++ b/examples/distribute_metapath2vec/README.md @@ -2,17 +2,17 @@ [metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is a algorithm framework for representation learning in heterogeneous networks which contains multiple types of nodes and links. Given a heterogeneous graph, metapath2vec algorithm first generates meta-path-based random walks and then use skipgram model to train a language model. Based on PGL, we reproduce metapath2vec algorithm in distributed mode. -## Datasets +### Datasets DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T). There are 33791 nodes in this dataset. You can dowload datasets from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding) We use the ```DBLP``` dataset for example. After downloading the dataset, put them, let's say, in ```./data/DBLP/``` . -## Dependencies +### Dependencies - paddlepaddle>=1.6 - pgl>=1.0.0 -## How to run +### How to run Before training, run the below command to do data preprocessing. 
```sh python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed @@ -30,11 +30,21 @@ python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_pa ``` +### Model Selection +Actually, there are 3 models in this example; they are ```metapath2vec```, ```metapath2vec++``` and ```multi_metapath2vec++```. You can select different models by modifying ```config.yaml```. -## Hyperparameters +In order to run the ```metapath2vec++``` model, you can easily rewrite the hyper parameter **neg_sample_type** to **m2v_plus**, and then the ```metapath2vec++``` model will be selected. + +```multi-metapath2vec++``` means that you do not use only a single metapath; instead, you can use several metapaths at the same time to train the model. For example, you might want to use ```c2p-p2a-a2p-p2c``` and ```p2a-a2p``` simultaneously. Then you can rewrite the hyper parameters below in ```config.yaml```. +- **neg_sample_type**: "m2v_plus" +- **walk_mode**: "multi_m2v" +- **meta_path**: "c2p-p2a-a2p-p2c;p2a-a2p" +- **first_node_type**: "c;p" + +### Hyperparameters All the hyper parameters are saved in ```config.yaml``` file. So before training, you can open the config.yaml to modify the hyper parameters as you like. -Some important hyper parameters in config.yaml: +Some important hyper parameters in ```config.yaml```: - **edge_path**: the directory of graph data that you want to load - **lr**: learning rate - **neg_num**: number of negative samples. 
diff --git a/examples/distribute_metapath2vec/config.yaml b/examples/distribute_metapath2vec/config.yaml index 30506df..db3f38c 100644 --- a/examples/distribute_metapath2vec/config.yaml +++ b/examples/distribute_metapath2vec/config.yaml @@ -31,7 +31,7 @@ is_distributed: False # trainging config epochs: 10 optimizer: "sgd" -lr: 1.0 +lr: 0.1 warm_start_from_dir: null walkpath_files: "None" train_files: "None" diff --git a/examples/distribute_metapath2vec/walker.py b/examples/distribute_metapath2vec/walker.py index db340f9..6503766 100644 --- a/examples/distribute_metapath2vec/walker.py +++ b/examples/distribute_metapath2vec/walker.py @@ -87,9 +87,12 @@ class NodeGenerator(object): idx = cc % num_n_type n_type = n_type_list[idx] try: - nodes = node_generators[n_type].next() + nodes = next(node_generators[n_type]) except StopIteration as e: - log.info("exception when iteration") + log.info("node type of %s iteration finished in one epoch" % + (n_type)) + node_generators[n_type] = \ + self.graph.node_batch_iter(self.batch_size, n_type=n_type) break yield (nodes, idx) cc += 1 -- GitLab