diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 041ee06ca3b78d358d73bebabb5eaa05205161e1..625f91ebd5ea5276a4d1740a3f4c6ea1e3df1b63 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
hooks:
- id: check-added-large-files
- args: [--maxkb=1024]
+ args: [--maxkb=4096]
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
diff --git a/docs/requirements.txt b/docs/requirements.txt
index fa68258b1d84063d6ca0fb46f55e80e414eb9bdd..4e7960b2517dee1e49324ec1245b1a49595126bc 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,3 +1,7 @@
sphinx==2.1.0
mistune
sphinx_rtd_theme
+numpy >= 1.16.4
+cython >= 0.25.2
+paddlepaddle
+pgl
diff --git a/docs/source/api/pgl.rst b/docs/source/api/pgl.rst
index bf5fc0e29ff53f10f7b1fc1570d178b469e4d3e5..36da0df57a7e8174f813dd4a4c869fff962d8462 100644
--- a/docs/source/api/pgl.rst
+++ b/docs/source/api/pgl.rst
@@ -8,3 +8,4 @@ API Reference
pgl.layers
pgl.data_loader
pgl.utils.paddle_helper
+ pgl.utils.mp_reader
diff --git a/docs/source/api/pgl.utils.mp_reader.rst b/docs/source/api/pgl.utils.mp_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1952a6f3afd0d3309b3de60194b24dae0e33c317
--- /dev/null
+++ b/docs/source/api/pgl.utils.mp_reader.rst
@@ -0,0 +1,7 @@
+pgl.utils.mp\_reader module: MultiProcessing reader helper function for Paddle.
+================================================================================
+
+.. automodule:: pgl.utils.mp_reader
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/source/examples/md/gat_examples.md b/docs/source/examples/md/gat_examples.md
index 51534442d9f8e6c631eb2dc4f16da14adb2ecc8e..fd55c0469afc8e68f515377b9265b7569b35acf3 100644
--- a/docs/source/examples/md/gat_examples.md
+++ b/docs/source/examples/md/gat_examples.md
@@ -32,18 +32,18 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)|
-| --- | --- | --- |---|
-| Cora | ~83% | 0.0188s | 0.0175s |
-| Pubmed | ~78% | 0.0449s | 0.0295s |
-| Citeseer | ~70% | 0.0275 | 0.0253s |
+| Dataset | Accuracy |
+| --- | --- |
+| Cora | ~83% |
+| Pubmed | ~78% |
+| Citeseer | ~70% |
### How to run
diff --git a/docs/source/examples/md/gcn_examples.md b/docs/source/examples/md/gcn_examples.md
index 37af6683695944ead0ad21b6068537fb995273f1..11d5f4ee88aeb46575da8ee7b26b24bfb837b85a 100644
--- a/docs/source/examples/md/gcn_examples.md
+++ b/docs/source/examples/md/gcn_examples.md
@@ -27,18 +27,18 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)|
-| --- | --- | --- |---|
-| Cora | ~81% | 0.0106s | 0.0104s |
-| Pubmed | ~79% | 0.0210s | 0.0154s |
-| Citeseer | ~71% | 0.0175s | 0.0177s |
+| Dataset | Accuracy |
+| --- | --- |
+| Cora | ~81% |
+| Pubmed | ~79% |
+| Citeseer | ~71% |
### How to run
diff --git a/docs/source/examples/md/graphsage_examples.md b/docs/source/examples/md/graphsage_examples.md
index c52c2e04740517e69705f540c7bd23e5789ad536..6ad6901fd464752fb47a81a07604e706d59ee56b 100644
--- a/docs/source/examples/md/graphsage_examples.md
+++ b/docs/source/examples/md/graphsage_examples.md
@@ -12,7 +12,7 @@ The reddit dataset should be downloaded from the following links and placed in d
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### How to run
@@ -22,6 +22,14 @@ To train a GraphSAGE model on Reddit Dataset, you can just run
python train.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --normalize --symmetry
```
+If you want to train a GraphSAGE model with multiple GPUs, you can just run
+
+```
+CUDA_VISIBLE_DEVICES=0,1 python train_multi.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --normalize --symmetry --num_trainer 2
+```
+
+
+
#### Hyperparameters
- epoch: Number of epochs default (10)
diff --git a/docs/source/examples/md/node2vec_examples.md b/docs/source/examples/md/node2vec_examples.md
index 7324b9ddaf7d4f4b0fc6e2ca30164bc0462cfe3e..981c39f3f0c3e1cdd33137deeb1e40b935773b2d 100644
--- a/docs/source/examples/md/node2vec_examples.md
+++ b/docs/source/examples/md/node2vec_examples.md
@@ -5,7 +5,7 @@
## Datasets
The datasets contain two networks: [BlogCatalog](http://socialcomputing.asu.edu/datasets/BlogCatalog3) and [Arxiv](http://snap.stanford.edu/data/ca-AstroPh.html).
## Dependencies
-- paddlepaddle>=1.4
+- paddlepaddle>=1.6
- pgl
## How to run
diff --git a/docs/source/examples/md/static_gat_examples.md b/docs/source/examples/md/static_gat_examples.md
index 6be646d3235d050d0383e8d8e74a85ee5b36ffa4..7659be190179e88c4739181f7e6b0dbaa9899900 100644
--- a/docs/source/examples/md/static_gat_examples.md
+++ b/docs/source/examples/md/static_gat_examples.md
@@ -19,11 +19,11 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)| examples/gat | Improvement |
-| --- | --- | --- |---| --- | --- |
-| Cora | ~83% | 0.0145s | 0.0119s | 0.0175s | 1.47x |
-| Pubmed | ~78% | 0.0352s | 0.0193s |0.0295s | 1.53x |
-| Citeseer | ~70% | 0.0148s | 0.0124s |0.0253s | 2.04x |
+| Dataset | Accuracy | epoch time | examples/gat | Improvement |
+| --- | --- | --- | --- | --- |
+| Cora | ~83% | 0.0119s | 0.0175s | 1.47x |
+| Pubmed | ~78% | 0.0193s |0.0295s | 1.53x |
+| Citeseer | ~70% | 0.0124s |0.0253s | 2.04x |
### How to run
diff --git a/docs/source/examples/md/static_gcn_examples.md b/docs/source/examples/md/static_gcn_examples.md
index d9ad8b3cc31a725eb2c8b48d85794446d12f794a..6b0997dcf587ad398e41c37b720275d1edbeb236 100644
--- a/docs/source/examples/md/static_gcn_examples.md
+++ b/docs/source/examples/md/static_gcn_examples.md
@@ -10,7 +10,7 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
@@ -18,11 +18,11 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)| examples/gcn | Improvement |
+| Dataset | Accuracy | epoch time | examples/gcn | Improvement |
-| --- | --- | --- |---| --- | --- |
+| --- | --- | --- | --- | --- |
-| Cora | ~81% | 0.0053s | 0.0047s | 0.0104s | 2.21x |
-| Pubmed | ~79% | 0.0105s | 0.0049s |0.0154s | 3.14x |
-| Citeseer | ~71% | 0.0051s | 0.0045s |0.0177s | 3.93x |
+| Cora | ~81% | 0.0047s | 0.0104s | 2.21x |
+| Pubmed | ~79% | 0.0049s |0.0154s | 3.14x |
+| Citeseer | ~71% | 0.0045s |0.0177s | 3.93x |
diff --git a/docs/source/instruction.rst b/docs/source/instruction.rst
index 6a338516e55c97567e801f4cc8c15252f7fb2685..9ece98772ca31b4fe428e916d733da441f82459b 100644
--- a/docs/source/instruction.rst
+++ b/docs/source/instruction.rst
@@ -8,8 +8,7 @@ To install Paddle Graph Learning, we need the following packages.
.. code-block:: sh
- paddlepaddle >= 1.4 (Faster performance on 1.5)
- networkx
+ paddlepaddle >= 1.6
cython
We can simply install pgl by pip.
diff --git a/docs/source/md/introduction.md b/docs/source/md/introduction.md
index 6dd06fa1f461d4075b250d7c79b5006f9fe1cd9f..bd42565b410e84661cf18731645dbc131102085f 100644
--- a/docs/source/md/introduction.md
+++ b/docs/source/md/introduction.md
@@ -35,8 +35,8 @@ Users only need to call the ```sequence_ops``` functions provided by Paddle to e
return fluid.layers.sequence_pool(msg, "sum")
```
+Although DGL does some kernel fusion optimization for general sum, max and other aggregate functions with scatter-gather, for **complex user-defined functions** with the degree bucketing algorithm, the serial execution over each degree bucket cannot take full advantage of the performance improvement provided by GPUs. However, operations on the PGL LodTensor-based message are performed in parallel, which can fully utilize GPU parallel optimization. Even without scatter-gather optimization, PGL still has excellent performance. Of course, we still provide built-in scatter-optimized message aggregation functions.
-Although DGL does some kernel fusion optimization for general sum, max and other aggregate functions with scatter-gather. For **complex user-defined functions** with degree bucketing algorithm, the serial execution for each degree bucket cannot take full advantage of the performance improvement provided by GPU. However, operations on the PGL LodTensor-based message is performed in parallel, which can fully utilize GPU parallel optimization. In our experiments, PGL can reach up to 13 times the speed of DGL with complex user-defined functions. Even without scatter-gather optimization, PGL still has excellent performance. Of course, we still provide build-in scatter-optimized message aggregation functions.
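+
+As a minimal sketch (assuming the same reduce-function interface as the sum example above; the helper name ```graphsage_max``` is only for illustration), switching to another built-in aggregation is a one-line ```sequence_ops``` change:
+
+```python
+def graphsage_max(msg):
+    # same interface as the sum reducer above; executed in parallel over the message LodTensor
+    return fluid.layers.sequence_pool(msg, "max")
+```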
## Performance
@@ -50,11 +50,3 @@ We test all the GNN algorithms with Tesla V100-SXM2-16G running for 200 epochs t
| Pubmed | GAT | 77% |0.0193s|**0.0144s**|
| Citeseer | GCN |70.2%| **0.0045** |0.0046s|
| Citeseer | GAT |68.8%| **0.0124s** |0.0139s|
-
-If we use complex user-defined aggregation like [GraphSAGE-LSTM](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) that aggregates neighbor features with LSTM ignoring the order of recieved messages, the optimized message-passing in DGL will be forced to degenerate into degree bucketing scheme. The speed performance will be much slower than the one implemented in PGL. Performances may be various with different scale of the graph, in our experiments, PGL can reach up to 13 times the speed of DGL.
-
-| Dataset | PGL speed (epoch time) | DGL 0.3.0 speed (epoch time) | Speed up|
-| -------- | ------------ | ------------------------------------ |----|
-| Cora | **0.0186s** | 0.1638s | 8.80x|
-| Pubmed | **0.0388s** |0.5275s | 13.59x|
-| Citeseer | **0.0150s** | 0.1278s | 8.52x |
diff --git a/docs/source/md/quick_start.md b/docs/source/md/quick_start.md
index 9356362127e800b53e5dca9258e251eab3868ea6..0a30c39e822cb700b84f4df37df54d7de2e80b1a 100644
--- a/docs/source/md/quick_start.md
+++ b/docs/source/md/quick_start.md
@@ -95,7 +95,7 @@ After defining the GCN layer, we can construct a deeper GCN model with two GCN l
```python
output = gcn_layer(gw, gw.node_feat['feature'],
hidden_size=8, name='gcn_layer_1', activation='relu')
-output = gcn_layer(gw, output, hidden_size=1,
+output = gcn_layer(gw, output, hidden_size=2,
name='gcn_layer_2', activation=None)
```
diff --git a/examples/dgi/README.md b/examples/dgi/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3de5252e1ceb5482013cc8de224cc4080a5fab58
--- /dev/null
+++ b/examples/dgi/README.md
@@ -0,0 +1,36 @@
+# PGL Examples for DGI
+
+[Deep Graph Infomax \(DGI\)](https://arxiv.org/abs/1809.10341) is a general approach for learning node representations within graph-structured data in an unsupervised manner. DGI relies on maximizing mutual information between patch representations and corresponding high-level summaries of graphs---both derived using established graph convolutional network architectures.
+
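+Conceptually (a minimal numpy sketch for intuition only; the actual model and training code are in dgi.py in this example), node embeddings from the real graph should score high against a summary vector of the whole graph, while embeddings from a corrupted graph should score low:
+
+```python
+import numpy as np
+
+H = np.random.rand(5, 8)                     # node embeddings from the real graph
+H_neg = np.random.rand(5, 8)                 # embeddings from a corrupted (feature-shuffled) graph
+s = 1.0 / (1.0 + np.exp(-H.mean(0)))         # readout: sigmoid of the mean node embedding
+pos_logits, neg_logits = H @ s, H_neg @ s    # discriminator scores (identity weight for brevity)
+# binary cross-entropy: positives labeled 1, negatives labeled 0
+loss = np.mean(np.logaddexp(0, -pos_logits)) + np.mean(np.logaddexp(0, neg_logits))
+```
+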
+### Datasets
+
+The datasets contain three citation networks: CORA, PUBMED, CITESEER. The details for these three datasets can be found in the [paper](https://arxiv.org/abs/1609.02907).
+
+### Dependencies
+
+- paddlepaddle>=1.6
+- pgl
+
+### Performance
+
+We use DGI to pretrain an embedding for each node. Then we freeze the embeddings and train a node classifier on top of them.
+
+| Dataset | Accuracy |
+| --- | --- |
+| Cora | ~81% |
+| Pubmed | ~77.6% |
+| Citeseer | ~71.3% |
+
+
+### How to run
+
+For example, to pretrain with DGI and then train the node classifier on the cora dataset with a GPU:
+```
+python dgi.py --dataset cora --use_cuda
+python train.py --dataset cora --use_cuda
+```
+
+#### Hyperparameters
+
+- dataset: The citation dataset: "cora", "citeseer", or "pubmed".
+- use_cuda: Use GPU if --use_cuda is specified.
diff --git a/examples/dgi/dgi.py b/examples/dgi/dgi.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f8706a940ea93367e025fa6beeea3f97271e47f
--- /dev/null
+++ b/examples/dgi/dgi.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ DGI Pretrain
+"""
+import os
+import pgl
+from pgl import data_loader
+from pgl.utils.logger import log
+import paddle.fluid as fluid
+import numpy as np
+import time
+import argparse
+
+
+def load(name):
+ """Load dataset"""
+ if name == 'cora':
+ dataset = data_loader.CoraDataset()
+ elif name == "pubmed":
+ dataset = data_loader.CitationDataset("pubmed", symmetry_edges=False)
+ elif name == "citeseer":
+ dataset = data_loader.CitationDataset("citeseer", symmetry_edges=False)
+ else:
+ raise ValueError(name + " dataset doesn't exist")
+ return dataset
+
+
+def save_param(dirname, var_name_list):
+ """save_param"""
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ for var_name in var_name_list:
+ var = fluid.global_scope().find_var(var_name)
+ var_tensor = var.get_tensor()
+ np.save(os.path.join(dirname, var_name + '.npy'), np.array(var_tensor))
+
+
+def main(args):
+ """main"""
+ dataset = load(args.dataset)
+
+ # normalize
+ indegree = dataset.graph.indegree()
+ norm = np.zeros_like(indegree, dtype="float32")
+ norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
+ dataset.graph.node_feat["norm"] = np.expand_dims(norm, -1)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ hidden_size = 512
+
+ with fluid.program_guard(train_program, startup_program):
+ pos_gw = pgl.graph_wrapper.GraphWrapper(
+ name="pos_graph",
+ place=place,
+ node_feat=dataset.graph.node_feat_info())
+
+ neg_gw = pgl.graph_wrapper.GraphWrapper(
+ name="neg_graph",
+ place=place,
+ node_feat=dataset.graph.node_feat_info())
+
+ positive_feat = pgl.layers.gcn(pos_gw,
+ pos_gw.node_feat["words"],
+ hidden_size,
+ activation="relu",
+ norm=pos_gw.node_feat['norm'],
+ name="gcn_layer_1")
+
+ negative_feat = pgl.layers.gcn(neg_gw,
+ neg_gw.node_feat["words"],
+ hidden_size,
+ activation="relu",
+ norm=neg_gw.node_feat['norm'],
+ name="gcn_layer_1")
+
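+ # Readout: the graph-level summary vector is the sigmoid of the mean node embedding of the positive graph.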
+ summary_feat = fluid.layers.sigmoid(
+ fluid.layers.reduce_mean(
+ positive_feat, [0], keep_dim=True))
+
+ summary_feat = fluid.layers.fc(summary_feat,
+ hidden_size,
+ bias_attr=False,
+ name="discriminator")
+ pos_logits = fluid.layers.matmul(
+ positive_feat, summary_feat, transpose_y=True)
+ neg_logits = fluid.layers.matmul(
+ negative_feat, summary_feat, transpose_y=True)
+ pos_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+ x=pos_logits,
+ label=fluid.layers.ones(
+ shape=[dataset.graph.num_nodes, 1], dtype="float32"))
+ neg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+ x=neg_logits,
+ label=fluid.layers.zeros(
+ shape=[dataset.graph.num_nodes, 1], dtype="float32"))
+ loss = fluid.layers.reduce_mean(pos_loss) + fluid.layers.reduce_mean(
+ neg_loss)
+
+ adam = fluid.optimizer.Adam(learning_rate=1e-3)
+ adam.minimize(loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ best_loss = 1e9
+ dur = []
+
+ for epoch in range(args.epoch):
+ feed_dict = pos_gw.to_feed(dataset.graph)
+ node_feat = dataset.graph.node_feat["words"].copy()
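+ # Corruption: shuffle node features across nodes to build the negative (corrupted) graph fed to neg_gw.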
+ perm = np.arange(0, dataset.graph.num_nodes)
+ np.random.shuffle(perm)
+
+ dataset.graph.node_feat["words"] = dataset.graph.node_feat["words"][
+ perm]
+
+ feed_dict.update(neg_gw.to_feed(dataset.graph))
+ dataset.graph.node_feat["words"] = node_feat
+ if epoch >= 3:
+ t0 = time.time()
+ train_loss = exe.run(train_program,
+ feed=feed_dict,
+ fetch_list=[loss],
+ return_numpy=True)
+ if train_loss[0] < best_loss:
+ best_loss = train_loss[0]
+ save_param(args.checkpoint, ["gcn_layer_1", "gcn_layer_1_bias"])
+
+ if epoch >= 3:
+ time_per_epoch = 1.0 * (time.time() - t0)
+ dur.append(time_per_epoch)
+
+ log.info("Epoch %d " % epoch + "(%.5lf sec) " % np.mean(dur) +
+ "Train Loss: %f " % train_loss[0])
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='DGI pretrain')
+ parser.add_argument(
+ "--dataset", type=str, default="cora", help="dataset (cora, pubmed)")
+ parser.add_argument(
+ "--checkpoint", type=str, default="best_model", help="checkpoint")
+ parser.add_argument(
+ "--epoch", type=int, default=200, help="pretrain epochs")
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/dgi/train.py b/examples/dgi/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..67742093486e9e391cea0d141d504580c2985df0
--- /dev/null
+++ b/examples/dgi/train.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Train
+"""
+import os
+import pgl
+from pgl import data_loader
+from pgl.utils.logger import log
+import paddle.fluid as fluid
+import numpy as np
+import time
+import argparse
+
+
+def load(name):
+ """Load"""
+ if name == 'cora':
+ dataset = data_loader.CoraDataset()
+ elif name == "pubmed":
+ dataset = data_loader.CitationDataset("pubmed", symmetry_edges=False)
+ elif name == "citeseer":
+ dataset = data_loader.CitationDataset("citeseer", symmetry_edges=False)
+ else:
+ raise ValueError(name + " dataset doesn't exist")
+ return dataset
+
+
+def load_param(dirname, var_name_list):
+ """load_param"""
+ for var_name in var_name_list:
+ var = fluid.global_scope().find_var(var_name)
+ var_tensor = var.get_tensor()
+ var_tmp = np.load(os.path.join(dirname, var_name + '.npy'))
+ var_tensor.set(var_tmp, fluid.CPUPlace())
+
+
+def main(args):
+ """main"""
+ dataset = load(args.dataset)
+
+ # normalize
+ indegree = dataset.graph.indegree()
+ norm = np.zeros_like(indegree, dtype="float32")
+ norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
+ dataset.graph.node_feat["norm"] = np.expand_dims(norm, -1)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ test_program = fluid.Program()
+ hidden_size = 512
+
+ with fluid.program_guard(train_program, startup_program):
+ gw = pgl.graph_wrapper.GraphWrapper(
+ name="graph",
+ place=place,
+ node_feat=dataset.graph.node_feat_info())
+
+ output = pgl.layers.gcn(gw,
+ gw.node_feat["words"],
+ hidden_size,
+ activation="relu",
+ norm=gw.node_feat['norm'],
+ name="gcn_layer_1")
+ output.stop_gradient = True
+ output = fluid.layers.fc(output,
+ dataset.num_classes,
+ act=None,
+ name="classifier")
+ node_index = fluid.layers.data(
+ "node_index",
+ shape=[None, 1],
+ dtype="int64",
+ append_batch_size=False)
+ node_label = fluid.layers.data(
+ "node_label",
+ shape=[None, 1],
+ dtype="int64",
+ append_batch_size=False)
+
+ pred = fluid.layers.gather(output, node_index)
+ loss, pred = fluid.layers.softmax_with_cross_entropy(
+ logits=pred, label=node_label, return_softmax=True)
+ acc = fluid.layers.accuracy(input=pred, label=node_label, k=1)
+ loss = fluid.layers.mean(loss)
+
+ test_program = train_program.clone(for_test=True)
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=1e-2)
+ adam.minimize(loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ load_param(args.checkpoint, ["gcn_layer_1", "gcn_layer_1_bias"])
+ feed_dict = gw.to_feed(dataset.graph)
+
+ train_index = dataset.train_index
+ train_label = np.expand_dims(dataset.y[train_index], -1)
+ train_index = np.expand_dims(train_index, -1)
+
+ val_index = dataset.val_index
+ val_label = np.expand_dims(dataset.y[val_index], -1)
+ val_index = np.expand_dims(val_index, -1)
+
+ test_index = dataset.test_index
+ test_label = np.expand_dims(dataset.y[test_index], -1)
+ test_index = np.expand_dims(test_index, -1)
+
+ dur = []
+ for epoch in range(200):
+ if epoch >= 3:
+ t0 = time.time()
+ feed_dict["node_index"] = np.array(train_index, dtype="int64")
+ feed_dict["node_label"] = np.array(train_label, dtype="int64")
+ train_loss, train_acc = exe.run(train_program,
+ feed=feed_dict,
+ fetch_list=[loss, acc],
+ return_numpy=True)
+
+ if epoch >= 3:
+ time_per_epoch = 1.0 * (time.time() - t0)
+ dur.append(time_per_epoch)
+ feed_dict["node_index"] = np.array(val_index, dtype="int64")
+ feed_dict["node_label"] = np.array(val_label, dtype="int64")
+ val_loss, val_acc = exe.run(test_program,
+ feed=feed_dict,
+ fetch_list=[loss, acc],
+ return_numpy=True)
+
+ log.info("Epoch %d " % epoch + "(%.5lf sec) " % np.mean(dur) +
+ "Train Loss: %f " % train_loss + "Train Acc: %f " % train_acc
+ + "Val Loss: %f " % val_loss + "Val Acc: %f " % val_acc)
+
+ feed_dict["node_index"] = np.array(test_index, dtype="int64")
+ feed_dict["node_label"] = np.array(test_label, dtype="int64")
+ test_loss, test_acc = exe.run(test_program,
+ feed=feed_dict,
+ fetch_list=[loss, acc],
+ return_numpy=True)
+ log.info("Accuracy: %f" % test_acc)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='GCN')
+ parser.add_argument(
+ "--dataset", type=str, default="cora", help="dataset (cora, pubmed)")
+ parser.add_argument(
+ "--checkpoint", type=str, default="best_model", help="checkpoint")
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/distribute_deepwalk/README.md b/examples/distribute_deepwalk/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..24775aaae8bfbae11c69a68fa6eb49c9863a0ae3
--- /dev/null
+++ b/examples/distribute_deepwalk/README.md
@@ -0,0 +1,31 @@
+# PGL Examples for distributed deepwalk
+[Deepwalk](https://arxiv.org/pdf/1403.6652.pdf) is an algorithmic framework for representational learning on graphs. Given any graph, it can learn continuous feature representations for the nodes, which can then be used for various downstream machine learning tasks. Based on PGL, we reproduce the deepwalk algorithm in distributed mode and match the metrics reported in the paper.
+## Datasets
+The dataset is the [BlogCatalog](http://socialcomputing.asu.edu/datasets/BlogCatalog3) social network.
+## Dependencies
+- paddlepaddle>=1.6
+- pgl>=1.0
+
+## How to run
+
+For example, to train deepwalk in distributed mode on the BlogCatalog dataset:
+```sh
+# train deepwalk in distributed mode.
+sh cloud_run.sh
+
+# multiclass task example
+python3 multi_class.py --use_cuda --ckpt_path ./model_path/4029 --epoch 1000
+
+```
+
+## Hyperparameters
+- dataset: The dataset name; currently "BlogCatalog".
+- hidden_size: Hidden size of the embedding.
+- lr: Learning rate.
+- neg_num: Number of negative samples.
+- epoch: Number of training epochs.
+
+### Experiment results
+Dataset|model|Task|Metric|PGL Result|Reported Result
+--|--|--|--|--|--
+BlogCatalog|distributed deepwalk|multi-label classification|MacroF1|0.233|0.211
diff --git a/examples/distribute_deepwalk/cloud_run.sh b/examples/distribute_deepwalk/cloud_run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4d400849ec10cb51aa0a813f4756b02905246800
--- /dev/null
+++ b/examples/distribute_deepwalk/cloud_run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -x
+source ./pgl_deepwalk.cfg
+source ./local_config
+
+unset http_proxy https_proxy
+
+# build train_data
+trainer_num=`echo $PADDLE_PORT | awk -F',' '{print NF}'`
+rm -rf train_data && mkdir -p train_data
+cd train_data
+if [[ $build_train_data == True ]];then
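+ # shuffle all node ids and split them into roughly trainer_num * CPU_NUM chunks, one per reader process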
+ seq 0 $((num_nodes-1)) | shuf | split -l $((num_nodes/trainer_num/CPU_NUM+1))
+else
+ for i in `seq 1 $trainer_num`;do
+ touch $i
+ done
+fi
+cd -
+
+# mkdir workspace
+if [ -d ${BASE} ]; then
+ rm -rf ${BASE}
+fi
+mkdir ${BASE}
+
+# start ps
+for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
+do
+ echo "start ps server: ${i}"
+ echo $BASE
+ TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh &> $BASE/pserver.$i.log &
+done
+sleep 5s
+
+# start trainers
+for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
+do
+ echo "start ps work: ${j}"
+ TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh &> $BASE/worker.$j.log &
+done
diff --git a/examples/distribute_deepwalk/cluster_train.py b/examples/distribute_deepwalk/cluster_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa4c9ce122dccff538b5ab5dee3a45b4ae405df2
--- /dev/null
+++ b/examples/distribute_deepwalk/cluster_train.py
@@ -0,0 +1,348 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import time
+import os
+import math
+from multiprocessing import Process
+
+import numpy as np
+import paddle.fluid as F
+import paddle.fluid.layers as L
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from pgl.utils.logger import log
+from pgl import data_loader
+
+from reader import DeepwalkReader
+from model import DeepwalkModel
+from utils import get_file_list
+from utils import build_graph
+from utils import build_fake_graph
+from utils import build_gen_func
+
+
+def init_role():
+ # reset the place according to role of parameter server
+ training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+ paddle_role = role_maker.Role.WORKER
+ place = F.CPUPlace()
+ if training_role == "PSERVER":
+ paddle_role = role_maker.Role.SERVER
+
+ # set the fleet runtime environment according to configure
+ ports = os.getenv("PADDLE_PORT", "6174").split(",")
+ pserver_ips = os.getenv("PADDLE_PSERVERS").split(",") # ip,ip...
+ eplist = []
+ if len(ports) > 1:
+ # local debug mode, multi port
+ for port in ports:
+ eplist.append(':'.join([pserver_ips[0], port]))
+ else:
+ # distributed mode, multi ip
+ for ip in pserver_ips:
+ eplist.append(':'.join([ip, ports[0]]))
+
+ pserver_endpoints = eplist # ip:port,ip:port...
+ worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+ role = role_maker.UserDefinedRoleMaker(
+ current_id=trainer_id,
+ role=paddle_role,
+ worker_num=worker_num,
+ server_endpoints=pserver_endpoints)
+ fleet.init(role)
+
+
+def optimization(base_lr, loss, train_steps, optimizer='sgd'):
+ decayed_lr = L.learning_rate_scheduler.polynomial_decay(
+ learning_rate=base_lr,
+ decay_steps=train_steps,
+ end_learning_rate=0.0001 * base_lr,
+ power=1.0,
+ cycle=False)
+ if optimizer == 'sgd':
+ optimizer = F.optimizer.SGD(decayed_lr)
+ elif optimizer == 'adam':
+ optimizer = F.optimizer.Adam(decayed_lr, lazy_mode=True)
+ else:
+ raise ValueError
+
+ log.info('learning rate:%f' % (base_lr))
+ # create the DistributeTranspiler config
+ config = DistributeTranspilerConfig()
+ config.sync_mode = False
+ #config.runtime_split_send_recv = False
+
+ config.slice_var_up = False
+ #create the distributed optimizer
+ optimizer = fleet.distributed_optimizer(optimizer, config)
+ optimizer.minimize(loss)
+
+
+def build_complied_prog(train_program, model_loss):
+ num_threads = int(os.getenv("CPU_NUM", 10))
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
+ exec_strategy = F.ExecutionStrategy()
+ exec_strategy.num_threads = num_threads
+ #exec_strategy.use_experimental_executor = True
+ build_strategy = F.BuildStrategy()
+ build_strategy.enable_inplace = True
+ #build_strategy.memory_optimize = True
+ build_strategy.memory_optimize = False
+ build_strategy.remove_unnecessary_lock = False
+ if num_threads > 1:
+ build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
+
+ compiled_prog = F.compiler.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=model_loss.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+ return compiled_prog
+
+
+def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+ step = 0
+ while True:
+ try:
+ begin_time = time.time()
+ loss_val, = exe.run(program, fetch_list=[loss])
+ log.info("step %s: loss %.5f speed: %.5f s/step" %
+ (step, np.mean(loss_val), time.time() - begin_time))
+ step += 1
+ except F.core.EOFException:
+ node2vec_pyreader.reset()
+
+ if step % args.steps_per_save == 0 or step == train_steps:
+ if trainer_id == 0 or args.is_distributed:
+ model_save_dir = args.save_path
+ model_path = os.path.join(model_save_dir, str(step))
+ if not os.path.exists(model_save_dir):
+ os.makedirs(model_save_dir)
+ fleet.save_persistables(exe, model_path)
+
+ if step == train_steps:
+ break
+
+
+def test(args):
+ graph = build_graph(args.num_nodes, args.edge_path)
+ gen_func = build_gen_func(args, graph)
+
+ start = time.time()
+ num = 10
+ for idx, _ in enumerate(gen_func()):
+ if idx % num == num - 1:
+ log.info("%s" % (1.0 * (time.time() - start) / num))
+ start = time.time()
+
+
+def walk(args):
+ graph = build_graph(args.num_nodes, args.edge_path)
+ num_sample_workers = args.num_sample_workers
+
+ if args.train_files is None or args.train_files == "None":
+ log.info("Walking from graph...")
+ train_files = [None for _ in range(num_sample_workers)]
+ else:
+ log.info("Walking from train_data...")
+ files = get_file_list(args.train_files)
+ train_files = [[] for i in range(num_sample_workers)]
+ for idx, f in enumerate(files):
+ train_files[idx % num_sample_workers].append(f)
+
+ def walk_to_file(walk_gen, filename, max_num):
+ with open(filename, "w") as outf:
+ num = 0
+ for walks in walk_gen:
+ for walk in walks:
+ outf.write("%s\n" % "\t".join([str(i) for i in walk]))
+ num += 1
+ if num % 1000 == 0:
+ log.info("Total: %s, %s walkpath is saved. " %
+ (max_num, num))
+ if num == max_num:
+ return
+
+ m_args = [(DeepwalkReader(
+ graph,
+ batch_size=args.batch_size,
+ walk_len=args.walk_len,
+ win_size=args.win_size,
+ neg_num=args.neg_num,
+ neg_sample_type=args.neg_sample_type,
+ walkpath_files=None,
+ train_files=train_files[i]).walk_generator(),
+ "%s/%s" % (args.walkpath_files, i),
+ args.epoch * args.num_nodes // args.num_sample_workers)
+ for i in range(num_sample_workers)]
+ ps = []
+ for i in range(num_sample_workers):
+ p = Process(target=walk_to_file, args=m_args[i])
+ p.start()
+ ps.append(p)
+ for i in range(num_sample_workers):
+ ps[i].join()
+
+
+def train(args):
+ import logging
+ log.setLevel(logging.DEBUG)
+ log.info("start")
+
+ worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
+ num_devices = int(os.getenv("CPU_NUM", 10))
+
+ model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
+ args.is_sparse, args.is_distributed, 1.)
+ pyreader = model.pyreader
+ loss = model.forward()
+
+ # init fleet
+ init_role()
+
+ train_steps = math.ceil(1. * args.num_nodes * args.epoch /
+ args.batch_size / num_devices / worker_num)
+ log.info("Train step: %s" % train_steps)
+
+ if args.optimizer == "sgd":
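+ # For SGD, scale the base learning rate by the approximate number of training pairs a batch of walks produces (batch_size walks x walk_len centers x win_size window positions).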
+ args.lr *= args.batch_size * args.walk_len * args.win_size
+ optimization(args.lr, loss, train_steps, args.optimizer)
+
+ # init and run server or worker
+ if fleet.is_server():
+ fleet.init_server(args.warm_start_from_dir)
+ fleet.run_server()
+
+ if fleet.is_worker():
+ log.info("start init worker done")
+ fleet.init_worker()
+ #just the worker, load the sample
+ log.info("init worker done")
+
+ exe = F.Executor(F.CPUPlace())
+ exe.run(fleet.startup_program)
+ log.info("Startup done")
+
+ if args.dataset is not None:
+ if args.dataset == "BlogCatalog":
+ graph = data_loader.BlogCatalogDataset().graph
+ elif args.dataset == "ArXiv":
+ graph = data_loader.ArXivDataset().graph
+ else:
+ raise ValueError(args.dataset + " dataset doesn't exist")
+ log.info("Load built-in dataset %s done." % args.dataset)
+ elif args.walkpath_files is None or args.walkpath_files == "None":
+ graph = build_graph(args.num_nodes, args.edge_path)
+ log.info("Load graph from '%s' done." % args.edge_path)
+ else:
+ graph = build_fake_graph(args.num_nodes)
+ log.info("Load fake graph done.")
+
+ # bind gen
+ gen_func = build_gen_func(args, graph)
+
+ pyreader.decorate_tensor_provider(gen_func)
+ pyreader.start()
+
+ compiled_prog = build_complied_prog(fleet.main_program, loss)
+ train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
+
+
+if __name__ == '__main__':
+
+ def str2bool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
+ return True
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+ return False
+ else:
+ raise argparse.ArgumentTypeError('Boolean value expected.')
+
+ parser = argparse.ArgumentParser(description='Deepwalk')
+ parser.add_argument(
+ "--hidden_size",
+ type=int,
+ default=64,
+ help="Hidden size of the embedding.")
+ parser.add_argument(
+ "--lr", type=float, default=0.025, help="Learning rate.")
+ parser.add_argument(
+ "--neg_num", type=int, default=5, help="Number of negative samples.")
+ parser.add_argument(
+ "--epoch", type=int, default=1, help="Number of training epoch.")
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=128,
+ help="Numbert of walk paths in a batch.")
+ parser.add_argument(
+ "--walk_len", type=int, default=40, help="Length of a walk path.")
+ parser.add_argument(
+ "--win_size", type=int, default=5, help="Window size in skip-gram.")
+ parser.add_argument(
+ "--save_path",
+ type=str,
+ default="model_path",
+ help="Output path for saving model.")
+ parser.add_argument(
+ "--num_sample_workers",
+ type=int,
+ default=1,
+ help="Number of sampling workers.")
+ parser.add_argument(
+ "--steps_per_save",
+ type=int,
+ default=3000,
+ help="Steps for model saveing.")
+ parser.add_argument(
+ "--num_nodes",
+ type=int,
+ default=10000,
+ help="Number of nodes in graph.")
+ parser.add_argument("--edge_path", type=str, default="./graph_data")
+ parser.add_argument("--train_files", type=str, default=None)
+ parser.add_argument("--walkpath_files", type=str, default=None)
+ parser.add_argument("--is_distributed", type=str2bool, default=False)
+ parser.add_argument("--is_sparse", type=str2bool, default=True)
+ parser.add_argument("--warm_start_from_dir", type=str, default=None)
+ parser.add_argument("--dataset", type=str, default=None)
+ parser.add_argument(
+ "--neg_sample_type",
+ type=str,
+ default="average",
+ choices=["average", "outdegree"])
+ parser.add_argument(
+ "--mode",
+ type=str,
+ required=False,
+ choices=['train', 'walk'],
+ default="train")
+ parser.add_argument(
+ "--optimizer",
+ type=str,
+ required=False,
+ choices=['adam', 'sgd'],
+ default="sgd")
+ args = parser.parse_args()
+ log.info(args)
+ if args.mode == "train":
+ train(args)
+ elif args.mode == "walk":
+ walk(args)
diff --git a/examples/distribute_deepwalk/gpu_train.py b/examples/distribute_deepwalk/gpu_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6c4d9853c48e0a8f3083e4b813dc7057317e7cc
--- /dev/null
+++ b/examples/distribute_deepwalk/gpu_train.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import time
+import os
+
+import numpy as np
+import paddle.fluid as F
+import paddle.fluid.layers as L
+from pgl.utils.logger import log
+
+from model import DeepwalkModel
+from utils import build_graph
+from utils import build_gen_func
+
+
+def optimization(base_lr, loss, train_steps, optimizer='adam'):
+ decayed_lr = L.polynomial_decay(base_lr, train_steps, 0.0001)
+
+ if optimizer == 'sgd':
+ optimizer = F.optimizer.SGD(
+ decayed_lr,
+ regularization=F.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0025))
+ elif optimizer == 'adam':
+ # dont use gpu's lazy mode
+ optimizer = F.optimizer.Adam(decayed_lr)
+ else:
+ raise ValueError
+
+ log.info('learning rate:%f' % (base_lr))
+ optimizer.minimize(loss)
+
+
+def get_parallel_exe(program, loss):
+ exec_strategy = F.ExecutionStrategy()
+ exec_strategy.num_threads = 1 # 2 for fp32, 4 for fp16
+ exec_strategy.use_experimental_executor = True
+ exec_strategy.num_iteration_per_drop_scope = 1 # drop the local scope every iteration to keep memory usage low
+
+ build_strategy = F.BuildStrategy()
+ build_strategy.enable_inplace = True
+ build_strategy.memory_optimize = True
+ build_strategy.remove_unnecessary_lock = True
+
+ #return compiled_prog
+ train_exe = F.ParallelExecutor(
+ use_cuda=True,
+ loss_name=loss.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy,
+ main_program=program)
+ return train_exe
+
+
+def train(train_exe, exe, program, loss, node2vec_pyreader, args, train_steps):
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+ step = 0
+ while True:
+ try:
+ begin_time = time.time()
+ loss_val, = train_exe.run(fetch_list=[loss])
+ log.info("step %s: loss %.5f speed: %.5f s/step" %
+ (step, np.mean(loss_val), time.time() - begin_time))
+ step += 1
+ except F.core.EOFException:
+ node2vec_pyreader.reset()
+
+ if (step == train_steps or
+ step % args.steps_per_save == 0) and trainer_id == 0:
+
+ model_save_dir = args.output_path
+ model_path = os.path.join(model_save_dir, str(step))
+ if not os.path.exists(model_save_dir):
+ os.makedirs(model_save_dir)
+ F.io.save_params(exe, model_path, program)
+ if step == train_steps:
+ break
+
+
+def main(args):
+ import logging
+ log.setLevel(logging.DEBUG)
+ log.info("start")
+
+ num_devices = len(F.cuda_places())
+ model = DeepwalkModel(args.num_nodes, args.hidden_size, args.neg_num,
+ False, False, 1.)
+ pyreader = model.pyreader
+ loss = model.forward()
+
+ train_steps = int(args.num_nodes * args.epoch / args.batch_size /
+ num_devices)
+ optimization(args.lr * num_devices, loss, train_steps, args.optimizer)
+
+ place = F.CUDAPlace(0)
+ exe = F.Executor(place)
+ exe.run(F.default_startup_program())
+
+ graph = build_graph(args.num_nodes, args.edge_path)
+ gen_func = build_gen_func(args, graph)
+
+ pyreader.decorate_tensor_provider(gen_func)
+ pyreader.start()
+
+ train_prog = F.default_main_program()
+
+ if args.warm_start_from_dir is not None:
+ F.io.load_params(exe, args.warm_start_from_dir, train_prog)
+
+ train_exe = get_parallel_exe(train_prog, loss)
+ train(train_exe, exe, train_prog, loss, pyreader, args, train_steps)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Deepwalk')
+ parser.add_argument("--hidden_size", type=int, default=64)
+ parser.add_argument("--lr", type=float, default=0.025)
+ parser.add_argument("--neg_num", type=int, default=5)
+ parser.add_argument("--epoch", type=int, default=100)
+ parser.add_argument("--batch_size", type=int, default=128)
+ parser.add_argument("--walk_len", type=int, default=40)
+ parser.add_argument("--win_size", type=int, default=5)
+ parser.add_argument("--output_path", type=str, default="output")
+ parser.add_argument("--num_sample_workers", type=int, default=1)
+ parser.add_argument("--steps_per_save", type=int, default=3000)
+ parser.add_argument("--num_nodes", type=int, default=10000)
+ parser.add_argument("--edge_path", type=str, default="./graph_data")
+ parser.add_argument("--walkpath_files", type=str, default=None)
+ parser.add_argument("--train_files", type=str, default="./train_data")
+ parser.add_argument("--warm_start_from_dir", type=str, default=None)
+ parser.add_argument(
+ "--neg_sample_type",
+ type=str,
+ default="average",
+ choices=["average", "outdegree"])
+ parser.add_argument(
+ "--optimizer",
+ type=str,
+ required=False,
+ choices=['adam', 'sgd'],
+ default="adam")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/distribute_deepwalk/job.sh b/examples/distribute_deepwalk/job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e0a0ba5d914e778cfd88d4947d3628f181518fc3
--- /dev/null
+++ b/examples/distribute_deepwalk/job.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -x
+source ./pgl_deepwalk.cfg
+
+export CPU_NUM=$CPU_NUM
+export FLAGS_rpc_deadline=3000000
+export FLAGS_communicator_send_queue_size=1
+export FLAGS_communicator_min_send_grad_num_before_recv=0
+export FLAGS_communicator_max_merge_var_num=1
+export FLAGS_communicator_merge_sparse_grad=1
+
+if [[ $build_train_data == True ]];then
+ train_files="./train_data"
+else
+ train_files="None"
+fi
+
+if [[ $pre_walk == True ]]; then
+ walkpath_files="./walk_path"
+ if [[ $TRAINING_ROLE == "PSERVER" ]];then
+ while [[ ! -d train_data ]];do
+ sleep 60
+ echo "Waiting for train_data ..."
+ done
+ rm -rf $walkpath_files && mkdir -p $walkpath_files
+ python -u cluster_train.py --num_sample_workers $num_sample_workers --num_nodes $num_nodes --mode walk --walkpath_files $walkpath_files --epoch $epoch \
+ --walk_len $walk_len --batch_size $batch_size --train_files $train_files --dataset "BlogCatalog"
+ touch build_graph_done
+ fi
+
+ while [[ ! -f build_graph_done ]];do
+ sleep 60
+ echo "Waiting for walk_path ..."
+ done
+else
+ walkpath_files="None"
+fi
+
+python -u cluster_train.py --num_sample_workers $num_sample_workers --num_nodes $num_nodes --optimizer $optimizer --walkpath_files $walkpath_files --epoch $epoch \
+ --is_distributed $distributed_embedding --lr $learning_rate --neg_num $neg_num --walk_len $walk_len --win_size $win_size --is_sparse $is_sparse --hidden_size $dim \
+ --batch_size $batch_size --steps_per_save $steps_per_save --train_files $train_files --dataset "BlogCatalog"
diff --git a/examples/distribute_deepwalk/local_config b/examples/distribute_deepwalk/local_config
new file mode 100644
index 0000000000000000000000000000000000000000..455a34228c6f1af38e01e149d7963c19a4dde3c7
--- /dev/null
+++ b/examples/distribute_deepwalk/local_config
@@ -0,0 +1,7 @@
+#!/bin/bash
+export PADDLE_TRAINERS_NUM=2
+export PADDLE_PSERVERS_NUM=2
+export PADDLE_PORT=6184,6185
+export PADDLE_PSERVERS="127.0.0.1"
+export BASE="./local_dir"
+
diff --git a/examples/distribute_deepwalk/model.py b/examples/distribute_deepwalk/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3740ff2848b73a2def81f6ec7a8b2f96d029e520
--- /dev/null
+++ b/examples/distribute_deepwalk/model.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Deepwalk model file.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+import math
+
+import paddle.fluid.layers as L
+import paddle.fluid as F
+
+
+def split_embedding(input,
+ dict_size,
+ hidden_size,
+ initializer,
+ name,
+ num_part=16,
+ is_sparse=False,
+ learning_rate=1.0):
+ """ split_embedding
+ """
+ _part_size = hidden_size // num_part
+ if hidden_size % num_part != 0:
+ _part_size += 1
+ output_embedding = []
+ p_num = 0
+ while hidden_size > 0:
+ _part_size = min(_part_size, hidden_size)
+ hidden_size -= _part_size
+ print("part", p_num, "size=", (dict_size, _part_size))
+ part_embedding = L.embedding(
+ input=input,
+ size=(dict_size, _part_size),
+ is_sparse=is_sparse,
+ is_distributed=False,
+ param_attr=F.ParamAttr(
+ name=name + '_part%s' % p_num,
+ initializer=initializer,
+ learning_rate=learning_rate))
+ p_num += 1
+ output_embedding.append(part_embedding)
+ return L.concat(output_embedding, -1)
+
+
+class DeepwalkModel(object):
+ def __init__(self,
+ num_nodes,
+ hidden_size=16,
+ neg_num=5,
+ is_sparse=False,
+ is_distributed=False,
+ embedding_lr=1.0):
+ self.pyreader = L.py_reader(
+ capacity=70,
+ shapes=[[-1, 1, 1], [-1, neg_num + 1, 1]],
+ dtypes=['int64', 'int64'],
+ lod_levels=[0, 0],
+ name='train',
+ use_double_buffer=True)
+
+ self.num_nodes = num_nodes
+ self.neg_num = neg_num
+
+ self.embed_init = F.initializer.Uniform(
+ low=-1. / math.sqrt(hidden_size), high=1. / math.sqrt(hidden_size))
+ self.is_sparse = is_sparse
+ self.is_distributed = is_distributed
+ self.hidden_size = hidden_size
+ self.loss = None
+ self.embedding_lr = embedding_lr
+ max_hidden_size = int(math.pow(2, 31) / 4 / num_nodes)
+ self.num_part = int(math.ceil(1. * hidden_size / max_hidden_size))
+
+ def forward(self):
+ src, dsts = L.read_file(self.pyreader)
+
+ if self.is_sparse:
+ # sparse mode use 2 dims input.
+ src = L.reshape(src, [-1, 1])
+ dsts = L.reshape(dsts, [-1, 1])
+
+ if self.num_part is not None and self.num_part != 1 and not self.is_distributed:
+ src_embed = split_embedding(
+ src,
+ self.num_nodes,
+ self.hidden_size,
+ self.embed_init,
+ "weight",
+ self.num_part,
+ self.is_sparse,
+ learning_rate=self.embedding_lr)
+
+ dsts_embed = split_embedding(
+ dsts,
+ self.num_nodes,
+ self.hidden_size,
+ self.embed_init,
+ "weight",
+ self.num_part,
+ self.is_sparse,
+ learning_rate=self.embedding_lr)
+ else:
+ src_embed = L.embedding(
+ src, (self.num_nodes, self.hidden_size),
+ self.is_sparse,
+ self.is_distributed,
+ param_attr=F.ParamAttr(
+ name="weight",
+ learning_rate=self.embedding_lr,
+ initializer=self.embed_init))
+
+ dsts_embed = L.embedding(
+ dsts, (self.num_nodes, self.hidden_size),
+ self.is_sparse,
+ self.is_distributed,
+ param_attr=F.ParamAttr(
+ name="weight",
+ learning_rate=self.embedding_lr,
+ initializer=self.embed_init))
+
+ if self.is_sparse:
+ # reshape back
+ src_embed = L.reshape(src_embed, [-1, 1, self.hidden_size])
+ dsts_embed = L.reshape(dsts_embed,
+ [-1, self.neg_num + 1, self.hidden_size])
+
+ logits = L.matmul(
+ src_embed, dsts_embed,
+ transpose_y=True) # [batch_size, 1, neg_num+1]
+
+ pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", 1)
+ neg_label = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 0)
+ label = L.concat([pos_label, neg_label], -1)
+
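+ # Weight the single positive column by neg_num so positives and negatives contribute equally; the (neg_num + 1) / (2 * neg_num) factor below renormalizes the weighted mean.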
+ pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", self.neg_num)
+ neg_weight = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 1)
+ weight = L.concat([pos_weight, neg_weight], -1)
+
+ weight.stop_gradient = True
+ label.stop_gradient = True
+
+ loss = L.sigmoid_cross_entropy_with_logits(logits, label)
+ loss = loss * weight
+ loss = L.reduce_mean(loss)
+ loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
+ loss.persistable = True
+ self.loss = loss
+ return loss
diff --git a/examples/distribute_deepwalk/mp_reader.py b/examples/distribute_deepwalk/mp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4df8998cc4ad672627dc7ba9f846c91ccca0bba
--- /dev/null
+++ b/examples/distribute_deepwalk/mp_reader.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimized Multiprocessing Reader for PaddlePaddle
+"""
+import multiprocessing
+import numpy as np
+import time
+
+import paddle.fluid as fluid
+import pyarrow
+
+
+def _serialize_serializable(obj):
+ """Serialize Feed Dict
+ """
+ return {"type": type(obj), "data": obj.__dict__}
+
+
+def _deserialize_serializable(obj):
+ """Deserialize Feed Dict
+ """
+
+ val = obj["type"].__new__(obj["type"])
+ val.__dict__.update(obj["data"])
+ return val
+
+
+context = pyarrow.default_serialization_context()
+
+context.register_type(
+ object,
+ "object",
+ custom_serializer=_serialize_serializable,
+ custom_deserializer=_deserialize_serializable)
+
+
+def serialize_data(data):
+ """serialize_data"""
+ return pyarrow.serialize(data, context=context).to_buffer().to_pybytes()
+
+
+def deserialize_data(data):
+ """deserialize_data"""
+ return pyarrow.deserialize(data, context=context)
+
+
+def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
+ """
+ multiprocess_reader uses Python multiprocessing to read data from the given
+ readers in parallel and then merges all the data through a
+ multiprocessing.Queue or multiprocessing.Pipe. The number of processes equals
+ the number of input readers; each process runs one reader.
+ multiprocessing.Queue requires read/write access to /dev/shm, which some
+ platforms do not support.
+ You need to create the readers first; they should be independent of each
+ other so that each process can work on its own.
+ An example:
+ .. code-block:: python
+ reader0 = reader(["file01", "file02"])
+ reader1 = reader(["file11", "file12"])
+ reader2 = reader(["file21", "file22"])
+ reader = multiprocess_reader([reader0, reader1, reader2],
+ queue_size=100, use_pipe=False)
+ """
+
+ assert type(readers) is list and len(readers) > 0
+
+ def _read_into_queue(reader, queue):
+ """read_into_queue"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None")
+ queue.put(serialize_data(sample))
+ queue.put(serialize_data(None))
+
+ def queue_reader():
+ """queue_reader"""
+ queue = multiprocessing.Queue(queue_size)
+ for reader in readers:
+ p = multiprocessing.Process(
+ target=_read_into_queue, args=(reader, queue))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ while finish_num < reader_num:
+ sample = deserialize_data(queue.get())
+ if sample is None:
+ finish_num += 1
+ else:
+ yield sample
+
+ def _read_into_pipe(reader, conn):
+ """read_into_pipe"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None!")
+ conn.send(serialize_data(sample))
+ conn.send(serialize_data(None))
+ conn.close()
+
+ def pipe_reader():
+ """pipe_reader"""
+ conns = []
+ for reader in readers:
+ parent_conn, child_conn = multiprocessing.Pipe()
+ conns.append(parent_conn)
+ p = multiprocessing.Process(
+ target=_read_into_pipe, args=(reader, child_conn))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ conn_to_remove = []
+ finish_flag = np.zeros(len(conns), dtype="int32")
+ while finish_num < reader_num:
+ for conn_id, conn in enumerate(conns):
+ if finish_flag[conn_id] > 0:
+ continue
+ buff = conn.recv()
+ now = time.time()
+ sample = deserialize_data(buff)
+ out = time.time() - now
+ if sample is None:
+ finish_num += 1
+ conn.close()
+ finish_flag[conn_id] = 1
+ else:
+ yield sample
+
+ if use_pipe:
+ return pipe_reader
+ else:
+ return queue_reader
diff --git a/examples/distribute_deepwalk/multi_class.py b/examples/distribute_deepwalk/multi_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ffe69060b7af620ceb5aecdd2f06a33768b8d3d
--- /dev/null
+++ b/examples/distribute_deepwalk/multi_class.py
@@ -0,0 +1,244 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import time
+import math
+import os
+
+import numpy as np
+import sklearn.metrics
+from sklearn.metrics import f1_score
+
+import pgl
+from pgl import data_loader
+from pgl.utils import op
+from pgl.utils.logger import log
+import paddle.fluid as fluid
+import paddle.fluid.layers as l
+
+np.random.seed(123)
+
+
+def load(name):
+ if name == 'BlogCatalog':
+ dataset = data_loader.BlogCatalogDataset()
+ else:
+ raise ValueError(name + " dataset doesn't exist")
+ return dataset
+
+
+def node_classify_model(graph,
+ num_labels,
+ hidden_size=16,
+ name='node_classify_task'):
+ pyreader = l.py_reader(
+ capacity=70,
+ shapes=[[-1, 1], [-1, num_labels]],
+ dtypes=['int64', 'float32'],
+ lod_levels=[0, 0],
+ name=name + '_pyreader',
+ use_double_buffer=True)
+ nodes, labels = l.read_file(pyreader)
+ embed_nodes = l.embedding(
+ input=nodes,
+ size=[graph.num_nodes, hidden_size],
+ param_attr=fluid.ParamAttr(name='weight'))
+ embed_nodes.stop_gradient = True
+ logits = l.fc(input=embed_nodes, size=num_labels)
+ loss = l.sigmoid_cross_entropy_with_logits(logits, labels)
+ loss = l.reduce_mean(loss)
+ prob = l.sigmoid(logits)
+ topk = l.reduce_sum(labels, -1)
+ return pyreader, loss, prob, labels, topk
+
+
+def node_classify_generator(graph,
+ all_nodes=None,
+ batch_size=512,
+ epoch=1,
+ shuffle=True):
+
+ if all_nodes is None:
+ all_nodes = np.arange(graph.num_nodes)
+ #labels = (np.random.rand(512, 39) > 0.95).astype(np.float32)
+
+ def batch_nodes_generator(shuffle=shuffle):
+ perm = np.arange(len(all_nodes), dtype=np.int64)
+ if shuffle:
+ np.random.shuffle(perm)
+ start = 0
+ while start < len(all_nodes):
+ yield all_nodes[perm[start:start + batch_size]]
+ start += batch_size
+
+ def wrapper():
+ for _ in range(epoch):
+ for batch_nodes in batch_nodes_generator():
+ batch_nodes_expanded = np.expand_dims(batch_nodes,
+ -1).astype(np.int64)
+ batch_labels = graph.node_feat['group_id'][batch_nodes].astype(
+ np.float32)
+ yield [batch_nodes_expanded, batch_labels]
+
+ return wrapper
+
+
+def topk_f1_score(labels,
+ probs,
+ topk_list=None,
+ average="macro",
+ threshold=None):
+ assert topk_list is not None or threshold is not None, "at least one of topk_list and threshold must be provided"
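+ # With a threshold, predict every label whose probability exceeds it; otherwise predict each sample's top-k labels, where k is that sample's true label count from topk_list.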
+ if threshold is not None:
+ preds = probs > threshold
+ else:
+ preds = np.zeros_like(labels, dtype=np.int64)
+ for idx, (prob, topk) in enumerate(zip(np.argsort(probs), topk_list)):
+ preds[idx][prob[-int(topk):]] = 1
+ return f1_score(labels, preds, average=average)
+
+
+def main(args):
+ hidden_size = args.hidden_size
+ epoch = args.epoch
+ ckpt_path = args.ckpt_path
+ threshold = args.threshold
+
+ dataset = load(args.dataset)
+
+ if args.batch_size is None:
+ batch_size = len(dataset.train_index)
+ else:
+ batch_size = args.batch_size
+
+ train_steps = (len(dataset.train_index) // batch_size) * epoch
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_prog = fluid.Program()
+ test_prog = fluid.Program()
+ startup_prog = fluid.Program()
+
+ with fluid.program_guard(train_prog, startup_prog):
+ with fluid.unique_name.guard():
+ train_pyreader, train_loss, train_probs, train_labels, train_topk = node_classify_model(
+ dataset.graph,
+ dataset.num_groups,
+ hidden_size=hidden_size,
+ name='train')
+ lr = l.polynomial_decay(0.025, train_steps, 0.0001)
+ adam = fluid.optimizer.Adam(lr)
+ adam.minimize(train_loss)
+
+ with fluid.program_guard(test_prog, startup_prog):
+ with fluid.unique_name.guard():
+ test_pyreader, test_loss, test_probs, test_labels, test_topk = node_classify_model(
+ dataset.graph,
+ dataset.num_groups,
+ hidden_size=hidden_size,
+ name='test')
+ test_prog = test_prog.clone(for_test=True)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+
+ train_pyreader.decorate_tensor_provider(
+ node_classify_generator(
+ dataset.graph,
+ dataset.train_index,
+ batch_size=batch_size,
+ epoch=epoch))
+ test_pyreader.decorate_tensor_provider(
+ node_classify_generator(
+ dataset.graph, dataset.test_index, batch_size=batch_size, epoch=1))
+
+ def existed_params(var):
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ return os.path.exists(os.path.join(ckpt_path, var.name))
+
+ fluid.io.load_vars(
+ exe, ckpt_path, main_program=train_prog, predicate=existed_params)
+ step = 0
+ prev_time = time.time()
+ train_pyreader.start()
+
+ while 1:
+ try:
+ train_loss_val, train_probs_val, train_labels_val, train_topk_val = exe.run(
+ train_prog,
+ fetch_list=[
+ train_loss, train_probs, train_labels, train_topk
+ ],
+ return_numpy=True)
+ train_macro_f1 = topk_f1_score(train_labels_val, train_probs_val,
+ train_topk_val, "macro", threshold)
+ train_micro_f1 = topk_f1_score(train_labels_val, train_probs_val,
+ train_topk_val, "micro", threshold)
+ step += 1
+ log.info("Step %d " % step + "Train Loss: %f " % train_loss_val +
+ "Train Macro F1: %f " % train_macro_f1 +
+ "Train Micro F1: %f " % train_micro_f1)
+ except fluid.core.EOFException:
+ train_pyreader.reset()
+ break
+
+ test_pyreader.start()
+ test_probs_vals, test_labels_vals, test_topk_vals = [], [], []
+ while 1:
+ try:
+ test_loss_val, test_probs_val, test_labels_val, test_topk_val = exe.run(
+ test_prog,
+ fetch_list=[
+ test_loss, test_probs, test_labels, test_topk
+ ],
+ return_numpy=True)
+            test_probs_vals.append(test_probs_val)
+            test_labels_vals.append(test_labels_val)
+ test_topk_vals.append(test_topk_val)
+ except fluid.core.EOFException:
+ test_pyreader.reset()
+ test_probs_array = np.concatenate(test_probs_vals)
+ test_labels_array = np.concatenate(test_labels_vals)
+ test_topk_array = np.concatenate(test_topk_vals)
+ test_macro_f1 = topk_f1_score(
+ test_labels_array, test_probs_array, test_topk_array,
+ "macro", threshold)
+ test_micro_f1 = topk_f1_score(
+ test_labels_array, test_probs_array, test_topk_array,
+ "micro", threshold)
+ log.info("\t\tStep %d " % step + "Test Loss: %f " %
+ test_loss_val + "Test Macro F1: %f " % test_macro_f1 +
+ "Test Micro F1: %f " % test_micro_f1)
+ break
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='node2vec')
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default="BlogCatalog",
+ help="dataset (BlogCatalog)")
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument("--hidden_size", type=int, default=128)
+ parser.add_argument("--epoch", type=int, default=400)
+ parser.add_argument("--batch_size", type=int, default=None)
+ parser.add_argument("--threshold", type=float, default=0.3)
+ parser.add_argument(
+ "--ckpt_path",
+ type=str,
+ default="./tmp/baseline_node2vec/paddle_model")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/distribute_deepwalk/pgl_deepwalk.cfg b/examples/distribute_deepwalk/pgl_deepwalk.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..8c06b3174378829d5c9b7334c92e60041857bda4
--- /dev/null
+++ b/examples/distribute_deepwalk/pgl_deepwalk.cfg
@@ -0,0 +1,22 @@
+
+# deepwalk config
+num_nodes=10312 # max node_id + 1
+num_sample_workers=2
+epoch=100
+
+optimizer=sgd # sgd or adam
+learning_rate=0.5
+
+neg_num=5
+walk_len=40
+win_size=5
+dim=128
+batch_size=8
+steps_per_save=5000
+
+is_sparse=False
+distributed_embedding=False # only use when num_nodes > 100,000,000; slower than normal embedding
+build_train_data=True
+pre_walk=False
+
+CPU_NUM=16
diff --git a/examples/distribute_deepwalk/reader.py b/examples/distribute_deepwalk/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..f797c18ed4ac91c63d9a57b8a8741782c6151b23
--- /dev/null
+++ b/examples/distribute_deepwalk/reader.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Reader file.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+import time
+import io
+import os
+
+import numpy as np
+import paddle
+from pgl.utils.logger import log
+from pgl.sample import node2vec_sample
+from pgl.sample import deepwalk_sample
+from pgl.sample import alias_sample
+from pgl.graph_kernel import skip_gram_gen_pair
+from pgl.graph_kernel import alias_sample_build_table
+from pgl.utils import mp_reader
+
+
+class DeepwalkReader(object):
+ def __init__(self,
+ graph,
+ batch_size=512,
+ walk_len=40,
+ win_size=5,
+ neg_num=5,
+ train_files=None,
+ walkpath_files=None,
+ neg_sample_type="average"):
+ """
+ Args:
+            walkpath_files: if not None, read walks from walkpath_files instead of sampling from the graph
+ """
+ self.graph = graph
+ self.batch_size = batch_size
+ self.walk_len = walk_len
+ self.win_size = win_size
+ self.neg_num = neg_num
+ self.train_files = train_files
+ self.walkpath_files = walkpath_files
+ self.neg_sample_type = neg_sample_type
+
+ def walk_from_files(self):
+ bucket = []
+ while True:
+ for filename in self.walkpath_files:
+ with io.open(filename) as inf:
+ for line in inf:
+ #walk = [hash_map[x] for x in line.strip('\n\t').split('\t')]
+ walk = [int(x) for x in line.strip('\n\t').split('\t')]
+ bucket.append(walk)
+ if len(bucket) == self.batch_size:
+ yield bucket
+ bucket = []
+ if len(bucket):
+ yield bucket
+
+ def walk_from_graph(self):
+ def node_generator():
+ if self.train_files is None:
+ while True:
+ for nodes in self.graph.node_batch_iter(self.batch_size):
+ yield nodes
+ else:
+ nodes = []
+ while True:
+ for filename in self.train_files:
+ with io.open(filename) as inf:
+ for line in inf:
+ node = int(line.strip('\n\t'))
+ nodes.append(node)
+ if len(nodes) == self.batch_size:
+ yield nodes
+ nodes = []
+ if len(nodes):
+ yield nodes
+
+ if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
+ log.info("Deepwalk using alias sample")
+ for nodes in node_generator():
+ if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
+ walks = deepwalk_sample(self.graph, nodes, self.walk_len,
+ "alias", "events")
+ else:
+ walks = deepwalk_sample(self.graph, nodes, self.walk_len)
+ yield walks
+
+ def walk_generator(self):
+ if self.walkpath_files is not None:
+ for i in self.walk_from_files():
+ yield i
+ else:
+ for i in self.walk_from_graph():
+ yield i
+
+ def __call__(self):
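+        # seed with the process id so each sampling worker draws different walks and negatives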
+ np.random.seed(os.getpid())
+ if self.neg_sample_type == "outdegree":
+ outdegree = self.graph.outdegree()
+ distribution = 1. * outdegree / outdegree.sum()
+ alias, events = alias_sample_build_table(distribution)
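+        # heuristic cap on the number of (src, pos) skip-gram pairs kept per batch of walks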
+ max_len = int(self.batch_size * self.walk_len * (
+ (1 + self.win_size) - 0.3))
+ for walks in self.walk_generator():
+ try:
+ src_list, pos_list = [], []
+ for walk in walks:
+ s, p = skip_gram_gen_pair(walk, self.win_size)
+                    src_list.append(s[:max_len])
+                    pos_list.append(p[:max_len])
+ src = [s for x in src_list for s in x]
+ pos = [s for x in pos_list for s in x]
+                src = np.array(src, dtype=np.int64)
+ pos = np.array(pos, dtype=np.int64)
+ src, pos = np.reshape(src, [-1, 1, 1]), np.reshape(pos,
+ [-1, 1, 1])
+
+ neg_sample_size = [len(pos), self.neg_num, 1]
+ if src.shape[0] == 0:
+ continue
+ if self.neg_sample_type == "average":
+ negs = np.random.randint(
+ low=0, high=self.graph.num_nodes, size=neg_sample_size)
+ elif self.neg_sample_type == "outdegree":
+ negs = alias_sample(neg_sample_size, alias, events)
+ elif self.neg_sample_type == "inbatch":
+                    raise NotImplementedError(
+                        "inbatch negative sampling is not implemented")
+ dst = np.concatenate([pos, negs], 1)
+ # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
+ yield src[:max_len], dst[:max_len]
+ except Exception as e:
+ log.exception(e)
diff --git a/examples/distribute_deepwalk/utils.py b/examples/distribute_deepwalk/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0958d63d6d4cdc9a661192914bfd29a8a09280cd
--- /dev/null
+++ b/examples/distribute_deepwalk/utils.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Utils file.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import time
+
+import numpy as np
+from pgl.utils.logger import log
+from pgl.graph import Graph
+from pgl.sample import graph_alias_sample_table
+
+from reader import DeepwalkReader
+import mp_reader
+
+
+def get_file_list(path):
+ filelist = []
+ if os.path.isfile(path):
+ filelist = [path]
+ elif os.path.isdir(path):
+ filelist = [
+ os.path.join(dp, f)
+ for dp, dn, filenames in os.walk(path) for f in filenames
+ ]
+ else:
+ raise ValueError(path + " not supported")
+ return filelist
+
+
+def build_graph(num_nodes, edge_path):
+ filelist = []
+ if os.path.isfile(edge_path):
+ filelist = [edge_path]
+ elif os.path.isdir(edge_path):
+ filelist = [
+ os.path.join(dp, f)
+ for dp, dn, filenames in os.walk(edge_path) for f in filenames
+ ]
+ else:
+ raise ValueError(edge_path + " not supported")
+ edges, edge_weight = [], []
+ for name in filelist:
+ with open(name) as inf:
+ for line in inf:
+ slots = line.strip("\n").split()
+ edges.append([slots[0], slots[1]])
+ edges.append([slots[1], slots[0]])
+ if len(slots) > 2:
+ edge_weight.extend([float(slots[2]), float(slots[2])])
+ edges = np.array(edges, dtype="int64")
+ assert num_nodes > edges.max(
+ ), "Node id in any edges should be smaller then num_nodes!"
+
+ edge_feat = dict()
+ if len(edge_weight) == len(edges):
+ edge_feat["weight"] = np.array(edge_weight)
+
+ graph = Graph(num_nodes, edges, edge_feat=edge_feat)
+ log.info("Build graph done")
+
+ graph.outdegree()
+
+ del edges, edge_feat
+
+ log.info("Build graph index done")
+ if "weight" in graph.edge_feat:
+ graph.node_feat["alias"], graph.node_feat[
+ "events"] = graph_alias_sample_table(graph, "weight")
+ log.info("Build graph alias sample table done")
+ return graph
+
+
+def build_fake_graph(num_nodes):
+ class FakeGraph():
+ pass
+
+ graph = FakeGraph()
+ graph.num_nodes = num_nodes
+ return graph
+
+
+def build_gen_func(args, graph):
+ num_sample_workers = args.num_sample_workers
+
+ if args.walkpath_files is None or args.walkpath_files == "None":
+ walkpath_files = [None for _ in range(num_sample_workers)]
+ else:
+ files = get_file_list(args.walkpath_files)
+ walkpath_files = [[] for i in range(num_sample_workers)]
+ for idx, f in enumerate(files):
+ walkpath_files[idx % num_sample_workers].append(f)
+
+ if args.train_files is None or args.train_files == "None":
+ train_files = [None for _ in range(num_sample_workers)]
+ else:
+ files = get_file_list(args.train_files)
+ train_files = [[] for i in range(num_sample_workers)]
+ for idx, f in enumerate(files):
+ train_files[idx % num_sample_workers].append(f)
+
+ gen_func_pool = [
+ DeepwalkReader(
+ graph,
+ batch_size=args.batch_size,
+ walk_len=args.walk_len,
+ win_size=args.win_size,
+ neg_num=args.neg_num,
+ neg_sample_type=args.neg_sample_type,
+ walkpath_files=walkpath_files[i],
+ train_files=train_files[i]) for i in range(num_sample_workers)
+ ]
+ if num_sample_workers == 1:
+ gen_func = gen_func_pool[0]
+ else:
+ gen_func = mp_reader.multiprocess_reader(
+ gen_func_pool, use_pipe=True, queue_size=100)
+ return gen_func
+
+
+def test_gen_speed(gen_func):
+ cur_time = time.time()
+ for idx, _ in enumerate(gen_func()):
+ log.info("iter %s: %s s" % (idx, time.time() - cur_time))
+ cur_time = time.time()
+ if idx == 100:
+ break
diff --git a/examples/distribute_graphsage/README.md b/examples/distribute_graphsage/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e424d4668955476eecf459ade84603b71e26ebe
--- /dev/null
+++ b/examples/distribute_graphsage/README.md
@@ -0,0 +1,57 @@
+# Distribute GraphSAGE in PGL
+
+[GraphSAGE](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) is a general inductive framework that leverages node feature
+information (e.g., text attributes) to efficiently generate node embeddings for previously unseen data. Instead of training individual embeddings for each node, GraphSAGE learns a function that generates embeddings by sampling and aggregating features from a node’s local neighborhood. Based on PGL, we reproduce the GraphSAGE algorithm and match the results reported in the paper on the Reddit dataset. This example also demonstrates subgraph sampling and training in PGL.
+
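+The core of a GraphSAGE layer is neighborhood aggregation: the features of a node's sampled neighbors are pooled and combined with the node's own features. Below is a rough, minimal sketch of the mean aggregator in plain NumPy with illustrative names; it only loosely mirrors `graphsage_mean` in `model.py`, which additionally applies an activation inside its fully-connected transforms.
+
+```python
+import numpy as np
+
+def mean_aggregate(self_feat, neigh_feats, w_self, w_neigh):
+    """Toy single-node mean aggregator (illustration only)."""
+    neigh_mean = neigh_feats.mean(axis=0)        # average the sampled neighbors' features
+    h = np.concatenate([self_feat @ w_self,      # transform the node's own features
+                        neigh_mean @ w_neigh])   # transform the aggregated neighbor features
+    return h / np.linalg.norm(h)                 # l2-normalize the concatenated output
+```
+
+Here `self_feat` is one node's feature vector, `neigh_feats` holds the sampled neighbors' features, and `w_self`/`w_neigh` are illustrative weight matrices.
+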
+For high scalability, we use Redis as a distributed graph storage solution and train GraphSAGE against the Redis server.
+
+### Datasets (Quickstart)
+The Reddit dataset should be downloaded from [reddit_adj.npz](https://drive.google.com/open?id=174vb0Ws7Vxk_QTUtxqTgDHSQ4El4qDHt) and [reddit.npz](https://drive.google.com/open?id=19SphVl_Oe8SJ1r87Hr5a6znx3nJu1F2J). The details of the Reddit dataset can be found [here](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf).
+
+Alternatively, the Reddit dataset has been preprocessed and packed into a Docker image, which can be pulled with the following command.
+
+```sh
+docker pull githubutilities/reddit_redis_demo:v0.1
+```
+
+### Dependencies
+
+```txt
+- paddlepaddle>=1.6
+- pgl
+- scipy
+- redis==2.10.6
+- redis-py-cluster==1.3.6
+```
+
+### How to run
+
+#### 1. Start reddit data service
+
+```sh
+docker run \
+ --net=host \
+ -d --rm \
+ --name reddit_demo \
+ -it githubutilities/reddit_redis_demo:v0.1 \
+ /bin/bash -c "/bin/bash ./before_hook.sh && /bin/bash"
+docker logs -f `docker ps -aqf "name=reddit_demo"`
+```
+
+#### 2. Train the GraphSAGE model
+
+```sh
+python train.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --sample_workers 10
+```
+
+#### Hyperparameters
+
+- epoch: Number of training epochs (default: 10).
+- use_cuda: Use GPU if specified.
+- graphsage_type: We support 4 aggregator types including "graphsage_mean", "graphsage_maxpool", "graphsage_meanpool" and "graphsage_lstm".
+- sample_workers: The number of workers for multiprocessing subgraph sampling.
+- lr: Learning rate.
+- batch_size: Batch size.
+- samples_1: The max neighbors for the first hop neighbor sampling. (default: 25)
+- samples_2: The max neighbors for the second hop neighbor sampling. (default: 10)
+- hidden_size: The hidden size of the GraphSAGE models.
diff --git a/examples/distribute_graphsage/data/reddit_index_label.npz b/examples/distribute_graphsage/data/reddit_index_label.npz
new file mode 100644
index 0000000000000000000000000000000000000000..69c2ae1ced956dd5b40221ea0e8e11d2b470230a
Binary files /dev/null and b/examples/distribute_graphsage/data/reddit_index_label.npz differ
diff --git a/examples/distribute_graphsage/model.py b/examples/distribute_graphsage/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..145a979e86951dc4b5a2522154a0dc0373eea065
--- /dev/null
+++ b/examples/distribute_graphsage/model.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.fluid as fluid
+
+
+def copy_send(src_feat, dst_feat, edge_feat):
+ return src_feat["h"]
+
+
+def mean_recv(feat):
+ return fluid.layers.sequence_pool(feat, pool_type="average")
+
+
+def sum_recv(feat):
+ return fluid.layers.sequence_pool(feat, pool_type="sum")
+
+
+def max_recv(feat):
+ return fluid.layers.sequence_pool(feat, pool_type="max")
+
+
+def lstm_recv(feat):
+ hidden_dim = 128
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=feat, size=hidden_dim * 4, use_peepholes=False)
+ output = fluid.layers.sequence_last_step(forward)
+ return output
+
+
+def graphsage_mean(gw, feature, hidden_size, act, name):
+ msg = gw.send(copy_send, nfeat_list=[("h", feature)])
+ neigh_feature = gw.recv(msg, mean_recv)
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_meanpool(gw,
+ feature,
+ hidden_size,
+ act,
+ name,
+ inner_hidden_size=512):
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+ msg = gw.send(copy_send, nfeat_list=[("h", neigh_feature)])
+ neigh_feature = gw.recv(msg, mean_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_maxpool(gw,
+ feature,
+ hidden_size,
+ act,
+ name,
+ inner_hidden_size=512):
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+ msg = gw.send(copy_send, nfeat_list=[("h", neigh_feature)])
+ neigh_feature = gw.recv(msg, max_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_lstm(gw, feature, hidden_size, act, name):
+ inner_hidden_size = 128
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+
+ hidden_dim = 128
+ forward_proj = fluid.layers.fc(input=neigh_feature,
+ size=hidden_dim * 4,
+ bias_attr=False,
+ name="lstm_proj")
+ msg = gw.send(copy_send, nfeat_list=[("h", forward_proj)])
+ neigh_feature = gw.recv(msg, lstm_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
diff --git a/examples/distribute_graphsage/reader.py b/examples/distribute_graphsage/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c230cce3c40161881163a0446ab51977ecc9700
--- /dev/null
+++ b/examples/distribute_graphsage/reader.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pickle as pkl
+import paddle
+import paddle.fluid as fluid
+import socket
+import pgl
+import time
+
+from pgl.utils import mp_reader
+from pgl.utils.logger import log
+from pgl import redis_graph
+
+
+def node_batch_iter(nodes, node_label, batch_size):
+ """node_batch_iter
+ """
+ perm = np.arange(len(nodes))
+ np.random.shuffle(perm)
+ start = 0
+ while start < len(nodes):
+ index = perm[start:start + batch_size]
+ start += batch_size
+ yield nodes[index], node_label[index]
+
+
+def traverse(item):
+ """traverse
+ """
+ if isinstance(item, list) or isinstance(item, np.ndarray):
+ for i in iter(item):
+ for j in traverse(i):
+ yield j
+ else:
+ yield item
+
+
+def flat_node_and_edge(nodes, eids):
+ """flat_node_and_edge
+ """
+ nodes = list(set(traverse(nodes)))
+ eids = list(set(traverse(eids)))
+ return nodes, eids
+
+
+def worker(batch_info, graph_wrapper, samples):
+ """Worker
+ """
+
+ def work():
+ """work
+ """
+ redis_configs = [{
+ "host": socket.gethostbyname(socket.gethostname()),
+ "port": 7430
+ }, ]
+ graph = redis_graph.RedisGraph("sub_graph", redis_configs, 64)
+ for batch_train_samples, batch_train_labels in batch_info:
+ start_nodes = batch_train_samples
+ nodes = start_nodes
+ eids = []
+ eid2edges = {}
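+            # multi-hop sampling: at each hop, sample predecessors of the current frontier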
+ for max_deg in samples:
+ pred, pred_eid = graph.sample_predecessor(
+ start_nodes, max_degree=max_deg, return_eids=True)
+ for _dst, _srcs, _eids in zip(start_nodes, pred, pred_eid):
+ for _src, _eid in zip(_srcs, _eids):
+ eid2edges[_eid] = (_src, _dst)
+
+ last_nodes = nodes
+ nodes = [nodes, pred]
+ eids = [eids, pred_eid]
+ nodes, eids = flat_node_and_edge(nodes, eids)
+ # Find new nodes
+ start_nodes = list(set(nodes) - set(last_nodes))
+ if len(start_nodes) == 0:
+ break
+
+ subgraph = graph.subgraph(
+ nodes=nodes, eid=eids, edges=[eid2edges[e] for e in eids])
+ sub_node_index = subgraph.reindex_from_parrent_nodes(
+ batch_train_samples)
+ feed_dict = graph_wrapper.to_feed(subgraph)
+ feed_dict["node_label"] = np.expand_dims(
+ np.array(
+ batch_train_labels, dtype="int64"), -1)
+ feed_dict["node_index"] = sub_node_index
+ yield feed_dict
+
+ return work
+
+
+def multiprocess_graph_reader(graph_wrapper,
+ samples,
+ node_index,
+ batch_size,
+ node_label,
+ num_workers=4):
+ """multiprocess_graph_reader
+ """
+
+ def parse_to_subgraph(rd):
+ """parse_to_subgraph
+ """
+
+ def work():
+ """work
+ """
+            for data in rd():
+                yield data
+
+ return work
+
+ def reader():
+ """reader"""
+ batch_info = list(
+ node_batch_iter(
+ node_index, node_label, batch_size=batch_size))
+ block_size = int(len(batch_info) / num_workers + 1)
+ reader_pool = []
+ for i in range(num_workers):
+ reader_pool.append(
+ worker(batch_info[block_size * i:block_size * (i + 1)],
+ graph_wrapper, samples))
+ multi_process_sample = mp_reader.multiprocess_reader(
+ reader_pool, use_pipe=True, queue_size=1000)
+ r = parse_to_subgraph(multi_process_sample)
+ return paddle.reader.buffered(r, 1000)
+
+ return reader()
diff --git a/examples/distribute_graphsage/requirements.txt b/examples/distribute_graphsage/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bfc094c11eb1cebe90f8acbc4c399eb688d9c7cb
--- /dev/null
+++ b/examples/distribute_graphsage/requirements.txt
@@ -0,0 +1,3 @@
+scipy
+redis==2.10.6
+redis-py-cluster==1.3.6
diff --git a/examples/distribute_graphsage/train.py b/examples/distribute_graphsage/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb62acf8c20b1e63126c6ff20687d6e59f597d7f
--- /dev/null
+++ b/examples/distribute_graphsage/train.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import time
+
+import numpy as np
+import scipy.sparse as sp
+from sklearn.preprocessing import StandardScaler
+
+import pgl
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle
+import paddle.fluid as fluid
+import reader
+from model import graphsage_mean, graphsage_meanpool,\
+ graphsage_maxpool, graphsage_lstm
+
+
+def load_data():
+ """
+ data from https://github.com/matenure/FastGCN/issues/8
+ reddit.npz: https://drive.google.com/open?id=19SphVl_Oe8SJ1r87Hr5a6znx3nJu1F2J
+ reddit_index_label is preprocess from reddit.npz without feats key.
+ """
+ data_dir = os.path.dirname(os.path.abspath(__file__))
+ data = np.load(os.path.join(data_dir, "data/reddit_index_label.npz"))
+
+ num_class = 41
+
+ train_label = data['y_train']
+ val_label = data['y_val']
+ test_label = data['y_test']
+
+ train_index = data['train_index']
+ val_index = data['val_index']
+ test_index = data['test_index']
+
+ return {
+ "train_index": train_index,
+ "train_label": train_label,
+ "val_label": val_label,
+ "val_index": val_index,
+ "test_index": test_index,
+ "test_label": test_label,
+ "num_class": 41
+ }
+
+
+def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
+ hidden_size):
+ node_index = fluid.layers.data(
+ "node_index", shape=[None], dtype="int64", append_batch_size=False)
+
+ node_label = fluid.layers.data(
+ "node_label", shape=[None, 1], dtype="int64", append_batch_size=False)
+
+ #feature = fluid.layers.gather(feature, graph_wrapper.node_feat['feats'])
+ feature = graph_wrapper.node_feat['feats']
+ feature.stop_gradient = True
+
+ for i in range(k_hop):
+ if graphsage_type == 'graphsage_mean':
+ feature = graphsage_mean(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_mean_%s" % i)
+ elif graphsage_type == 'graphsage_meanpool':
+ feature = graphsage_meanpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_meanpool_%s" % i)
+ elif graphsage_type == 'graphsage_maxpool':
+ feature = graphsage_maxpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s" % i)
+ elif graphsage_type == 'graphsage_lstm':
+ feature = graphsage_lstm(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s" % i)
+ else:
+ raise ValueError("graphsage type %s is not"
+ " implemented" % graphsage_type)
+
+ feature = fluid.layers.gather(feature, node_index)
+ logits = fluid.layers.fc(feature,
+ num_class,
+ act=None,
+ name='classification_layer')
+ proba = fluid.layers.softmax(logits)
+
+ loss = fluid.layers.softmax_with_cross_entropy(
+ logits=logits, label=node_label)
+ loss = fluid.layers.mean(loss)
+ acc = fluid.layers.accuracy(input=proba, label=node_label, k=1)
+ return loss, acc
+
+
+def run_epoch(batch_iter,
+ exe,
+ program,
+ prefix,
+ model_loss,
+ model_acc,
+ epoch,
+ log_per_step=100):
+ batch = 0
+ total_loss = 0.
+ total_acc = 0.
+ total_sample = 0
+ start = time.time()
+ for batch_feed_dict in batch_iter():
+ batch += 1
+ batch_loss, batch_acc = exe.run(program,
+ fetch_list=[model_loss, model_acc],
+ feed=batch_feed_dict)
+
+ if batch % log_per_step == 0:
+ log.info("Batch %s %s-Loss %s %s-Acc %s" %
+ (batch, prefix, batch_loss, prefix, batch_acc))
+
+ num_samples = len(batch_feed_dict["node_index"])
+ total_loss += batch_loss * num_samples
+ total_acc += batch_acc * num_samples
+ total_sample += num_samples
+ end = time.time()
+
+ log.info("%s Epoch %s Loss %.5lf Acc %.5lf Speed(per batch) %.5lf sec" %
+ (prefix, epoch, total_loss / total_sample,
+ total_acc / total_sample, (end - start) / batch))
+
+
+def main(args):
+ data = load_data()
+ log.info("preprocess finish")
+ log.info("Train Examples: %s" % len(data["train_index"]))
+ log.info("Val Examples: %s" % len(data["val_index"]))
+ log.info("Test Examples: %s" % len(data["test_index"]))
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ samples = []
+ if args.samples_1 > 0:
+ samples.append(args.samples_1)
+ if args.samples_2 > 0:
+ samples.append(args.samples_2)
+
+ with fluid.program_guard(train_program, startup_program):
+ graph_wrapper = pgl.graph_wrapper.GraphWrapper(
+ "sub_graph",
+ fluid.CPUPlace(),
+ node_feat=[('feats', [None, 602], np.dtype('float32'))])
+ model_loss, model_acc = build_graph_model(
+ graph_wrapper,
+ num_class=data["num_class"],
+ hidden_size=args.hidden_size,
+ graphsage_type=args.graphsage_type,
+ k_hop=len(samples))
+
+ test_program = train_program.clone(for_test=True)
+
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=args.lr)
+ adam.minimize(model_loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ train_iter = reader.multiprocess_graph_reader(
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['train_index'],
+ node_label=data["train_label"])
+
+ val_iter = reader.multiprocess_graph_reader(
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['val_index'],
+ node_label=data["val_label"])
+
+ test_iter = reader.multiprocess_graph_reader(
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['test_index'],
+ node_label=data["test_label"])
+
+ for epoch in range(args.epoch):
+ run_epoch(
+ train_iter,
+ program=train_program,
+ exe=exe,
+ prefix="train",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=1,
+ epoch=epoch)
+
+ run_epoch(
+ val_iter,
+ program=test_program,
+ exe=exe,
+ prefix="val",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+ run_epoch(
+ test_iter,
+ program=test_program,
+ prefix="test",
+ exe=exe,
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='graphsage')
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument(
+ "--normalize", action='store_true', help="normalize features")
+ parser.add_argument(
+ "--symmetry", action='store_true', help="undirect graph")
+ parser.add_argument("--graphsage_type", type=str, default="graphsage_mean")
+ parser.add_argument("--sample_workers", type=int, default=10)
+ parser.add_argument("--epoch", type=int, default=10)
+ parser.add_argument("--hidden_size", type=int, default=128)
+ parser.add_argument("--batch_size", type=int, default=128)
+ parser.add_argument("--lr", type=float, default=0.01)
+ parser.add_argument("--samples_1", type=int, default=25)
+ parser.add_argument("--samples_2", type=int, default=10)
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/gat/README.md b/examples/gat/README.md
index 6b18dcc8d33d14c9b7818a5ab216ad83e48e3ee9..573316ea729a03441d47f7a54fe83163142cf96c 100644
--- a/examples/gat/README.md
+++ b/examples/gat/README.md
@@ -26,24 +26,25 @@ def gat_layer(graph_wrapper, node_feature, hidden_size):
return output
```
+
### Datasets
The datasets contain three citation networks: CORA, PUBMED, CITESEER. The details for these three datasets can be found in the [paper](https://arxiv.org/abs/1609.02907).
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4
(epoch time) | Speed with paddle 1.5
(epoch time)|
-| --- | --- | --- |---|
-| Cora | ~83% | 0.0188s | 0.0175s |
-| Pubmed | ~78% | 0.0449s | 0.0295s |
-| Citeseer | ~70% | 0.0275 | 0.0253s |
+| Dataset | Accuracy |
+| --- | --- |
+| Cora | ~83% |
+| Pubmed | ~78% |
+| Citeseer | ~70% |
### How to run
diff --git a/examples/gat/train.py b/examples/gat/train.py
index 0446f0262defbb85a39e235223183eb85706778f..344948803539f260bbe7288a4ee16a423c5c5f8a 100644
--- a/examples/gat/train.py
+++ b/examples/gat/train.py
@@ -68,7 +68,7 @@ def main(args):
node_index = fluid.layers.data(
"node_index",
shape=[None, 1],
- dtype="int32",
+ dtype="int64",
append_batch_size=False)
node_label = fluid.layers.data(
"node_label",
@@ -111,7 +111,7 @@ def main(args):
for epoch in range(200):
if epoch >= 3:
t0 = time.time()
- feed_dict["node_index"] = np.array(train_index, dtype="int32")
+ feed_dict["node_index"] = np.array(train_index, dtype="int64")
feed_dict["node_label"] = np.array(train_label, dtype="int64")
train_loss, train_acc = exe.run(train_program,
feed=feed_dict,
@@ -121,7 +121,7 @@ def main(args):
time_per_epoch = 1.0 * (time.time() - t0)
dur.append(time_per_epoch)
- feed_dict["node_index"] = np.array(val_index, dtype="int32")
+ feed_dict["node_index"] = np.array(val_index, dtype="int64")
feed_dict["node_label"] = np.array(val_label, dtype="int64")
val_loss, val_acc = exe.run(test_program,
feed=feed_dict,
@@ -132,7 +132,7 @@ def main(args):
"Train Loss: %f " % train_loss + "Train Acc: %f " % train_acc
+ "Val Loss: %f " % val_loss + "Val Acc: %f " % val_acc)
- feed_dict["node_index"] = np.array(test_index, dtype="int32")
+ feed_dict["node_index"] = np.array(test_index, dtype="int64")
feed_dict["node_label"] = np.array(test_label, dtype="int64")
test_loss, test_acc = exe.run(test_program,
feed=feed_dict,
diff --git a/examples/gcn/README.md b/examples/gcn/README.md
index c56c604e0f89eef2c714dac49a1f10c3fbbe48d3..1a1cf4fc54b3c7232b8c3b1cb0516b34de05240d 100644
--- a/examples/gcn/README.md
+++ b/examples/gcn/README.md
@@ -26,18 +26,18 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4
(epoch time) | Speed with paddle 1.5
(epoch time)|
-| --- | --- | --- |---|
-| Cora | ~81% | 0.0106s | 0.0104s |
-| Pubmed | ~79% | 0.0210s | 0.0154s |
-| Citeseer | ~71% | 0.0175s | 0.0177s |
+| Dataset | Accuracy |
+| --- | --- |
+| Cora | ~81% |
+| Pubmed | ~79% |
+| Citeseer | ~71% |
### How to run
diff --git a/examples/gcn/train.py b/examples/gcn/train.py
index 6dfef67965a9179710a25ff9c2529da6108b40ce..7a1aaf3ab36d3abdfdafe4451572f4104284b9a1 100644
--- a/examples/gcn/train.py
+++ b/examples/gcn/train.py
@@ -70,7 +70,7 @@ def main(args):
node_index = fluid.layers.data(
"node_index",
shape=[None, 1],
- dtype="int32",
+ dtype="int64",
append_batch_size=False)
node_label = fluid.layers.data(
"node_label",
@@ -113,7 +113,7 @@ def main(args):
for epoch in range(200):
if epoch >= 3:
t0 = time.time()
- feed_dict["node_index"] = np.array(train_index, dtype="int32")
+ feed_dict["node_index"] = np.array(train_index, dtype="int64")
feed_dict["node_label"] = np.array(train_label, dtype="int64")
train_loss, train_acc = exe.run(train_program,
feed=feed_dict,
@@ -123,7 +123,7 @@ def main(args):
if epoch >= 3:
time_per_epoch = 1.0 * (time.time() - t0)
dur.append(time_per_epoch)
- feed_dict["node_index"] = np.array(val_index, dtype="int32")
+ feed_dict["node_index"] = np.array(val_index, dtype="int64")
feed_dict["node_label"] = np.array(val_label, dtype="int64")
val_loss, val_acc = exe.run(test_program,
feed=feed_dict,
@@ -134,7 +134,7 @@ def main(args):
"Train Loss: %f " % train_loss + "Train Acc: %f " % train_acc
+ "Val Loss: %f " % val_loss + "Val Acc: %f " % val_acc)
- feed_dict["node_index"] = np.array(test_index, dtype="int32")
+ feed_dict["node_index"] = np.array(test_index, dtype="int64")
feed_dict["node_label"] = np.array(test_label, dtype="int64")
test_loss, test_acc = exe.run(test_program,
feed=feed_dict,
diff --git a/examples/ges/README.md b/examples/ges/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5636cb13c2eec606141b469f3b1a13e347ae8bda
--- /dev/null
+++ b/examples/ges/README.md
@@ -0,0 +1,22 @@
+# PGL Examples for GES
+[Graph Embedding with Side Information](https://arxiv.org/pdf/1803.02349.pdf) is an algorithmic framework for representational learning on graphs. Given any graph, it can learn continuous feature representations for the nodes, which can then be used for various downstream machine learning tasks. Based on PGL, we reproduce the GES algorithm.
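+
+In GES, each node is represented by its own id embedding together with the embeddings of its side information (for BlogCatalog, the group id), and these embeddings are averaged before the skip-gram style loss is computed. Below is a minimal NumPy sketch of that lookup-and-average step, with illustrative names only; it is not the training code.
+
+```python
+import numpy as np
+
+def ges_node_vector(embedding_table, node_id, side_info_ids):
+    """Average the node-id embedding with its side-information embeddings."""
+    ids = np.concatenate([[node_id], side_info_ids])  # node id followed by its side-info ids
+    return embedding_table[ids].mean(axis=0)          # -> [hidden_size]
+```
+
+In `GESModel.forward` the same averaging is performed with `L.reduce_mean` over the feature axis after the embedding lookup.
+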
+## Datasets
+The dataset is the [BlogCatalog](http://socialcomputing.asu.edu/datasets/BlogCatalog3) social network.
+## Dependencies
+- paddlepaddle>=1.6
+- pgl>=1.0.0
+
+## How to run
+
+For example, to train GES on the BlogCatalog dataset:
+```sh
+# train GES on a single GPU machine.
+sh gpu_run.sh
+```
+
+## Hyperparameters
+- dataset: The dataset to use; currently "BlogCatalog".
+- hidden_size: Hidden size of the embedding.
+- lr: Learning rate.
+- neg_num: Number of negative samples.
+- epoch: Number of training epochs.
diff --git a/examples/ges/gpu_run.sh b/examples/ges/gpu_run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2eca3a6a140b84bc6be9ea49c5bff1203e920a55
--- /dev/null
+++ b/examples/ges/gpu_run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+export FLAGS_sync_nccl_allreduce=1
+export FLAGS_eager_delete_tensor_gb=0
+export FLAGS_fraction_of_gpu_memory_to_use=1
+export NCCL_DEBUG=INFO
+export NCCL_IB_GID_INDEX=3
+export GLOG_v=1
+export GLOG_logtostderr=1
+
+num_nodes=10312
+num_embedding=10351
+num_sample_workers=20
+
+# build train_data
+rm -rf train_data && mkdir -p train_data
+cd train_data
+seq 0 $((num_nodes-1)) | shuf | split -l $((num_nodes/num_sample_workers+1))
+cd -
+
+python3 gpu_train.py --output_path ./output --epoch 100 --walk_len 40 --win_size 5 --neg_num 5 --batch_size 128 --hidden_size 128 \
+ --num_nodes $num_nodes --num_embedding $num_embedding --num_sample_workers $num_sample_workers --steps_per_save 2000 --dataset "BlogCatalog"
diff --git a/examples/ges/gpu_train.py b/examples/ges/gpu_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..62fcf3588e3be8a332258c1c6e44d59d6259fcba
--- /dev/null
+++ b/examples/ges/gpu_train.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" gpu_train
+"""
+import argparse
+import time
+import os
+import glob
+
+import numpy as np
+import paddle.fluid as F
+import paddle.fluid.layers as L
+from pgl.utils.logger import log
+from pgl.graph import Graph
+from pgl.sample import graph_alias_sample_table
+from pgl import data_loader
+
+import mp_reader
+from reader import GESReader
+from model import GESModel
+
+
+def get_file_list(path):
+ """get_file_list
+ """
+ filelist = []
+ if os.path.isfile(path):
+ filelist = [path]
+ elif os.path.isdir(path):
+ filelist = [
+ os.path.join(dp, f)
+ for dp, dn, filenames in os.walk(path) for f in filenames
+ ]
+ else:
+ raise ValueError(path + " not supported")
+ return filelist
+
+
+def build_graph(num_nodes, edge_path, output_path, undigraph=True):
+ """ build_graph
+ """
+ edge_file = os.path.join(output_path, "edge.npy")
+ edge_weight_file = os.path.join(output_path, "edge_weight.npy")
+ alias_file = os.path.join(output_path, "alias.npy")
+ events_file = os.path.join(output_path, "events.npy")
+ if os.path.isfile(edge_file):
+ edges = np.load(edge_file)
+ edge_feat = dict()
+ if os.path.isfile(edge_weight_file):
+ log.info("Loading weight from cache")
+ edge_feat["weight"] = np.load(edge_weight_file, allow_pickle=True)
+ node_feat = dict()
+ if os.path.isfile(alias_file):
+ log.info("Loading alias from cache")
+ node_feat["alias"] = np.load(alias_file, allow_pickle=True)
+ if os.path.isfile(events_file):
+ log.info("Loading events from cache")
+ node_feat["events"] = np.load(events_file, allow_pickle=True)
+ else:
+ filelist = get_file_list(edge_path)
+ edges, edge_weight = [], []
+ log.info("Reading edge files")
+ for name in filelist:
+ with open(name) as inf:
+ for line in inf:
+ slots = line.strip("\n").split()
+ edges.append([slots[0], slots[1]])
+ if len(slots) > 2:
+ edge_weight.append(slots[2])
+ edges = np.array(edges, dtype="int64")
+ assert num_nodes > edges.max(
+ ), "Node id in any edges should be smaller then num_nodes!"
+
+ log.info("Read edge files done.")
+ edge_feat = dict()
+ node_feat = dict()
+ if len(edge_weight) == len(edges):
+ edge_feat["weight"] = np.array(edge_weight, dtype="float32")
+
+ if undigraph is True:
+ edges = np.concatenate([edges, edges[:, [1, 0]]], 0)
+ if "weight" in edge_feat:
+ edge_feat["weight"] = np.concatenate(
+ [edge_feat["weight"], edge_feat["weight"]],
+ 0).astype("float64")
+
+ graph = Graph(num_nodes, edges, node_feat, edge_feat=edge_feat)
+ log.info("Build graph done")
+ graph.outdegree()
+ log.info("Build graph index done")
+ if "weight" in graph.edge_feat and "alias" not in graph.node_feat and "events" not in graph.node_feat:
+ graph.node_feat["alias"], graph.node_feat[
+ "events"] = graph_alias_sample_table(graph, "weight")
+ log.info(
+ "Build graph alias sample table done, and saving alias & evnets cache"
+ )
+ np.save(alias_file, graph.node_feat["alias"])
+ np.save(events_file, graph.node_feat["events"])
+ return graph
+
+
+def optimization(base_lr, loss, train_steps, optimizer='adam'):
+ """ optimization
+ """
+ decayed_lr = L.polynomial_decay(base_lr, train_steps, 0.0001)
+
+ if optimizer == 'sgd':
+ optimizer = F.optimizer.SGD(
+ decayed_lr,
+ regularization=F.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0025))
+ elif optimizer == 'adam':
+        # do not use Adam's lazy mode on GPU
+ optimizer = F.optimizer.Adam(decayed_lr)
+ else:
+ raise ValueError
+
+ log.info('learning rate:%f' % (base_lr))
+ optimizer.minimize(loss)
+
+
+def build_gen_func(args, graph, node_feat):
+ """ build_gen_func
+ """
+ num_sample_workers = args.num_sample_workers
+
+ if args.walkpath_files is None:
+ walkpath_files = [None for _ in range(num_sample_workers)]
+ else:
+ files = get_file_list(args.walkpath_files)
+ walkpath_files = [[] for i in range(num_sample_workers)]
+ for idx, f in enumerate(files):
+ walkpath_files[idx % num_sample_workers].append(f)
+
+ if args.train_files is None:
+ train_files = [None for _ in range(num_sample_workers)]
+ else:
+ files = get_file_list(args.train_files)
+ train_files = [[] for i in range(num_sample_workers)]
+ for idx, f in enumerate(files):
+ train_files[idx % num_sample_workers].append(f)
+
+ gen_func_pool = [
+ GESReader(
+ graph,
+ node_feat,
+ batch_size=args.batch_size,
+ walk_len=args.walk_len,
+ win_size=args.win_size,
+ neg_num=args.neg_num,
+ neg_sample_type=args.neg_sample_type,
+ walkpath_files=walkpath_files[i],
+ train_files=train_files[i]) for i in range(num_sample_workers)
+ ]
+ if num_sample_workers == 1:
+ gen_func = gen_func_pool[0]
+ else:
+ gen_func = mp_reader.multiprocess_reader(
+ gen_func_pool, use_pipe=True, queue_size=100)
+ return gen_func
+
+
+def get_parallel_exe(program, loss):
+ """ get_parallel_exe
+ """
+ exec_strategy = F.ExecutionStrategy()
+    exec_strategy.num_threads = 1  # 2 for fp32, 4 for fp16
+    exec_strategy.use_experimental_executor = True
+    exec_strategy.num_iteration_per_drop_scope = 10  # drop execution scopes periodically to limit memory usage
+
+ build_strategy = F.BuildStrategy()
+ build_strategy.enable_inplace = True
+ build_strategy.memory_optimize = True
+ build_strategy.remove_unnecessary_lock = True
+
+ #return compiled_prog
+ train_exe = F.ParallelExecutor(
+ use_cuda=True,
+ loss_name=loss.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy,
+ main_program=program)
+ return train_exe
+
+
+def train(train_exe, exe, program, loss, node2vec_pyreader, args, train_steps):
+ """ train
+ """
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+ step = 0
+ while True:
+ try:
+ begin_time = time.time()
+ loss_val, = train_exe.run(fetch_list=[loss])
+ log.info("step %s: loss %.5f speed: %.5f s/step" %
+ (step, np.mean(loss_val), time.time() - begin_time))
+ step += 1
+ except F.core.EOFException:
+ node2vec_pyreader.reset()
+
+ if (step % args.steps_per_save == 0 or
+ step == train_steps) and trainer_id == 0:
+
+ model_save_dir = args.output_path
+ model_path = os.path.join(model_save_dir, str(step))
+ if not os.path.exists(model_save_dir):
+ os.makedirs(model_save_dir)
+ F.io.save_params(exe, model_path, program)
+
+ if step == train_steps:
+ break
+
+
+def test_gen_speed(gen_func):
+ """ test_gen_speed
+ """
+ cur_time = time.time()
+ for idx, _ in enumerate(gen_func()):
+ log.info("iter %s: %s s" % (idx, time.time() - cur_time))
+ cur_time = time.time()
+ if idx == 100:
+ break
+
+
+def main(args):
+ """ main
+ """
+ import logging
+ log.setLevel(logging.DEBUG)
+ log.info("start")
+
+ if args.dataset is not None:
+ if args.dataset == "BlogCatalog":
+ graph = data_loader.BlogCatalogDataset().graph
+ else:
+            raise ValueError(args.dataset + " dataset doesn't exist")
+        log.info("Load built-in BlogCatalog dataset done.")
+ node_feat = np.expand_dims(graph.node_feat["group_id"].argmax(-1),
+ -1) + graph.num_nodes
+ args.num_nodes = graph.num_nodes
+ args.num_embedding = graph.num_nodes + graph.node_feat[
+ "group_id"].shape[-1]
+ else:
+ graph = build_graph(args.num_nodes, args.edge_path, args.output_path)
+ node_feat = np.load(args.node_feat_npy)
+
+ model = GESModel(args.num_embedding, node_feat.shape[1] + 1,
+ args.hidden_size, args.neg_num, False, 2)
+ pyreader = model.pyreader
+ loss = model.forward()
+ num_devices = len(F.cuda_places())
+
+ train_steps = int(args.num_nodes * args.epoch / args.batch_size /
+ num_devices)
+ log.info("Train steps: %s" % train_steps)
+ optimization(args.lr * num_devices, loss, train_steps, args.optimizer)
+
+ place = F.CUDAPlace(0)
+ exe = F.Executor(place)
+ exe.run(F.default_startup_program())
+
+ gen_func = build_gen_func(args, graph, node_feat)
+
+ pyreader.decorate_tensor_provider(gen_func)
+ pyreader.start()
+ train_prog = F.default_main_program()
+ train_exe = get_parallel_exe(train_prog, loss)
+ train(train_exe, exe, train_prog, loss, pyreader, args, train_steps)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='GES')
+ parser.add_argument("--hidden_size", type=int, default=64)
+ parser.add_argument("--lr", type=float, default=0.025)
+ parser.add_argument("--neg_num", type=int, default=5)
+ parser.add_argument("--epoch", type=int, default=100)
+ parser.add_argument("--batch_size", type=int, default=128)
+ parser.add_argument("--walk_len", type=int, default=40)
+ parser.add_argument("--win_size", type=int, default=5)
+ parser.add_argument("--output_path", type=str, default="output")
+ parser.add_argument("--num_sample_workers", type=int, default=1)
+ parser.add_argument("--steps_per_save", type=int, default=3000)
+ parser.add_argument("--num_nodes", type=int, default=10000)
+ parser.add_argument("--num_embedding", type=int, default=10000)
+ parser.add_argument("--edge_path", type=str, default="./graph_data")
+ parser.add_argument("--walkpath_files", type=str, default=None)
+ parser.add_argument("--train_files", type=str, default="./train_data")
+ parser.add_argument("--node_feat_npy", type=str, default="./feat.npy")
+ parser.add_argument("--dataset", type=str, default=None)
+ parser.add_argument(
+ "--neg_sample_type",
+ type=str,
+ default="average",
+ choices=["average", "outdegree"])
+ parser.add_argument(
+ "--optimizer",
+ type=str,
+ required=False,
+ choices=['adam', 'sgd'],
+ default="adam")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/ges/model.py b/examples/ges/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fce05e79843495f1e4f2abc3984d6311761d3f0
--- /dev/null
+++ b/examples/ges/model.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ GES model file.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+import math
+
+import paddle.fluid.layers as L
+import paddle.fluid as F
+
+
+def split_embedding(input,
+ dict_size,
+ hidden_size,
+ initializer,
+ name,
+ num_part=16,
+ is_sparse=False,
+ learning_rate=1.0):
+ """ split_embedding
+ """
+ _part_size = hidden_size // num_part
+ if hidden_size % num_part != 0:
+ _part_size += 1
+ output_embedding = []
+ p_num = 0
+ while hidden_size > 0:
+ _part_size = min(_part_size, hidden_size)
+ hidden_size -= _part_size
+ print("part", p_num, "size=", (dict_size, _part_size))
+ part_embedding = L.embedding(
+ input=input,
+ size=(dict_size, _part_size),
+ is_sparse=is_sparse,
+ is_distributed=False,
+ param_attr=F.ParamAttr(
+ name=name + '_part%s' % p_num,
+ initializer=initializer,
+ learning_rate=learning_rate))
+ p_num += 1
+ output_embedding.append(part_embedding)
+ return L.concat(output_embedding, -1)
+
+
+class GESModel(object):
+ """ GESModel
+ """
+
+ def __init__(self,
+ num_nodes,
+ num_featuers,
+ hidden_size=16,
+ neg_num=5,
+ is_sparse=False,
+ num_part=1):
+ self.pyreader = L.py_reader(
+ capacity=70,
+ shapes=[[-1, 1, num_featuers, 1],
+ [-1, neg_num + 1, num_featuers, 1]],
+ dtypes=['int64', 'int64'],
+ lod_levels=[0, 0],
+ name='train',
+ use_double_buffer=True)
+
+ self.num_nodes = num_nodes
+ self.num_featuers = num_featuers
+ self.neg_num = neg_num
+ self.embed_init = F.initializer.TruncatedNormal(scale=1.0 /
+ math.sqrt(hidden_size))
+ self.is_sparse = is_sparse
+ self.num_part = num_part
+ self.hidden_size = hidden_size
+ self.loss = None
+
+ def forward(self):
+ """ forward
+ """
+ src, dst = L.read_file(self.pyreader)
+
+ if self.is_sparse:
+            # sparse mode uses 2-D input.
+ src = L.reshape(src, [-1, 1])
+ dst = L.reshape(dst, [-1, 1])
+
+ src_embed = split_embedding(src, self.num_nodes, self.hidden_size,
+ self.embed_init, "weight", self.num_part,
+ self.is_sparse)
+
+ dst_embed = split_embedding(dst, self.num_nodes, self.hidden_size,
+ self.embed_init, "weight", self.num_part,
+ self.is_sparse)
+
+ if self.is_sparse:
+ src_embed = L.reshape(
+ src_embed, [-1, 1, self.num_featuers, self.hidden_size])
+ dst_embed = L.reshape(
+ dst_embed,
+ [-1, self.neg_num + 1, self.num_featuers, self.hidden_size])
+
+ src_embed = L.reduce_mean(src_embed, 2)
+ dst_embed = L.reduce_mean(dst_embed, 2)
+
+ logits = L.matmul(
+ src_embed, dst_embed,
+ transpose_y=True) # [batch_size, 1, neg_num+1]
+
+ pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", 1)
+ neg_label = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 0)
+ label = L.concat([pos_label, neg_label], -1)
+
+ pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", self.neg_num)
+ neg_weight = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 1)
+ weight = L.concat([pos_weight, neg_weight], -1)
+
+ weight.stop_gradient = True
+ label.stop_gradient = True
+
+ loss = L.sigmoid_cross_entropy_with_logits(logits, label)
+ loss = loss * weight
+ loss = L.reduce_mean(loss)
+ loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
+ loss.persistable = True
+ self.loss = loss
+ return loss
+
+
+class EGESModel(GESModel):
+ """ EGESModel
+ """
+
+ def forward(self):
+ """ forward
+ """
+ src, dst = L.read_file(self.pyreader)
+
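+        # slice out the first feature column (the node id) for the per-node attention (alpha) lookup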
+ src_id = L.slice(src, [0, 1, 2, 3], [0, 0, 0, 0],
+ [int(math.pow(2, 30)) - 1, 1, 1, 1])
+ dst_id = L.slice(dst, [0, 1, 2, 3], [0, 0, 0, 0],
+ [int(math.pow(2, 30)) - 1, self.neg_num + 1, 1, 1])
+
+ if self.is_sparse:
+            # sparse mode uses 2-D input.
+ src = L.reshape(src, [-1, 1])
+ dst = L.reshape(dst, [-1, 1])
+
+ # [b, 1, f, h]
+ src_embed = split_embedding(src, self.num_nodes, self.hidden_size,
+ self.embed_init, "weight", self.num_part,
+ self.is_sparse)
+
+ # [b, n+1, f, h]
+ dst_embed = split_embedding(dst, self.num_nodes, self.hidden_size,
+ self.embed_init, "weight", self.num_part,
+ self.is_sparse)
+
+ if self.is_sparse:
+ src_embed = L.reshape(
+ src_embed, [-1, 1, self.num_featuers, self.hidden_size])
+ dst_embed = L.reshape(
+ dst_embed,
+ [-1, self.neg_num + 1, self.num_featuers, self.hidden_size])
+
+ # [b, 1, 1, f]
+ src_weight = L.softmax(
+ L.embedding(
+ src_id, [self.num_nodes, self.num_featuers],
+ param_attr=F.ParamAttr(name="alpha")))
+ # [b, n+1, 1, f]
+ dst_weight = L.softmax(
+ L.embedding(
+ dst_id, [self.num_nodes, self.num_featuers],
+ param_attr=F.ParamAttr(name="alpha")))
+
+ # [b, 1, h]
+ src_sum = L.squeeze(L.matmul(src_weight, src_embed), axes=[2])
+ # [b, n+1, h]
+ dst_sum = L.squeeze(L.matmul(dst_weight, dst_embed), axes=[2])
+
+ logits = L.matmul(
+ src_sum, dst_sum, transpose_y=True) # [batch_size, 1, neg_num+1]
+
+ pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", 1)
+ neg_label = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 0)
+ label = L.concat([pos_label, neg_label], -1)
+
+ pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
+ "float32", self.neg_num)
+ neg_weight = L.fill_constant_batch_size_like(
+ logits, [-1, 1, self.neg_num], "float32", 1)
+ weight = L.concat([pos_weight, neg_weight], -1)
+
+ weight.stop_gradient = True
+ label.stop_gradient = True
+
+ loss = L.sigmoid_cross_entropy_with_logits(logits, label)
+ loss = loss * weight
+ loss = L.reduce_mean(loss)
+ loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
+ loss.persistable = True
+ self.loss = loss
+ return loss
diff --git a/examples/ges/mp_reader.py b/examples/ges/mp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4df8998cc4ad672627dc7ba9f846c91ccca0bba
--- /dev/null
+++ b/examples/ges/mp_reader.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimized Multiprocessing Reader for PaddlePaddle
+"""
+import multiprocessing
+import numpy as np
+import time
+
+import paddle.fluid as fluid
+import pyarrow
+
+
+def _serialize_serializable(obj):
+ """Serialize Feed Dict
+ """
+ return {"type": type(obj), "data": obj.__dict__}
+
+
+def _deserialize_serializable(obj):
+ """Deserialize Feed Dict
+ """
+
+ val = obj["type"].__new__(obj["type"])
+ val.__dict__.update(obj["data"])
+ return val
+
+
+context = pyarrow.default_serialization_context()
+
+context.register_type(
+ object,
+ "object",
+ custom_serializer=_serialize_serializable,
+ custom_deserializer=_deserialize_serializable)
+
+
+def serialize_data(data):
+ """serialize_data"""
+ return pyarrow.serialize(data, context=context).to_buffer().to_pybytes()
+
+
+def deserialize_data(data):
+ """deserialize_data"""
+ return pyarrow.deserialize(data, context=context)
+
+
+def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
+ """
+    multiprocess_reader uses Python multiprocessing to read data from the
+    given readers in parallel and merges the samples through either a
+    multiprocessing.Queue or a multiprocessing.Pipe. One process is started
+    per input reader, and each process drives exactly one reader.
+    multiprocessing.Queue needs read/write access to /dev/shm, which some
+    platforms do not provide; set use_pipe=True in that case.
+    Create the readers beforehand and keep them independent of each other so
+    that every process can work on its own data.
+    An example:
+    .. code-block:: python
+        reader0 = reader(["file01", "file02"])
+        reader1 = reader(["file11", "file12"])
+        reader2 = reader(["file21", "file22"])
+        reader = multiprocess_reader([reader0, reader1, reader2],
+                                     queue_size=100, use_pipe=False)
+ """
+
+ assert type(readers) is list and len(readers) > 0
+
+ def _read_into_queue(reader, queue):
+ """read_into_queue"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None")
+ queue.put(serialize_data(sample))
+ queue.put(serialize_data(None))
+
+ def queue_reader():
+ """queue_reader"""
+ queue = multiprocessing.Queue(queue_size)
+ for reader in readers:
+ p = multiprocessing.Process(
+ target=_read_into_queue, args=(reader, queue))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ while finish_num < reader_num:
+ sample = deserialize_data(queue.get())
+ if sample is None:
+ finish_num += 1
+ else:
+ yield sample
+
+ def _read_into_pipe(reader, conn):
+ """read_into_pipe"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None!")
+ conn.send(serialize_data(sample))
+ conn.send(serialize_data(None))
+ conn.close()
+
+ def pipe_reader():
+ """pipe_reader"""
+ conns = []
+ for reader in readers:
+ parent_conn, child_conn = multiprocessing.Pipe()
+ conns.append(parent_conn)
+ p = multiprocessing.Process(
+ target=_read_into_pipe, args=(reader, child_conn))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ finish_flag = np.zeros(len(conns), dtype="int32")
+ while finish_num < reader_num:
+ for conn_id, conn in enumerate(conns):
+ if finish_flag[conn_id] > 0:
+ continue
+                buff = conn.recv()
+                sample = deserialize_data(buff)
+ if sample is None:
+ finish_num += 1
+ conn.close()
+ finish_flag[conn_id] = 1
+ else:
+ yield sample
+
+ if use_pipe:
+ return pipe_reader
+ else:
+ return queue_reader
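+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only, not used by the example scripts).
+    # It assumes pyarrow is installed and a fork-based multiprocessing start
+    # method (the default on Linux); the toy readers below are placeholders.
+    def _toy_reader(values):
+        def _reader():
+            for v in values:
+                yield np.array([v], dtype="int64")
+
+        return _reader
+
+    merged = multiprocess_reader(
+        [_toy_reader([1, 2, 3]), _toy_reader([4, 5, 6])], use_pipe=False)
+    for sample in merged():
+        print(sample)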
diff --git a/examples/ges/reader.py b/examples/ges/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd2d9d21cc47cf64e83859094379d29696b2598f
--- /dev/null
+++ b/examples/ges/reader.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Reader file.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+import time
+import io
+import os
+
+import numpy as np
+import paddle
+from pgl.utils.logger import log
+from pgl.sample import node2vec_sample
+from pgl.sample import deepwalk_sample
+from pgl.sample import alias_sample
+from pgl.graph_kernel import skip_gram_gen_pair
+from pgl.graph_kernel import alias_sample_build_table
+
+
+class GESReader(object):
+ """ GESReader
+ """
+
+ def __init__(self,
+ graph,
+ node_feat,
+ batch_size=512,
+ walk_len=40,
+ win_size=5,
+ neg_num=5,
+ train_files=None,
+ walkpath_files=None,
+ neg_sample_type="average"):
+ """
+ Args:
+            walkpath_files: if not None, read pre-generated random walks
+                from these files instead of sampling walks from the graph.
+ """
+ self.graph = graph
+ self.node_feat = node_feat
+ self.batch_size = batch_size
+ self.walk_len = walk_len
+ self.win_size = win_size
+ self.neg_num = neg_num
+ self.train_files = train_files
+ self.walkpath_files = walkpath_files
+ self.neg_sample_type = neg_sample_type
+
+ def walk_from_files(self):
+ """ walk_from_files
+ """
+ bucket = []
+ while True:
+ for filename in self.walkpath_files:
+ with io.open(filename) as inf:
+ for line in inf:
+ walk = [int(x) for x in line.strip('\n\t').split('\t')]
+ bucket.append(walk)
+ if len(bucket) == self.batch_size:
+ yield bucket
+ bucket = []
+ if len(bucket):
+ yield bucket
+
+ def walk_from_graph(self):
+ """ walk_from_graph
+ """
+
+ def node_generator():
+ """ node_generator
+ """
+ if self.train_files is None:
+ while True:
+ for nodes in self.graph.node_batch_iter(self.batch_size):
+ yield nodes
+ else:
+ nodes = []
+ while True:
+ for filename in self.train_files:
+ with io.open(filename) as inf:
+ for line in inf:
+ node = int(line.strip('\n\t'))
+ nodes.append(node)
+ if len(nodes) == self.batch_size:
+ yield nodes
+ nodes = []
+ if len(nodes):
+ yield nodes
+
+ if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
+ log.info("Deepwalk using alias sample")
+ for nodes in node_generator():
+ if "alias" in self.graph.node_feat and "events" in self.graph.node_feat:
+ walks = deepwalk_sample(self.graph, nodes, self.walk_len,
+ "alias", "events")
+ else:
+ walks = deepwalk_sample(self.graph, nodes, self.walk_len)
+ yield walks
+
+ def walk_generator(self):
+ """ walk_generator
+ """
+ if self.walkpath_files is not None:
+ for i in self.walk_from_files():
+ yield i
+ else:
+ for i in self.walk_from_graph():
+ yield i
+
+ def __call__(self):
+ np.random.seed(os.getpid())
+ if self.neg_sample_type == "outdegree":
+ outdegree = self.graph.outdegree()
+ distribution = 1. * outdegree / outdegree.sum()
+ alias, events = alias_sample_build_table(distribution)
+ max_len = int(self.batch_size * self.walk_len * (
+ (1 + self.win_size) - 0.3))
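+        # Heuristic cap on the number of (src, dst) pairs emitted per batch of
+        # walks; the feature tensors yielded below are truncated to this length.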
+ for walks in self.walk_generator():
+ src, pos = [], []
+ for walk in walks:
+ s, p = skip_gram_gen_pair(walk, self.win_size)
+                src.extend(s)
+                pos.extend(p)
+            src = np.array(src, dtype=np.int64)
+            pos = np.array(pos, dtype=np.int64)
+ src, pos = np.reshape(src, [-1, 1, 1]), np.reshape(pos, [-1, 1, 1])
+
+ if src.shape[0] == 0:
+ continue
+ neg_sample_size = [len(pos), self.neg_num, 1]
+ if self.neg_sample_type == "average":
+ negs = self.graph.sample_nodes(neg_sample_size)
+ elif self.neg_sample_type == "outdegree":
+ negs = alias_sample(neg_sample_size, alias, events)
+ # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
+ dst = np.concatenate([pos, negs], 1)
+ src_feat = np.concatenate([src, self.node_feat[src[:, :, 0]]], -1)
+ dst_feat = np.concatenate([dst, self.node_feat[dst[:, :, 0]]], -1)
+ src_feat, dst_feat = np.expand_dims(src_feat, -1), np.expand_dims(
+ dst_feat, -1)
+ yield src_feat[:max_len], dst_feat[:max_len]
diff --git a/examples/graphsage/README.md b/examples/graphsage/README.md
index 6eb17f6cfec52194cf499d6a1273dc0e9b89d555..449aaf8c72112eb979ec916f104a0c29b06f1064 100644
--- a/examples/graphsage/README.md
+++ b/examples/graphsage/README.md
@@ -12,17 +12,23 @@ The reddit dataset should be downloaded from the following links and placed in d
### Dependencies
-- sklearn
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### How to run
To train a GraphSAGE model on Reddit Dataset, you can just run
+
```
python train.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --normalize --symmetry
```
+To train a GraphSAGE model with multiple GPUs, run
+
+```
+CUDA_VISIBLE_DEVICES=0,1 python train_multi.py --use_cuda --epoch 10 --graphsage_type graphsage_mean --normalize --symmetry --num_trainer 2
+```
+
#### Hyperparameters
- epoch: Number of epochs default (10)
diff --git a/examples/graphsage/reader.py b/examples/graphsage/reader.py
index f05bfaba2837840e991000d14f18671d3fff6bcb..abc3fe8cbe8bdd380261ffde8933871b1a6094c9 100644
--- a/examples/graphsage/reader.py
+++ b/examples/graphsage/reader.py
@@ -17,12 +17,15 @@ import paddle
import paddle.fluid as fluid
import pgl
import time
+from pgl.utils import mp_reader
from pgl.utils.logger import log
import train
import time
def node_batch_iter(nodes, node_label, batch_size):
+ """node_batch_iter
+ """
perm = np.arange(len(nodes))
np.random.shuffle(perm)
start = 0
@@ -33,6 +36,8 @@ def node_batch_iter(nodes, node_label, batch_size):
def traverse(item):
+ """traverse
+ """
if isinstance(item, list) or isinstance(item, np.ndarray):
for i in iter(item):
for j in traverse(i):
@@ -42,13 +47,21 @@ def traverse(item):
def flat_node_and_edge(nodes, eids):
+ """flat_node_and_edge
+ """
nodes = list(set(traverse(nodes)))
eids = list(set(traverse(eids)))
return nodes, eids
-def worker(batch_info, graph, samples):
+def worker(batch_info, graph, graph_wrapper, samples):
+ """Worker
+ """
+
def work():
+ """work
+ """
for batch_train_samples, batch_train_labels in batch_info:
start_nodes = batch_train_samples
nodes = start_nodes
@@ -65,11 +78,14 @@ def worker(batch_info, graph, samples):
if len(start_nodes) == 0:
break
- feed_dict = {}
- feed_dict["nodes"] = [int(n) for n in nodes]
- feed_dict["eids"] = [int(e) for e in eids]
- feed_dict["node_label"] = [int(n) for n in batch_train_labels]
- feed_dict["node_index"] = [int(n) for n in batch_train_samples]
+ subgraph = graph.subgraph(nodes=nodes, eid=eids)
+ sub_node_index = subgraph.reindex_from_parrent_nodes(
+ batch_train_samples)
+ feed_dict = graph_wrapper.to_feed(subgraph)
+ feed_dict["node_label"] = np.expand_dims(
+ np.array(
+ batch_train_labels, dtype="int64"), -1)
+ feed_dict["node_index"] = sub_node_index
yield feed_dict
return work
@@ -82,26 +98,28 @@ def multiprocess_graph_reader(graph,
batch_size,
node_label,
num_workers=4):
+ """multiprocess_graph_reader
+ """
+
def parse_to_subgraph(rd):
+ """parse_to_subgraph
+ """
+
def work():
+ """work
+ """
for data in rd():
- nodes = data["nodes"]
- eids = data["eids"]
- batch_train_labels = data["node_label"]
- batch_train_samples = data["node_index"]
- subgraph = graph.subgraph(nodes=nodes, eid=eids)
- sub_node_index = subgraph.reindex_from_parrent_nodes(
- batch_train_samples)
- feed_dict = graph_wrapper.to_feed(subgraph)
- feed_dict["node_label"] = np.expand_dims(
- np.array(
- batch_train_labels, dtype="int64"), -1)
- feed_dict["node_index"] = sub_node_index
+            feed_dict = data
yield feed_dict
return work
def reader():
+ """reader"""
batch_info = list(
node_batch_iter(
node_index, node_label, batch_size=batch_size))
@@ -110,9 +128,9 @@ def multiprocess_graph_reader(graph,
for i in range(num_workers):
reader_pool.append(
worker(batch_info[block_size * i:block_size * (i + 1)], graph,
- samples))
- multi_process_sample = paddle.reader.multiprocess_reader(
- reader_pool, use_pipe=False)
+ graph_wrapper, samples))
+ multi_process_sample = mp_reader.multiprocess_reader(
+ reader_pool, use_pipe=True, queue_size=1000)
r = parse_to_subgraph(multi_process_sample)
return paddle.reader.buffered(r, 1000)
@@ -121,7 +139,10 @@ def multiprocess_graph_reader(graph,
def graph_reader(graph, graph_wrapper, samples, node_index, batch_size,
node_label):
+ """graph_reader"""
+
def reader():
+ """reader"""
for batch_train_samples, batch_train_labels in node_batch_iter(
node_index, node_label, batch_size=batch_size):
start_nodes = batch_train_samples
diff --git a/examples/graphsage/train.py b/examples/graphsage/train.py
index 5d5bca2728aa23fa8589c53ba3704a4f386a1019..fa19463a5222a306b72686ee47d3d18772808a93 100644
--- a/examples/graphsage/train.py
+++ b/examples/graphsage/train.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import argparse
import time
@@ -34,8 +35,9 @@ def load_data(normalize=True, symmetry=True):
reddit_adj.npz: https://drive.google.com/open?id=174vb0Ws7Vxk_QTUtxqTgDHSQ4El4qDHt
reddit.npz: https://drive.google.com/open?id=19SphVl_Oe8SJ1r87Hr5a6znx3nJu1F2J
"""
- data = np.load("data/reddit.npz")
- adj = sp.load_npz("data/reddit_adj.npz")
+ data_dir = os.path.dirname(os.path.abspath(__file__))
+ data = np.load(os.path.join(data_dir, "data/reddit.npz"))
+ adj = sp.load_npz(os.path.join(data_dir, "data/reddit_adj.npz"))
if symmetry:
adj = adj + adj.T
adj = adj.tocoo()
@@ -64,7 +66,7 @@ def load_data(normalize=True, symmetry=True):
num_nodes=feature.shape[0],
edges=list(zip(src, dst)),
node_feat={"index": np.arange(
- 0, len(feature), dtype="int32")})
+ 0, len(feature), dtype="int64")})
return {
"graph": graph,
@@ -82,7 +84,7 @@ def load_data(normalize=True, symmetry=True):
def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
hidden_size, feature):
node_index = fluid.layers.data(
- "node_index", shape=[None], dtype="int32", append_batch_size=False)
+ "node_index", shape=[None], dtype="int64", append_batch_size=False)
node_label = fluid.layers.data(
"node_label", shape=[None, 1], dtype="int64", append_batch_size=False)
@@ -198,7 +200,9 @@ def main(args):
hide_batch_size=False)
graph_wrapper = pgl.graph_wrapper.GraphWrapper(
- "sub_graph", place, node_feat=data['graph'].node_feat_info())
+ "sub_graph",
+ fluid.CPUPlace(),
+ node_feat=data['graph'].node_feat_info())
model_loss, model_acc = build_graph_model(
graph_wrapper,
num_class=data["num_class"],
diff --git a/examples/graphsage/train_multi.py b/examples/graphsage/train_multi.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a1e799312e25ff2314de326ea3880ca0970b708
--- /dev/null
+++ b/examples/graphsage/train_multi.py
@@ -0,0 +1,369 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import time
+
+import sys
+import traceback
+import numpy as np
+import scipy.sparse as sp
+from sklearn.preprocessing import StandardScaler
+
+import pgl
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle
+import paddle.fluid as fluid
+import reader
+from model import graphsage_mean, graphsage_meanpool,\
+ graphsage_maxpool, graphsage_lstm
+
+
+def load_data(normalize=True, symmetry=True):
+ """
+ data from https://github.com/matenure/FastGCN/issues/8
+ reddit_adj.npz: https://drive.google.com/open?id=174vb0Ws7Vxk_QTUtxqTgDHSQ4El4qDHt
+ reddit.npz: https://drive.google.com/open?id=19SphVl_Oe8SJ1r87Hr5a6znx3nJu1F2J
+ """
+ data_dir = os.path.dirname(os.path.abspath(__file__))
+ data = np.load(os.path.join(data_dir, "data/reddit.npz"))
+ adj = sp.load_npz(os.path.join(data_dir, "data/reddit_adj.npz"))
+ if symmetry:
+ adj = adj + adj.T
+ adj = adj.tocoo()
+ src = adj.row
+ dst = adj.col
+
+ num_class = 41
+
+ train_label = data['y_train']
+ val_label = data['y_val']
+ test_label = data['y_test']
+
+ train_index = data['train_index']
+ val_index = data['val_index']
+ test_index = data['test_index']
+
+ feature = data["feats"].astype("float32")
+
+ if normalize:
+ scaler = StandardScaler()
+ scaler.fit(feature[train_index])
+ feature = scaler.transform(feature)
+
+ log.info("Feature shape %s" % (repr(feature.shape)))
+ graph = pgl.graph.Graph(
+ num_nodes=feature.shape[0],
+ edges=list(zip(src, dst)),
+ node_feat={"feat": feature.astype("float32")})
+
+ return {
+ "graph": graph,
+ "train_index": train_index,
+ "train_label": train_label,
+ "val_label": val_label,
+ "val_index": val_index,
+ "test_index": test_index,
+ "test_label": test_label,
+ "num_class": 41
+ }
+
+
+def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
+ hidden_size):
+ """build_graph_model"""
+ node_index = fluid.layers.data(
+ "node_index", shape=[None], dtype="int64", append_batch_size=False)
+
+ node_label = fluid.layers.data(
+ "node_label", shape=[None, 1], dtype="int64", append_batch_size=False)
+
+ feature = graph_wrapper.node_feat["feat"]
+
+ for i in range(k_hop):
+ if graphsage_type == 'graphsage_mean':
+ feature = graphsage_mean(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_mean_%s" % i)
+ elif graphsage_type == 'graphsage_meanpool':
+ feature = graphsage_meanpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_meanpool_%s" % i)
+ elif graphsage_type == 'graphsage_maxpool':
+ feature = graphsage_maxpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s" % i)
+ elif graphsage_type == 'graphsage_lstm':
+ feature = graphsage_lstm(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s" % i)
+ else:
+ raise ValueError("graphsage type %s is not"
+ " implemented" % graphsage_type)
+
+ feature = fluid.layers.gather(feature, node_index)
+ logits = fluid.layers.fc(feature,
+ num_class,
+ act=None,
+ name='classification_layer')
+ proba = fluid.layers.softmax(logits)
+
+ loss = fluid.layers.softmax_with_cross_entropy(
+ logits=logits, label=node_label)
+ loss = fluid.layers.mean(loss)
+ acc = fluid.layers.accuracy(input=proba, label=node_label, k=1)
+ return loss, acc
+
+
+def to_multidevice(batch_iter, num_trainer):
+ """to_multidevice"""
+ batch_dict = []
+ for batch in batch_iter():
+ batch_dict.append(batch)
+ if len(batch_dict) == num_trainer:
+ yield batch_dict
+ batch_dict = []
+
+ if len(batch_dict) > 0:
+ log.warning("The batch (%s) can't fill all device (%s)"
+ "which will be discarded." %
+ (len(batch_dict), num_trainer))
+
+
+def run_epoch(batch_iter,
+ exe,
+ program,
+ prefix,
+ model_loss,
+ model_acc,
+ epoch,
+ log_per_step=100,
+ num_trainer=1):
+ """run_epoch"""
+ batch = 0
+ total_loss = 0.
+ total_acc = 0.
+ total_sample = 0
+ start = time.time()
+ if num_trainer > 1:
+ batch_iter = to_multidevice(batch_iter, num_trainer)
+ else:
+ batch_iter = batch_iter()
+
+ for batch_feed_dict in batch_iter:
+ batch += 1
+ if num_trainer > 1:
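+            # With more than one trainer, exe is a ParallelExecutor: the feed
+            # is a list of feed dicts (one per device) and the fetched
+            # loss/acc come back per device, hence the np.mean below.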
+ batch_loss, batch_acc = exe.run(
+ fetch_list=[model_loss.name, model_acc.name],
+ feed=batch_feed_dict)
+
+ batch_loss = np.mean(batch_loss)
+ batch_acc = np.mean(batch_acc)
+ else:
+ batch_loss, batch_acc = exe.run(
+ program,
+ fetch_list=[model_loss.name, model_acc.name],
+ feed=batch_feed_dict)
+
+ if batch % log_per_step == 0:
+ log.info("Batch %s %s-Loss %s %s-Acc %s" %
+ (batch, prefix, batch_loss, prefix, batch_acc))
+
+ if num_trainer > 1:
+ num_samples = sum(
+ [len(batch["node_index"]) for batch in batch_feed_dict])
+ else:
+ num_samples = len(batch_feed_dict["node_index"])
+ total_loss += batch_loss * num_samples
+ total_acc += batch_acc * num_samples
+ total_sample += num_samples
+ end = time.time()
+
+ log.info("%s Epoch %s Loss %.5lf Acc %.5lf Speed(per batch) %.5lf sec" %
+ (prefix, epoch, total_loss / total_sample,
+ total_acc / total_sample, (end - start) / batch))
+
+
+def main(args):
+ """main"""
+ data = load_data(args.normalize, args.symmetry)
+ log.info("preprocess finish")
+ log.info("Train Examples: %s" % len(data["train_index"]))
+ log.info("Val Examples: %s" % len(data["val_index"]))
+ log.info("Test Examples: %s" % len(data["test_index"]))
+ log.info("Num nodes %s" % data["graph"].num_nodes)
+ log.info("Num edges %s" % data["graph"].num_edges)
+ log.info("Average Degree %s" % np.mean(data["graph"].indegree()))
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ samples = []
+ if args.samples_1 > 0:
+ samples.append(args.samples_1)
+ if args.samples_2 > 0:
+ samples.append(args.samples_2)
+
+ with fluid.program_guard(train_program, startup_program):
+ graph_wrapper = pgl.graph_wrapper.GraphWrapper(
+ "sub_graph",
+ fluid.CPUPlace(),
+ node_feat=data['graph'].node_feat_info())
+
+ model_loss, model_acc = build_graph_model(
+ graph_wrapper,
+ num_class=data["num_class"],
+ hidden_size=args.hidden_size,
+ graphsage_type=args.graphsage_type,
+ k_hop=len(samples))
+
+ test_program = train_program.clone(for_test=True)
+
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=args.lr)
+ adam.minimize(model_loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+ if args.num_trainer > 1:
+ build_strategy = fluid.BuildStrategy()
+ build_strategy.remove_unnecessary_lock = False
+ build_strategy.enable_sequential_execution = True
+
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=args.use_cuda,
+ main_program=train_program,
+ build_strategy=build_strategy,
+ loss_name=model_loss.name)
+ else:
+ train_exe = exe
+
+ if args.sample_workers > 1:
+ train_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['train_index'],
+ node_label=data["train_label"])
+ else:
+ train_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['train_index'],
+ node_label=data["train_label"])
+
+ if args.sample_workers > 1:
+ val_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['val_index'],
+ node_label=data["val_label"])
+ else:
+ val_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['val_index'],
+ node_label=data["val_label"])
+
+ if args.sample_workers > 1:
+ test_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['test_index'],
+ node_label=data["test_label"])
+ else:
+ test_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['test_index'],
+ node_label=data["test_label"])
+
+ for epoch in range(args.epoch):
+ run_epoch(
+ train_iter,
+ program=train_program,
+ exe=train_exe,
+ prefix="train",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ num_trainer=args.num_trainer,
+ epoch=epoch)
+
+ run_epoch(
+ val_iter,
+ program=test_program,
+ exe=exe,
+ prefix="val",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+ run_epoch(
+ test_iter,
+ program=test_program,
+ prefix="test",
+ exe=exe,
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='graphsage')
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument(
+ "--normalize", action='store_true', help="normalize features")
+ parser.add_argument(
+ "--symmetry", action='store_true', help="undirect graph")
+ parser.add_argument("--graphsage_type", type=str, default="graphsage_mean")
+ parser.add_argument("--sample_workers", type=int, default=5)
+ parser.add_argument("--epoch", type=int, default=10)
+ parser.add_argument("--hidden_size", type=int, default=128)
+ parser.add_argument("--batch_size", type=int, default=128)
+ parser.add_argument("--num_trainer", type=int, default=1)
+ parser.add_argument("--lr", type=float, default=0.01)
+ parser.add_argument("--samples_1", type=int, default=25)
+ parser.add_argument("--samples_2", type=int, default=10)
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/graphsage/train_scale.py b/examples/graphsage/train_scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..31a615ff5515d54af7589389ebf7bf5a925fa6cb
--- /dev/null
+++ b/examples/graphsage/train_scale.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Multi-GPU settings
+"""
+import argparse
+import time
+
+import numpy as np
+import scipy.sparse as sp
+from sklearn.preprocessing import StandardScaler
+
+import pgl
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle
+import paddle.fluid as fluid
+import reader
+from model import graphsage_mean, graphsage_meanpool,\
+ graphsage_maxpool, graphsage_lstm
+
+
+def fixed_offset(data, num_nodes, scale):
+ """Test
+ """
+ len_data = len(data)
+ len_per_part = int(len_data / scale)
+ offset = np.arange(0, scale, dtype="int64")
+ offset = offset * num_nodes
+ offset = np.repeat(offset, len_per_part)
+ if len(data.shape) > 1:
+ data += offset.reshape([-1, 1])
+ else:
+ data += offset
+
+
+def load_data(normalize=True, symmetry=True, scale=1):
+ """
+ data from https://github.com/matenure/FastGCN/issues/8
+ reddit_adj.npz: https://drive.google.com/open?id=174vb0Ws7Vxk_QTUtxqTgDHSQ4El4qDHt
+ reddit.npz: https://drive.google.com/open?id=19SphVl_Oe8SJ1r87Hr5a6znx3nJu1F2J
+ """
+ data = np.load("data/reddit.npz")
+ adj = sp.load_npz("data/reddit_adj.npz")
+ if symmetry:
+ adj = adj + adj.T
+ adj = adj.tocoo()
+ src = adj.row.reshape([-1, 1])
+ dst = adj.col.reshape([-1, 1])
+ edges = np.hstack([src, dst])
+
+ num_class = 41
+
+ train_label = data['y_train']
+ val_label = data['y_val']
+ test_label = data['y_test']
+
+ train_index = data['train_index']
+ val_index = data['val_index']
+ test_index = data['test_index']
+
+ feature = data["feats"].astype("float32")
+
+ if normalize:
+ scaler = StandardScaler()
+ scaler.fit(feature[train_index])
+ feature = scaler.transform(feature)
+
+ if scale > 1:
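+        # Replicate the graph `scale` times to stress-test training at a
+        # larger size; fixed_offset shifts the ids of each copy so the
+        # replicas stay disjoint.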
+ num_nodes = feature.shape[0]
+ feature = np.tile(feature, [scale, 1])
+ train_label = np.tile(train_label, [scale])
+ val_label = np.tile(val_label, [scale])
+ test_label = np.tile(test_label, [scale])
+ edges = np.tile(edges, [scale, 1])
+ fixed_offset(edges, num_nodes, scale)
+ train_index = np.tile(train_index, [scale])
+ fixed_offset(train_index, num_nodes, scale)
+ val_index = np.tile(val_index, [scale])
+ fixed_offset(val_index, num_nodes, scale)
+ test_index = np.tile(test_index, [scale])
+ fixed_offset(test_index, num_nodes, scale)
+
+ log.info("Feature shape %s" % (repr(feature.shape)))
+
+ graph = pgl.graph.Graph(
+ num_nodes=feature.shape[0],
+ edges=edges,
+ node_feat={
+ "index": np.arange(
+ 0, len(feature), dtype="int64"),
+ "feature": feature
+ })
+
+ return {
+ "graph": graph,
+ "train_index": train_index,
+ "train_label": train_label,
+ "val_label": val_label,
+ "val_index": val_index,
+ "test_index": test_index,
+ "test_label": test_label,
+ "feature": feature,
+ "num_class": 41
+ }
+
+
+def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
+ hidden_size, feature):
+ """Test"""
+ node_index = fluid.layers.data(
+ "node_index", shape=[None], dtype="int64", append_batch_size=False)
+
+ node_label = fluid.layers.data(
+ "node_label", shape=[None, 1], dtype="int64", append_batch_size=False)
+
+ for i in range(k_hop):
+ if graphsage_type == 'graphsage_mean':
+ feature = graphsage_mean(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_mean_%s % i")
+ elif graphsage_type == 'graphsage_meanpool':
+ feature = graphsage_meanpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_meanpool_%s % i")
+ elif graphsage_type == 'graphsage_maxpool':
+ feature = graphsage_maxpool(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s % i")
+ elif graphsage_type == 'graphsage_lstm':
+ feature = graphsage_lstm(
+ graph_wrapper,
+ feature,
+ hidden_size,
+ act="relu",
+ name="graphsage_maxpool_%s % i")
+ else:
+ raise ValueError("graphsage type %s is not"
+ " implemented" % graphsage_type)
+
+ feature = fluid.layers.gather(feature, node_index)
+ logits = fluid.layers.fc(feature,
+ num_class,
+ act=None,
+ name='classification_layer')
+ proba = fluid.layers.softmax(logits)
+
+ loss = fluid.layers.softmax_with_cross_entropy(
+ logits=logits, label=node_label)
+ loss = fluid.layers.mean(loss)
+ acc = fluid.layers.accuracy(input=proba, label=node_label, k=1)
+ return loss, acc
+
+
+def run_epoch(batch_iter,
+ exe,
+ program,
+ prefix,
+ model_loss,
+ model_acc,
+ epoch,
+ log_per_step=100):
+ """Test"""
+ batch = 0
+ total_loss = 0.
+ total_acc = 0.
+ total_sample = 0
+ start = time.time()
+ for batch_feed_dict in batch_iter():
+ batch += 1
+ batch_loss, batch_acc = exe.run(program,
+ fetch_list=[model_loss, model_acc],
+ feed=batch_feed_dict)
+
+ if batch % log_per_step == 0:
+ log.info("Batch %s %s-Loss %s %s-Acc %s" %
+ (batch, prefix, batch_loss, prefix, batch_acc))
+
+ num_samples = len(batch_feed_dict["node_index"])
+ total_loss += batch_loss * num_samples
+ total_acc += batch_acc * num_samples
+ total_sample += num_samples
+ end = time.time()
+
+ log.info("%s Epoch %s Loss %.5lf Acc %.5lf Speed(per batch) %.5lf sec" %
+ (prefix, epoch, total_loss / total_sample,
+ total_acc / total_sample, (end - start) / batch))
+
+
+def main(args):
+ """Test """
+ data = load_data(args.normalize, args.symmetry, args.scale)
+ log.info("preprocess finish")
+ log.info("Train Examples: %s" % len(data["train_index"]))
+ log.info("Val Examples: %s" % len(data["val_index"]))
+ log.info("Test Examples: %s" % len(data["test_index"]))
+ log.info("Num nodes %s" % data["graph"].num_nodes)
+ log.info("Num edges %s" % data["graph"].num_edges)
+ log.info("Average Degree %s" % np.mean(data["graph"].indegree()))
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+
+ samples = []
+ if args.samples_1 > 0:
+ samples.append(args.samples_1)
+ if args.samples_2 > 0:
+ samples.append(args.samples_2)
+
+ with fluid.program_guard(train_program, startup_program):
+ graph_wrapper = pgl.graph_wrapper.GraphWrapper(
+ "sub_graph",
+ fluid.CPUPlace(),
+ node_feat=data['graph'].node_feat_info())
+
+ model_loss, model_acc = build_graph_model(
+ graph_wrapper,
+ num_class=data["num_class"],
+ feature=graph_wrapper.node_feat["feature"],
+ hidden_size=args.hidden_size,
+ graphsage_type=args.graphsage_type,
+ k_hop=len(samples))
+
+ test_program = train_program.clone(for_test=True)
+
+ if args.sample_workers > 1:
+ train_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['train_index'],
+ node_label=data["train_label"])
+ else:
+ train_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['train_index'],
+ node_label=data["train_label"])
+
+ if args.sample_workers > 1:
+ val_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['val_index'],
+ node_label=data["val_label"])
+ else:
+ val_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['val_index'],
+ node_label=data["val_label"])
+
+ if args.sample_workers > 1:
+ test_iter = reader.multiprocess_graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ num_workers=args.sample_workers,
+ batch_size=args.batch_size,
+ node_index=data['test_index'],
+ node_label=data["test_label"])
+ else:
+ test_iter = reader.graph_reader(
+ data['graph'],
+ graph_wrapper,
+ samples=samples,
+ batch_size=args.batch_size,
+ node_index=data['test_index'],
+ node_label=data["test_label"])
+
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=args.lr)
+ adam.minimize(model_loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ for epoch in range(args.epoch):
+ run_epoch(
+ train_iter,
+ program=train_program,
+ exe=exe,
+ prefix="train",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ epoch=epoch)
+
+ run_epoch(
+ val_iter,
+ program=test_program,
+ exe=exe,
+ prefix="val",
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+ run_epoch(
+ test_iter,
+ program=test_program,
+ prefix="test",
+ exe=exe,
+ model_loss=model_loss,
+ model_acc=model_acc,
+ log_per_step=10000,
+ epoch=epoch)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='graphsage')
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument(
+ "--normalize", action='store_true', help="normalize features")
+ parser.add_argument(
+ "--symmetry", action='store_true', help="undirect graph")
+ parser.add_argument("--graphsage_type", type=str, default="graphsage_mean")
+ parser.add_argument("--sample_workers", type=int, default=5)
+ parser.add_argument("--epoch", type=int, default=10)
+ parser.add_argument("--hidden_size", type=int, default=128)
+ parser.add_argument("--batch_size", type=int, default=128)
+ parser.add_argument("--lr", type=float, default=0.01)
+ parser.add_argument("--samples_1", type=int, default=25)
+ parser.add_argument("--samples_2", type=int, default=10)
+ parser.add_argument("--scale", type=int, default=1)
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/line/README.md b/examples/line/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..abbe0d4e28e6ae47e72e6aeec6a7b70f04ae988d
--- /dev/null
+++ b/examples/line/README.md
@@ -0,0 +1,55 @@
+# PGL Examples for LINE
+[LINE](http://www.www2015.it/documents/proceedings/proceedings/p1067.pdf) is an algorithmic framework for embedding very large-scale information networks. It is suitable to a variety of networks including directed, undirected, binary or weighted edges. Based on PGL, we reproduce LINE algorithms and reach the same level of indicators as the paper.
+
+## Datasets
+[Flickr network](http://socialnetworks.mpi-sws.org/data-imc2007.html) is a social network with 1,715,256 nodes and 22,613,981 edges.
+
+You can download the data from [here](http://socialnetworks.mpi-sws.org/data-imc2007.html).
+
+Flickr network contains four files:
+* flickr-groupmemberships.txt.gz
+* flickr-groups.txt.gz
+* flickr-links.txt.gz
+* flickr-users.txt.gz
+
+After downloading the data, uncompress the files into a directory such as **./data/flickr/**, relative to the root directory of the LINE example.
+
+Then run the command below to preprocess the data:
+```sh
+python data_process.py
+```
+
+This produces three files in the **./data/flickr/** directory:
+* nodes.txt
+* edges.txt
+* nodes_label.txt
+
+
+## Dependencies
+- paddlepaddle>=1.6
+- pgl
+
+## How to run
+
+For example, to train LINE on the Flickr dataset with a GPU:
+```sh
+# multiclass task example
+python line.py --use_cuda --order first_order --data_path ./data/flickr/ --save_dir ./checkpoints/model/
+
+python multi_class.py --ckpt_path ./checkpoints/model/model_epoch_20 --percent 0.5
+
+```
+
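+Similarly, the second-order results reported below can be reproduced by switching the `--order` flag (the paths are the defaults used in this example):
+
+```sh
+python line.py --use_cuda --order second_order --data_path ./data/flickr/ --save_dir ./checkpoints/model/
+
+python multi_class.py --ckpt_path ./checkpoints/model/model_epoch_20 --percent 0.5
+```
+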
+## Hyperparameters
+
+- use_cuda: Use GPU if set.
+- order: Train LINE with first-order or second-order proximity (`first_order` or `second_order`).
+- percent: The percentage of labeled nodes used as training data in the multi-class task.
+
+### Experiment results
+Dataset|model|Task|Metric|PGL Result|Reported Result
+--|--|--|--|--|--
+Flickr|LINE with first_order|multi-label classification|MacroF1|0.626|0.627
+Flickr|LINE with first_order|multi-label classification|MicroF1|0.637|0.639
+Flickr|LINE with second_order|multi-label classification|MacroF1|0.615|0.621
+Flickr|LINE with second_order|multi-label classification|MicroF1|0.630|0.635
diff --git a/examples/line/data_loader.py b/examples/line/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..58a118aa6a5d0839480fc84538b9b760a4fccc82
--- /dev/null
+++ b/examples/line/data_loader.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file provides the Dataset for LINE model.
+"""
+import os
+import io
+import sys
+import numpy as np
+
+from pgl import graph
+from pgl.utils.logger import log
+
+
+class FlickrDataset(object):
+ """Flickr dataset implementation
+
+ Args:
+ name: The name of the dataset.
+
+ symmetry_edges: Whether to create symmetry edges.
+
+ self_loop: Whether to contain self loop edges.
+
+        train_percentage: The percentage of nodes used for training in the multi-class task.
+
+ Attributes:
+ graph: The :code:`Graph` data object.
+
+ num_groups: Number of classes.
+
+ train_index: The index for nodes in training set.
+
+ test_index: The index for nodes in validation set.
+ """
+
+ def __init__(self,
+ data_path,
+ symmetry_edges=False,
+ self_loop=False,
+ train_percentage=0.5):
+ self.path = data_path
+ # self.name = name
+ self.num_groups = 5
+ self.symmetry_edges = symmetry_edges
+ self.self_loop = self_loop
+ self.train_percentage = train_percentage
+ self._load_data()
+
+ def _load_data(self):
+ edge_path = os.path.join(self.path, 'edges.txt')
+ node_path = os.path.join(self.path, 'nodes.txt')
+ nodes_label_path = os.path.join(self.path, 'nodes_label.txt')
+
+ all_edges = []
+ edges_weight = []
+
+ with io.open(node_path) as inf:
+ num_nodes = len(inf.readlines())
+
+ node_feature = np.zeros((num_nodes, self.num_groups))
+
+ with io.open(nodes_label_path) as inf:
+ for line in inf:
+ # group_id means the label of the node
+ node_id, group_id = line.strip('\n').split(',')
+ node_id = int(node_id) - 1
+ labels = group_id.split(' ')
+ for i in labels:
+ node_feature[node_id][int(i) - 1] = 1
+
+ node_degree_list = [1 for _ in range(num_nodes)]
+
+ with io.open(edge_path) as inf:
+ for line in inf:
+ items = line.strip().split('\t')
+ if len(items) == 2:
+ u, v = int(items[0]), int(items[1])
+ weight = 1 # binary weight, default set to 1
+ else:
+                    u, v, weight = int(items[0]), int(items[1]), float(
+                        items[2])
+ u, v = u - 1, v - 1
+ all_edges.append((u, v))
+ edges_weight.append(weight)
+
+ if self.symmetry_edges:
+ all_edges.append((v, u))
+ edges_weight.append(weight)
+
+ # sum the weights of the same node as the outdegree
+ node_degree_list[u] += weight
+
+ if self.self_loop:
+ for i in range(num_nodes):
+ all_edges.append((i, i))
+ edges_weight.append(1.)
+
+ all_edges = list(set(all_edges))
+ self.graph = graph.Graph(
+ num_nodes=num_nodes,
+ edges=all_edges,
+ node_feat={"group_id": node_feature})
+
+ perm = np.arange(0, num_nodes)
+ np.random.shuffle(perm)
+ train_num = int(num_nodes * self.train_percentage)
+ self.train_index = perm[:train_num]
+ self.test_index = perm[train_num:]
+
+ edge_distribution = np.array(edges_weight, dtype=np.float32)
+ self.edge_distribution = edge_distribution / np.sum(edge_distribution)
+ self.edge_sampling = AliasSampling(prob=edge_distribution)
+
+ node_dist = np.array(node_degree_list, dtype=np.float32)
+ node_negative_distribution = np.power(node_dist, 0.75)
+ self.node_negative_distribution = node_negative_distribution / np.sum(
+ node_negative_distribution)
+ self.node_sampling = AliasSampling(prob=node_negative_distribution)
+
+ self.node_index = {}
+ self.node_index_reversed = {}
+ for index, e in enumerate(self.graph.edges):
+ self.node_index[e[0]] = index
+ self.node_index_reversed[index] = e[0]
+
+ def fetch_batch(self,
+ batch_size=16,
+ K=10,
+ edge_sampling='alias',
+ node_sampling='alias'):
+ """Fetch batch data from dataset.
+ """
+ if edge_sampling == 'numpy':
+ edge_batch_index = np.random.choice(
+ self.graph.num_edges,
+ size=batch_size,
+ p=self.edge_distribution)
+ elif edge_sampling == 'alias':
+ edge_batch_index = self.edge_sampling.sampling(batch_size)
+ elif edge_sampling == 'uniform':
+ edge_batch_index = np.random.randint(
+ 0, self.graph.num_edges, size=batch_size)
+ u_i = []
+ u_j = []
+ label = []
+ for edge_index in edge_batch_index:
+ edge = self.graph.edges[edge_index]
+ u_i.append(edge[0])
+ u_j.append(edge[1])
+ label.append(1)
+ for i in range(K):
+ while True:
+ if node_sampling == 'numpy':
+ negative_node = np.random.choice(
+ self.graph.num_nodes,
+ p=self.node_negative_distribution)
+ elif node_sampling == 'alias':
+ negative_node = self.node_sampling.sampling()
+ elif node_sampling == 'uniform':
+ negative_node = np.random.randint(0,
+ self.graph.num_nodes)
+
+ # make sure the sampled node has no edge with the source node
+ if not self.graph.has_edges_between(
+ np.array(
+ [self.node_index_reversed[negative_node]]),
+ np.array([self.node_index_reversed[edge[0]]])):
+ break
+ u_i.append(edge[0])
+ u_j.append(negative_node)
+ label.append(-1)
+ u_i = np.array([u_i], dtype=np.int64).T
+ u_j = np.array([u_j], dtype=np.int64).T
+ label = np.array(label, dtype=np.float32)
+ return u_i, u_j, label
+
+
+class AliasSampling:
+ """Implemention of Alias-Method
+
+ This is an implementation of Alias-Method for sampling efficiently from
+ a discrete probability distribution.
+
+ Reference: https://en.wikipedia.org/wiki/Alias_method
+
+ Args:
+ prob: The discrete probability distribution.
+
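+    Example (illustrative):
+
+        .. code-block:: python
+
+            sampler = AliasSampling(prob=[0.5, 0.3, 0.2])
+            idx = sampler.sampling()      # one index drawn from {0, 1, 2}
+            batch = sampler.sampling(64)  # a list of 64 indices
+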
+ """
+
+ def __init__(self, prob):
+ self.n = len(prob)
+ self.U = np.array(prob) * self.n
+ self.K = [i for i in range(len(prob))]
+ overfull, underfull = [], []
+ for i, U_i in enumerate(self.U):
+ if U_i > 1:
+ overfull.append(i)
+ elif U_i < 1:
+ underfull.append(i)
+ while len(overfull) and len(underfull):
+ i, j = overfull.pop(), underfull.pop()
+ self.K[j] = i
+ self.U[i] = self.U[i] - (1 - self.U[j])
+ if self.U[i] > 1:
+ overfull.append(i)
+ elif self.U[i] < 1:
+ underfull.append(i)
+
+ def sampling(self, n=1):
+ """Sampling.
+ """
+ x = np.random.rand(n)
+ i = np.floor(self.n * x)
+ y = self.n * x - i
+ i = i.astype(np.int64)
+ res = [i[k] if y[k] < self.U[i[k]] else self.K[i[k]] for k in range(n)]
+ if n == 1:
+ return res[0]
+ else:
+ return res
diff --git a/examples/line/data_process.py b/examples/line/data_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..d81a5c7e2678e75c8562f840ac857321794fe003
--- /dev/null
+++ b/examples/line/data_process.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file preprocess the FlickrDataset for LINE model.
+"""
+import argparse
+import operator
+import os
+
+
+def process_data(groupsMemberships_file, flickr_links_file, users_label_file,
+ edges_file, users_file):
+ """Preprocess flickr network dataset.
+
+ Args:
+ groupsMemberships_file: flickr-groupmemberships.txt file,
+ each line is a pair (user, group), which indicates a user belongs to a group.
+
+ flickr_links_file: flickr-links.txt file,
+ each line is a pair (user, user), which indicates
+ the two users have a relationship.
+
+ users_label_file: each line is a pair (user, list of group),
+ each user may belong to multiple groups.
+
+ edges_file: each line is a pair (user, user), which indicates
+ the two users have a relationship. It filters some unused edges.
+
+ users_file: each line is a int number, which indicates the ID of a user.
+ """
+ group2users = {}
+ with open(groupsMemberships_file, 'r') as f:
+ for line in f:
+ user, group = line.strip().split()
+ try:
+ group2users[int(group)].append(user)
+ except:
+ group2users[int(group)] = [user]
+
+ # counting how many users belong to every group
+ group2usersNum = {}
+ for key, item in group2users.items():
+ group2usersNum[key] = len(item)
+
+ groups_sorted_by_usersNum = sorted(
+ group2usersNum.items(), key=operator.itemgetter(1), reverse=True)
+
+    # the paper only uses the 5 groups with the largest number of users
+ label = 1 # remapping the 5 groups from 1 to 5
+ users_label = {}
+ for i in range(5):
+ users_list = group2users[groups_sorted_by_usersNum[i][0]]
+ for user in users_list:
+ # one user may have multi-labels
+ try:
+ users_label[user].append(label)
+ except:
+ users_label[user] = [label]
+ label += 1
+
+ # remapping the users IDs to make the IDs from 0 to N
+ userID2nodeID = {}
+ count = 1
+ for key in sorted(users_label.keys()):
+ userID2nodeID[key] = count
+ count += 1
+
+ with open(users_label_file, 'w') as writer:
+ for key in sorted(users_label.keys()):
+ line = ' '.join([str(i) for i in users_label[key]])
+ writer.write(str(userID2nodeID[key]) + ',' + line + '\n')
+
+ # produce edges file
+ with open(flickr_links_file, 'r') as reader, open(edges_file,
+ 'w') as writer:
+ for line in reader:
+ src, dst = line.strip().split('\t')
+ # filter unused user IDs
+ if src in users_label and dst in users_label:
+ # remapping the users IDs
+ src = userID2nodeID[src]
+ dst = userID2nodeID[dst]
+
+ writer.write(str(src) + '\t' + str(dst) + '\n')
+
+ # produce nodes file
+ with open(users_file, 'w') as writer:
+ for i in range(1, 1 + len(userID2nodeID)):
+ writer.write(str(i) + '\n')
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='LINE')
+ parser.add_argument(
+ '--groupmemberships',
+ type=str,
+ default='./data/flickr/flickr-groupmemberships.txt',
+ help='groupmemberships of flickr dataset')
+
+ parser.add_argument(
+ '--flickr_links',
+ type=str,
+ default='./data/flickr/flickr-links.txt',
+ help='the flickr-links.txt file for training')
+
+ parser.add_argument(
+ '--nodes_label',
+ type=str,
+ default='./data/flickr/nodes_label.txt',
+ help='nodes (users) label file for training')
+
+ parser.add_argument(
+ '--edges',
+ type=str,
+ default='./data/flickr/edges.txt',
+ help='the result edges (links) file for training')
+
+ parser.add_argument(
+ '--nodes',
+ type=str,
+ default='./data/flickr/nodes.txt',
+ help='the nodes (users) file for training')
+
+ args = parser.parse_args()
+ process_data(args.groupmemberships, args.flickr_links, args.nodes_label,
+ args.edges, args.nodes)
diff --git a/examples/line/line.py b/examples/line/line.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e17f6d20122f806cdc27e88abdfb1585fc99a2
--- /dev/null
+++ b/examples/line/line.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file implement the training process of LINE model.
+"""
+
+import time
+import argparse
+import random
+import os
+import numpy as np
+
+import pgl
+import paddle.fluid as fluid
+import paddle.fluid.layers as fl
+from pgl.utils.logger import log
+
+from data_loader import FlickrDataset
+
+
+def make_dir(path):
+ """Create directory if path is not existed.
+
+ Args:
+ path: The directory that wants to create.
+ """
+ try:
+ os.makedirs(path)
+ except:
+ if not os.path.isdir(path):
+ raise
+
+
+def set_seed(seed):
+ """Set global random seed.
+ """
+ random.seed(seed)
+ np.random.seed(seed)
+
+
+def build_model(args, graph):
+ """Build LINE model.
+
+ Args:
+ args: The hyperparameters for configure.
+
+ graph: The :code:`Graph` data object.
+
+ """
+ u_i = fl.data(
+ name='u_i', shape=[None, 1], dtype='int64', append_batch_size=False)
+ u_j = fl.data(
+ name='u_j', shape=[None, 1], dtype='int64', append_batch_size=False)
+
+ label = fl.data(
+ name='label', shape=[None], dtype='float32', append_batch_size=False)
+
+ lr = fl.data(
+ name='learning_rate',
+ shape=[1],
+ dtype='float32',
+ append_batch_size=False)
+
+ u_i_embed = fl.embedding(
+ input=u_i,
+ size=[graph.num_nodes, args.embed_dim],
+ param_attr='shared_w')
+
+ if args.order == 'first_order':
+ u_j_embed = fl.embedding(
+ input=u_j,
+ size=[graph.num_nodes, args.embed_dim],
+ param_attr='shared_w')
+ elif args.order == 'second_order':
+ u_j_embed = fl.embedding(
+ input=u_j,
+ size=[graph.num_nodes, args.embed_dim],
+ param_attr='context_w')
+ else:
+ raise ValueError("order should be first_order or second_order, not %s"
+ % (args.order))
+
+ inner_product = fl.reduce_sum(u_i_embed * u_j_embed, dim=1)
+
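+    # LINE objective: maximize log sigmoid(u_i . u_j) for observed edges
+    # (label = 1) and log sigmoid(-u_i . u_j) for negative samples
+    # (label = -1).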
+ loss = -1 * fl.reduce_mean(fl.logsigmoid(label * inner_product))
+ optimizer = fluid.optimizer.RMSPropOptimizer(learning_rate=lr)
+ train_op = optimizer.minimize(loss)
+
+ return loss, optimizer
+
+
+def main(args):
+ """The main funciton for training LINE model.
+ """
+ make_dir(args.save_dir)
+ set_seed(args.seed)
+
+ dataset = FlickrDataset(args.data_path)
+
+ log.info('num nodes in graph: %d' % dataset.graph.num_nodes)
+ log.info('num edges in graph: %d' % dataset.graph.num_edges)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+
+ main_program = fluid.default_main_program()
+ startup_program = fluid.default_startup_program()
+
+ # build model here
+ with fluid.program_guard(main_program, startup_program):
+ loss, opt = build_model(args, dataset.graph)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program) #initialize the parameters of the network
+
+ batchrange = int(dataset.graph.num_edges / args.batch_size)
+ T = batchrange * args.epochs
+ for epoch in range(args.epochs):
+ for b in range(batchrange):
+ lr = max(args.lr * (1 - (batchrange * epoch + b) / T), 0.0001)
+
+ u_i, u_j, label = dataset.fetch_batch(
+ batch_size=args.batch_size,
+ K=args.neg_sample_size,
+ edge_sampling=args.sample_method,
+ node_sampling=args.sample_method)
+
+ feed_dict = {
+ 'u_i': u_i,
+ 'u_j': u_j,
+ 'label': label,
+ 'learning_rate': lr
+ }
+
+ ret_loss = exe.run(main_program,
+ feed=feed_dict,
+ fetch_list=[loss],
+ return_numpy=True)
+
+ if b % 500 == 0:
+ log.info("Epoch %d | Step %d | Loss %f | lr: %f" %
+ (epoch, b, ret_loss[0], lr))
+
+ # save parameters in every epoch
+ log.info("saving persistables parameters...")
+ fluid.io.save_persistables(exe,
+ os.path.join(args.save_dir, "model_epoch_%d"
+ % (epoch + 1)), main_program)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='LINE')
+ parser.add_argument(
+ '--data_path',
+ type=str,
+ default='./data/flickr/',
+ help='dataset for training')
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument("--epochs", type=int, default=20, help='total epochs')
+ parser.add_argument("--seed", type=int, default=1667, help='random seed')
+ parser.add_argument("--lr", type=float, default=0.01, help='learning rate')
+ parser.add_argument(
+ "--neg_sample_size",
+ type=int,
+ default=5,
+        help='number of negative samples')
+ parser.add_argument("--save_dir", type=str, default="./checkpoints/model")
+ parser.add_argument("--batch_size", type=int, default=32)
+ parser.add_argument(
+ "--embed_dim",
+ type=int,
+ default=128,
+ help='the dimension of node embedding')
+ parser.add_argument(
+ "--sample_method",
+ type=str,
+ default="alias",
+ help='negative sample method (uniform, numpy, alias)')
+ parser.add_argument(
+ "--order",
+ type=str,
+ default="first_order",
+        help='proximity order (first_order or second_order)')
+
+ args = parser.parse_args()
+
+ main(args)
diff --git a/examples/line/multi_class.py b/examples/line/multi_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f191f23504359995c3a9f279c9948ff5402b15c
--- /dev/null
+++ b/examples/line/multi_class.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file provides the multi class task for testing the embedding
+learned by LINE model.
+"""
+import argparse
+import time
+import math
+import os
+import random
+
+import numpy as np
+import sklearn.metrics
+from sklearn.metrics import f1_score
+
+import pgl
+from pgl.utils import op
+import paddle.fluid as fluid
+import paddle.fluid.layers as l
+from pgl.utils.logger import log
+from data_loader import FlickrDataset
+
+
+def set_seed(seed):
+ """Set global random seed.
+ """
+ random.seed(seed)
+ np.random.seed(seed)
+
+
+def node_classify_model(graph,
+ num_labels,
+ embed_dim=16,
+ name='node_classify_task'):
+ """Build node classify model.
+
+ Args:
+ graph: The :code:`Graph` data object.
+
+ num_labels: The number of labels.
+
+ embed_dim: The dimension of embedding.
+
+ name: The name of the model.
+ """
+ pyreader = l.py_reader(
+ capacity=70,
+ shapes=[[-1, 1], [-1, num_labels]],
+ dtypes=['int64', 'float32'],
+ lod_levels=[0, 0],
+ name=name + '_pyreader',
+ use_double_buffer=True)
+ nodes, labels = l.read_file(pyreader)
+ embed_nodes = l.embedding(
+ input=nodes, size=[graph.num_nodes, embed_dim], param_attr='shared_w')
+ embed_nodes.stop_gradient = True
+ logits = l.fc(input=embed_nodes, size=num_labels)
+ loss = l.sigmoid_cross_entropy_with_logits(logits, labels)
+ loss = l.reduce_mean(loss)
+ prob = l.sigmoid(logits)
+ topk = l.reduce_sum(labels, -1)
+ return {
+ 'pyreader': pyreader,
+ 'loss': loss,
+ 'prob': prob,
+ 'labels': labels,
+ 'topk': topk
+ }
+ # return pyreader, loss, prob, labels, topk
+
+
+def node_classify_generator(graph,
+ all_nodes=None,
+ batch_size=512,
+ epoch=1,
+ shuffle=True):
+ """Data generator for node classify.
+
+ Args:
+ graph: The :code:`Graph` data object.
+
+        all_nodes: the nodes used for this task; defaults to all nodes in the graph.
+
+ batch_size: batch size for training.
+
+ epoch: The number of epochs.
+
+ shuffle: Random shuffle of data.
+ """
+
+ if all_nodes is None:
+ all_nodes = np.arange(graph.num_nodes)
+
+ def batch_nodes_generator(shuffle=shuffle):
+ """Batch nodes generator.
+ """
+ perm = np.arange(len(all_nodes), dtype=np.int64)
+ if shuffle:
+ np.random.shuffle(perm)
+ start = 0
+ while start < len(all_nodes):
+ yield all_nodes[perm[start:start + batch_size]]
+ start += batch_size
+
+ def wrapper():
+ """Wrapper function.
+ """
+ for _ in range(epoch):
+ for batch_nodes in batch_nodes_generator():
+ batch_nodes_expanded = np.expand_dims(batch_nodes,
+ -1).astype(np.int64)
+ batch_labels = graph.node_feat['group_id'][batch_nodes].astype(
+ np.float32)
+ yield [batch_nodes_expanded, batch_labels]
+
+ return wrapper
+
+
+def topk_f1_score(labels,
+ probs,
+ topk_list=None,
+ average="macro",
+ threshold=None):
+ """Calculate top K F1 score.
+ """
+    assert topk_list is not None or threshold is not None, "at least one of topk_list and threshold must be provided"
+ if threshold is not None:
+ preds = probs > threshold
+ else:
+ preds = np.zeros_like(labels, dtype=np.int64)
+ for idx, (prob, topk) in enumerate(zip(np.argsort(probs), topk_list)):
+ preds[idx][prob[-int(topk):]] = 1
+ return f1_score(labels, preds, average=average)
+
+
+def main(args):
+ """The main funciton for nodes classify task.
+ """
+ set_seed(args.seed)
+ log.info(args)
+ dataset = FlickrDataset(args.data_path, train_percentage=args.percent)
+
+ train_steps = (len(dataset.train_index) // args.batch_size) * args.epochs
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ train_prog = fluid.Program()
+ test_prog = fluid.Program()
+ startup_prog = fluid.Program()
+
+ with fluid.program_guard(train_prog, startup_prog):
+ with fluid.unique_name.guard():
+ train_model = node_classify_model(
+ dataset.graph,
+ dataset.num_groups,
+ embed_dim=args.embed_dim,
+ name='train')
+
+ lr = l.polynomial_decay(args.lr, train_steps, 0.0001)
+ adam = fluid.optimizer.Adam(lr)
+ adam.minimize(train_model['loss'])
+ with fluid.program_guard(test_prog, startup_prog):
+ with fluid.unique_name.guard():
+ test_model = node_classify_model(
+ dataset.graph,
+ dataset.num_groups,
+ embed_dim=args.embed_dim,
+ name='test')
+ test_prog = test_prog.clone(for_test=True)
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+
+ train_model['pyreader'].decorate_tensor_provider(
+ node_classify_generator(
+ dataset.graph,
+ dataset.train_index,
+ batch_size=args.batch_size,
+ epoch=args.epochs))
+ test_model['pyreader'].decorate_tensor_provider(
+ node_classify_generator(
+ dataset.graph,
+ dataset.test_index,
+ batch_size=args.batch_size,
+ epoch=1))
+
+ def existed_params(var):
+ """existed_params
+ """
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ return os.path.exists(os.path.join(args.ckpt_path, var.name))
+
+ fluid.io.load_vars(
+ exe, args.ckpt_path, main_program=train_prog, predicate=existed_params)
+ step = 0
+ prev_time = time.time()
+ train_model['pyreader'].start()
+
+ while 1:
+ try:
+ train_loss_val, train_probs_val, train_labels_val, train_topk_val = exe.run(
+ train_prog,
+ fetch_list=[
+ train_model['loss'], train_model['prob'],
+ train_model['labels'], train_model['topk']
+ ],
+ return_numpy=True)
+ train_macro_f1 = topk_f1_score(train_labels_val, train_probs_val,
+ train_topk_val, "macro",
+ args.threshold)
+ train_micro_f1 = topk_f1_score(train_labels_val, train_probs_val,
+ train_topk_val, "micro",
+ args.threshold)
+ step += 1
+ log.info("Step %d " % step + "Train Loss: %f " % train_loss_val +
+ "Train Macro F1: %f " % train_macro_f1 +
+ "Train Micro F1: %f " % train_micro_f1)
+ except fluid.core.EOFException:
+ train_model['pyreader'].reset()
+ break
+
+ test_model['pyreader'].start()
+ test_probs_vals, test_labels_vals, test_topk_vals = [], [], []
+ while 1:
+ try:
+ test_loss_val, test_probs_val, test_labels_val, test_topk_val = exe.run(
+ test_prog,
+ fetch_list=[
+ test_model['loss'], test_model['prob'],
+ test_model['labels'], test_model['topk']
+ ],
+ return_numpy=True)
+            test_probs_vals.append(test_probs_val)
+            test_labels_vals.append(test_labels_val)
+ test_topk_vals.append(test_topk_val)
+ except fluid.core.EOFException:
+ test_model['pyreader'].reset()
+ test_probs_array = np.concatenate(test_probs_vals)
+ test_labels_array = np.concatenate(test_labels_vals)
+ test_topk_array = np.concatenate(test_topk_vals)
+ test_macro_f1 = topk_f1_score(
+ test_labels_array, test_probs_array, test_topk_array,
+ "macro", args.threshold)
+ test_micro_f1 = topk_f1_score(
+ test_labels_array, test_probs_array, test_topk_array,
+ "micro", args.threshold)
+ log.info("\t\tStep %d " % step + "Test Loss: %f " %
+ test_loss_val + "Test Macro F1: %f " % test_macro_f1 +
+ "Test Micro F1: %f " % test_micro_f1)
+ break
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='LINE')
+ parser.add_argument(
+ '--data_path',
+ type=str,
+ default='./data/flickr/',
+ help='dataset for training')
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument("--epochs", type=int, default=10)
+ parser.add_argument("--seed", type=int, default=1667)
+ parser.add_argument(
+ "--lr", type=float, default=0.025, help='learning rate')
+ parser.add_argument("--embed_dim", type=int, default=128)
+ parser.add_argument("--batch_size", type=int, default=256)
+ parser.add_argument("--threshold", type=float, default=None)
+ parser.add_argument(
+ "--percent",
+ type=float,
+ default=0.5,
+ help="the percentage of data as training data")
+ parser.add_argument(
+ "--ckpt_path", type=str, default="./checkpoints/model/model_epoch_0/")
+ args = parser.parse_args()
+ main(args)
diff --git a/examples/sgc/README.md b/examples/sgc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..92fc546d51a9b07a45ff3d44fc000fa44a36b6b0
--- /dev/null
+++ b/examples/sgc/README.md
@@ -0,0 +1,35 @@
+# PGL Examples for SGC
+
+[Simplifying Graph Convolutional Networks \(SGC\)](https://arxiv.org/pdf/1902.07153.pdf) simplifies GCN by removing the nonlinearities between layers and collapsing the weight matrices, so that graph convolution reduces to a precomputed feature propagation followed by a linear classifier. Based on PGL, we reproduce the SGC algorithm and reach the same level of accuracy as the paper on the citation network benchmarks.
+
+### Datasets
+
+The datasets contain three citation networks: CORA, PUBMED, CITESEER. The details for these three datasets can be found in the [paper](https://arxiv.org/abs/1609.02907).
+
+### Dependencies
+
+- paddlepaddle 1.5
+- pgl
+
+### Performance
+
+We train our models for 200 epochs and report the accuracy on the test dataset.
+
+| Dataset | Accuracy | Speed with paddle 1.5 (epoch time) |
+| --- | --- | --- |
+| Cora | 0.818 (paper: 0.810) | 0.0015s |
+| Pubmed | 0.788 (paper: 0.789) | 0.0015s |
+| Citeseer | 0.719 (paper: 0.719) | 0.0015s |
+
+
+### How to run
+
+For example, to train SGC on the cora dataset with GPU:
+```
+python sgc.py --dataset cora --use_cuda
+```
+
+#### Hyperparameters
+
+- dataset: The citation dataset "cora", "citeseer", "pubmed".
+- use_cuda: Use GPU if this flag is set.
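+
+#### How it works
+
+SGC drops the nonlinearities between GCN layers, so the K-step feature propagation can be precomputed once and training reduces to a linear softmax classifier on the propagated features, which is what `sgc.py` does with message passing. Below is a minimal numpy sketch of that precomputation (the names `sgc_precompute`, `adj` and `features` are illustrative and not part of the example code):
+
+```python
+import numpy as np
+
+
+def sgc_precompute(adj, features, num_layers=2):
+    """Propagate features num_layers times with the symmetrically normalized adjacency."""
+    deg = adj.sum(axis=1)
+    d_inv_sqrt = np.zeros_like(deg, dtype=np.float64)
+    d_inv_sqrt[deg > 0] = np.power(deg[deg > 0], -0.5)
+    s = d_inv_sqrt[:, None] * adj * d_inv_sqrt[None, :]  # D^-1/2 A D^-1/2
+    h = features.astype(np.float64)
+    for _ in range(num_layers):
+        h = s @ h
+    return h  # train a plain softmax classifier on these features
+```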
diff --git a/examples/sgc/sgc.py b/examples/sgc/sgc.py
new file mode 100644
index 0000000000000000000000000000000000000000..85ec6855f7696fb65898361a24bbd6c61871d0ba
--- /dev/null
+++ b/examples/sgc/sgc.py
@@ -0,0 +1,271 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file implements the training process of the SGC model with StaticGraphWrapper.
+"""
+
+import os
+import argparse
+import numpy as np
+import random
+import time
+
+import pgl
+from pgl import data_loader
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle.fluid as fluid
+
+
+def load(name):
+ """Load dataset."""
+ if name == 'cora':
+ dataset = data_loader.CoraDataset()
+ elif name == "pubmed":
+ dataset = data_loader.CitationDataset("pubmed", symmetry_edges=False)
+ elif name == "citeseer":
+ dataset = data_loader.CitationDataset("citeseer", symmetry_edges=False)
+ else:
+        raise ValueError(name + " dataset doesn't exist")
+ return dataset
+
+
+def expand_data_dim(dataset):
+ """Expand the dimension of data."""
+ train_index = dataset.train_index
+ train_label = np.expand_dims(dataset.y[train_index], -1)
+ train_index = np.expand_dims(train_index, -1)
+
+ val_index = dataset.val_index
+ val_label = np.expand_dims(dataset.y[val_index], -1)
+ val_index = np.expand_dims(val_index, -1)
+
+ test_index = dataset.test_index
+ test_label = np.expand_dims(dataset.y[test_index], -1)
+ test_index = np.expand_dims(test_index, -1)
+
+ return {
+ 'train_index': train_index,
+ 'train_label': train_label,
+ 'val_index': val_index,
+ 'val_label': val_label,
+ 'test_index': test_index,
+ 'test_label': test_label,
+ }
+
+
+def MessagePassing(gw, feature, num_layers, norm=None):
+ """Precomputing message passing.
+ """
+
+ def send_src_copy(src_feat, dst_feat, edge_feat):
+ """send_src_copy
+ """
+ return src_feat["h"]
+
+ for _ in range(num_layers):
+ if norm is not None:
+ feature = feature * norm
+
+ msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
+
+ feature = gw.recv(msg, "sum")
+
+ if norm is not None:
+ feature = feature * norm
+
+ return feature
+
+
+def pre_gather(features, name_prefix, node_index_val):
+ """Get features with respect to node index.
+ """
+ node_index, init = paddle_helper.constant(
+ "%s_node_index" % (name_prefix), dtype='int32', value=node_index_val)
+ logits = fluid.layers.gather(features, node_index)
+
+ return logits, init
+
+
+def calculate_loss(name, np_cached_h, node_label_val, num_classes, args):
+ """Calculate loss function.
+ """
+ initializer = []
+ const_cached_h, init = paddle_helper.constant(
+ "const_%s_cached_h" % name, dtype='float32', value=np_cached_h)
+ initializer.append(init)
+
+ node_label, init = paddle_helper.constant(
+ "%s_node_label" % (name), dtype='int64', value=node_label_val)
+ initializer.append(init)
+
+ output = fluid.layers.fc(const_cached_h,
+ size=num_classes,
+ bias_attr=args.bias,
+ name='fc')
+
+ loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=output, label=node_label, return_softmax=True)
+ loss = fluid.layers.mean(loss)
+
+ acc = None
+ if name != 'train':
+ acc = fluid.layers.accuracy(input=probs, label=node_label, k=1)
+
+ return {
+ 'loss': loss,
+ 'acc': acc,
+ 'probs': probs,
+ 'initializer': initializer
+ }
+
+
+def main(args):
+ """"Main function."""
+ dataset = load(args.dataset)
+
+ # normalize
+ indegree = dataset.graph.indegree()
+ norm = np.zeros_like(indegree, dtype="float32")
+ norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
+ dataset.graph.node_feat["norm"] = np.expand_dims(norm, -1)
+
+ data = expand_data_dim(dataset)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ precompute_program = fluid.Program()
+ startup_program = fluid.Program()
+ train_program = fluid.Program()
+ val_program = train_program.clone(for_test=True)
+ test_program = train_program.clone(for_test=True)
+
+ # precompute message passing and gather
+ initializer = []
+ with fluid.program_guard(precompute_program, startup_program):
+ gw = pgl.graph_wrapper.StaticGraphWrapper(
+ name="graph", place=place, graph=dataset.graph)
+
+ cached_h = MessagePassing(
+ gw,
+ gw.node_feat["words"],
+ num_layers=args.num_layers,
+ norm=gw.node_feat['norm'])
+
+ train_cached_h, init = pre_gather(cached_h, 'train',
+ data['train_index'])
+ initializer.append(init)
+ val_cached_h, init = pre_gather(cached_h, 'val', data['val_index'])
+ initializer.append(init)
+ test_cached_h, init = pre_gather(cached_h, 'test', data['test_index'])
+ initializer.append(init)
+
+ exe = fluid.Executor(place)
+ gw.initialize(place)
+ for init in initializer:
+ init(place)
+
+ # get train features, val features and test features
+ np_train_cached_h, np_val_cached_h, np_test_cached_h = exe.run(
+ precompute_program,
+ feed={},
+ fetch_list=[train_cached_h, val_cached_h, test_cached_h],
+ return_numpy=True)
+
+ initializer = []
+ with fluid.program_guard(train_program, startup_program):
+ with fluid.unique_name.guard():
+ train_handle = calculate_loss('train', np_train_cached_h,
+ data['train_label'],
+ dataset.num_classes, args)
+ initializer += train_handle['initializer']
+ adam = fluid.optimizer.Adam(
+ learning_rate=args.lr,
+ regularization=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=args.weight_decay))
+ adam.minimize(train_handle['loss'])
+
+ with fluid.program_guard(val_program, startup_program):
+ with fluid.unique_name.guard():
+ val_handle = calculate_loss('val', np_val_cached_h,
+ data['val_label'], dataset.num_classes,
+ args)
+ initializer += val_handle['initializer']
+
+ with fluid.program_guard(test_program, startup_program):
+ with fluid.unique_name.guard():
+ test_handle = calculate_loss('test', np_test_cached_h,
+ data['test_label'],
+ dataset.num_classes, args)
+ initializer += test_handle['initializer']
+
+ exe.run(startup_program)
+ for init in initializer:
+ init(place)
+
+ dur = []
+ for epoch in range(args.epochs):
+ if epoch >= 3:
+ t0 = time.time()
+ train_loss_t = exe.run(train_program,
+ feed={},
+ fetch_list=[train_handle['loss']],
+ return_numpy=True)[0]
+
+ if epoch >= 3:
+ time_per_epoch = 1.0 * (time.time() - t0)
+ dur.append(time_per_epoch)
+
+ val_loss_t, val_acc_t = exe.run(
+ val_program,
+ feed={},
+ fetch_list=[val_handle['loss'], val_handle['acc']],
+ return_numpy=True)
+
+ log.info("Epoch %d " % epoch + "(%.5lf sec) " % np.mean(
+ dur) + "Train Loss: %f " % train_loss_t + "Val Loss: %f " %
+ val_loss_t + "Val Acc: %f " % val_acc_t)
+
+ test_loss_t, test_acc_t = exe.run(
+ test_program,
+ feed={},
+ fetch_list=[test_handle['loss'], test_handle['acc']],
+ return_numpy=True)
+ log.info("Test Accuracy: %f" % test_acc_t)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='SGC')
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default="cora",
+ help="dataset (cora, pubmed, citeseer)")
+ parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
+ parser.add_argument(
+ "--seed", type=int, default=1667, help="global random seed")
+ parser.add_argument("--lr", type=float, default=0.2, help="learning rate")
+ parser.add_argument(
+ "--weight_decay",
+ type=float,
+ default=0.000005,
+ help="Weight for L2 loss")
+ parser.add_argument(
+ "--bias", action='store_true', default=False, help="flag to use bias")
+ parser.add_argument(
+ "--epochs", type=int, default=200, help="number of training epochs")
+ parser.add_argument(
+ "--num_layers", type=int, default=2, help="number of SGC layers")
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/examples/static_gat/README.md b/examples/static_gat/README.md
index bab075f815dd5a5a235363f4942cdbf239f6fd4b..1071e27e48c2c76d13a11c5db22e8947e966da11 100644
--- a/examples/static_gat/README.md
+++ b/examples/static_gat/README.md
@@ -11,7 +11,7 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
@@ -19,11 +19,11 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)| examples/gat | Improvement |
-| --- | --- | --- |---| --- | --- |
-| Cora | ~83% | 0.0145s | 0.0119s | 0.0175s | 1.47x |
-| Pubmed | ~78% | 0.0352s | 0.0193s |0.0295s | 1.53x |
-| Citeseer | ~70% | 0.0148s | 0.0124s |0.0253s | 2.04x |
+| Dataset | Accuracy | epoch time | examples/gat | Improvement |
+| --- | --- | --- | --- | --- |
+| Cora | ~83% | 0.0119s | 0.0175s | 1.47x |
+| Pubmed | ~78% | 0.0193s |0.0295s | 1.53x |
+| Citeseer | ~70% | 0.0124s |0.0253s | 2.04x |
### How to run
diff --git a/examples/static_gat/train.py b/examples/static_gat/train.py
index 42c2da80195927f5642ec751dc3538591218fd2b..3fa38a19b7db601da501302883a74ff138361cb0 100644
--- a/examples/static_gat/train.py
+++ b/examples/static_gat/train.py
@@ -84,7 +84,7 @@ def main(args):
initializer = []
with fluid.program_guard(train_program, startup_program):
train_node_index, init = paddle_helper.constant(
- "train_node_index", dtype="int32", value=train_index)
+ "train_node_index", dtype="int64", value=train_index)
initializer.append(init)
train_node_label, init = paddle_helper.constant(
@@ -103,7 +103,7 @@ def main(args):
with fluid.program_guard(val_program, startup_program):
val_node_index, init = paddle_helper.constant(
- "val_node_index", dtype="int32", value=val_index)
+ "val_node_index", dtype="int64", value=val_index)
initializer.append(init)
val_node_label, init = paddle_helper.constant(
@@ -119,7 +119,7 @@ def main(args):
with fluid.program_guard(test_program, startup_program):
test_node_index, init = paddle_helper.constant(
- "test_node_index", dtype="int32", value=test_index)
+ "test_node_index", dtype="int64", value=test_index)
initializer.append(init)
test_node_label, init = paddle_helper.constant(
diff --git a/examples/static_gcn/README.md b/examples/static_gcn/README.md
index 9e87d51d94fad5774dd62cb7cfa33e49200cc033..cabd9cf3be06779b77e44ddfa471937c05ae95d6 100644
--- a/examples/static_gcn/README.md
+++ b/examples/static_gcn/README.md
@@ -10,7 +10,7 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
### Dependencies
-- paddlepaddle>=1.4 (The speed can be faster in 1.5.)
+- paddlepaddle>=1.6
- pgl
### Performance
@@ -18,12 +18,11 @@ The datasets contain three citation networks: CORA, PUBMED, CITESEER. The detail
We train our models for 200 epochs and report the accuracy on the test dataset.
-| Dataset | Accuracy | Speed with paddle 1.4 (epoch time) | Speed with paddle 1.5 (epoch time)| examples/gcn | Improvement |
-| --- | --- | --- |---| --- | --- |
-| Cora | ~81% | 0.0053s | 0.0047s | 0.0104s | 2.21x |
-| Pubmed | ~79% | 0.0105s | 0.0049s |0.0154s | 3.14x |
-| Citeseer | ~71% | 0.0051s | 0.0045s |0.0177s | 3.93x |
-
+| Dataset | Accuracy | epoch time | examples/gcn | Improvement |
+| --- | --- | --- | --- | --- |
+| Cora | ~81% | 0.0047s | 0.0104s | 2.21x |
+| Pubmed | ~79% | 0.0049s |0.0154s | 3.14x |
+| Citeseer | ~71% | 0.0045s |0.0177s | 3.93x |
### How to run
diff --git a/examples/static_gcn/train.py b/examples/static_gcn/train.py
index f01435d4b93e35247ee664c83409f17b0e11e482..4c9724e0cec79c3aab291f03727279ac46a1452f 100644
--- a/examples/static_gcn/train.py
+++ b/examples/static_gcn/train.py
@@ -85,7 +85,7 @@ def main(args):
initializer = []
with fluid.program_guard(train_program, startup_program):
train_node_index, init = paddle_helper.constant(
- "train_node_index", dtype="int32", value=train_index)
+ "train_node_index", dtype="int64", value=train_index)
initializer.append(init)
train_node_label, init = paddle_helper.constant(
@@ -104,7 +104,7 @@ def main(args):
with fluid.program_guard(val_program, startup_program):
val_node_index, init = paddle_helper.constant(
- "val_node_index", dtype="int32", value=val_index)
+ "val_node_index", dtype="int64", value=val_index)
initializer.append(init)
val_node_label, init = paddle_helper.constant(
@@ -120,7 +120,7 @@ def main(args):
with fluid.program_guard(test_program, startup_program):
test_node_index, init = paddle_helper.constant(
- "test_node_index", dtype="int32", value=test_index)
+ "test_node_index", dtype="int64", value=test_index)
initializer.append(init)
test_node_label, init = paddle_helper.constant(
diff --git a/examples/strucvec/README.md b/examples/strucvec/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b23c10a297014f6822e0ce1e4c7b3b1ac4559cd9
--- /dev/null
+++ b/examples/strucvec/README.md
@@ -0,0 +1,44 @@
+## PGL Examples For Struc2Vec
+[Struc2vec](https://arxiv.org/abs/1704.03165) is built on the notion of structural identity: nodes are identified by the network structure and their relationship to other nodes, independently of their position in the graph. The paper proposes a novel and flexible framework for learning latent representations of this structural identity. We reproduce the Struc2vec algorithm in PGL.
+## DataSet
+The paper uses air-traffic networks to validate the Struc2vec algorithm.
+Each edge in the dataset indicates that there is at least one flight between the two airports, and the connectivity between airports is used to predict their level of activity. The following dataset is used to validate the accuracy of the algorithm. The data were collected from the Bureau of Transportation Statistics from January to October 2016; the network has 1,190 nodes and 13,599 edges (diameter is 8). [Link](https://www.transtats.bts.gov/)
+
+- usa-airports.edgelist
+- labels-usa-airports.txt
+
+## Dependencies
+To use the struc2vec model in PGL, please additionally install gensim, pathos and fastdtw.
+- paddlepaddle>=1.6
+- pgl
+- gensim
+- pathos
+- fastdtw
+
+## How to use
+For example, to train and evaluate the Struc2vec model on the American air-traffic dataset (a sketch of the core distance computation follows the command):
+> python struc2vec.py --edge_file data/usa-airports.edgelist --label_file data/labels-usa-airports.txt --train True --valid True --opt2 True
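+
+The core of struc2vec is a structural distance: for each layer k, the ordered degree sequences of the two nodes' k-hop neighbourhoods are compared with DTW and the costs are accumulated over layers. Below is a minimal sketch of the per-layer comparison, assuming fastdtw is installed; `degree_cost` and `layer_distance` are illustrative helper names, with the element-wise cost mirroring `distance_func` in struc2vec.py.
+
+```python
+import numpy as np
+from fastdtw import fastdtw
+
+
+def degree_cost(a, b, ep=0.5):
+    # cost between two degrees: max/min ratio minus one
+    m, mi = max(a, b) + ep, min(a, b) + ep
+    return m / mi - 1.0
+
+
+def layer_distance(seq_a, seq_b):
+    # seq_a, seq_b: sorted degree sequences of one layer around two nodes
+    cost, _ = fastdtw(np.array(seq_a), np.array(seq_b), radius=1, dist=degree_cost)
+    return cost
+
+
+# Nodes whose layers have similar degree profiles get a small distance.
+print(layer_distance([1, 2, 2, 5], [1, 2, 3, 5]))
+```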
+
+## Hyperparameters
+| Args| Meaning|
+| ------------- | ------------- |
+| edge_file | input file name for edges|
+| label_file | input file name for node label|
+| emb_file | file name for the learned node embeddings (word2vec output)|
+| walk_depth| The length (number of steps) of each random walk|
+| opt1| The flag to open optimization 1 to reduce time cost|
+| opt2| The flag to open optimization 2 to reduce time cost|
+| w2v_emb_size| The dims of output the word2vec embedding|
+| w2v_window_size| The context length of word2vec|
+| w2v_epoch| The number of epochs to train the word2vec model|
+| train| The flag to run the struc2vec algorithm and produce the word2vec embedding|
+| valid| The flag to evaluate the word2vec embedding on the classification task|
+| num_class| The number of classes in the classification model|
+
+## Experiment results
+| Dataset | Model | Metric | PGL Result | Paper repo Result |
+| ------------- | ------------- |------------- |------------- |------------- |
+| American airport dataset | Struc2vec without time cost optimization| ACC |0.6483|0.6340|
+| American airport dataset | Struc2vec with optimization 1| ACC |0.6466|0.6242|
+| American airport dataset | Struc2vec with optimization 2| ACC |0.6252|0.6241|
+| American airport dataset | Struc2vec with optimization 1 & 2| ACC |0.6226|0.6083|
diff --git a/examples/strucvec/classify.py b/examples/strucvec/classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..daaa87eb8dee6e117ae1c523c089ddcc51af206f
--- /dev/null
+++ b/examples/strucvec/classify.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+
+def build_lr_model(args):
+ """
+ Build the LR model to train.
+ """
+ emb_x = fluid.layers.data(
+ name="emb_x", dtype='float32', shape=[args.w2v_emb_size])
+ label = fluid.layers.data(name="label_y", dtype='int64', shape=[1])
+ logits = fluid.layers.fc(input=emb_x,
+ size=args.num_class,
+ act=None,
+ name='classification_layer')
+ proba = fluid.layers.softmax(logits)
+ loss = fluid.layers.softmax_with_cross_entropy(logits, label)
+ loss = fluid.layers.mean(loss)
+ acc = fluid.layers.accuracy(input=proba, label=label, k=1)
+ return loss, acc
+
+
+def construct_feed_data(data):
+ """
+    Construct mini-batches of (feature, label) arrays to feed the model.
+ """
+ datas = []
+ labels = []
+    for sample in data:
+        labels.append([sample[-1]])
+        datas.append(sample[1:-1])
+        # yield a fixed-size mini-batch of 16 samples without dropping any sample
+        if len(datas) == 16:
+            yield np.array(datas).astype(np.float32), np.array(labels).astype(
+                np.int64)
+            datas = []
+            labels = []
+    if len(datas) != 0:
+        yield np.array(datas).astype(np.float32), np.array(labels).astype(
+            np.int64)
+
+
+def run_epoch(exe, data, program, stage, epoch, loss, acc):
+ """
+    Run one epoch and report the average loss and accuracy.
+ """
+ print('start {} epoch of {}'.format(stage, epoch))
+ all_loss = 0.0
+ all_acc = 0.0
+ all_samples = 0.0
+ count = 0
+ for datas, labels in construct_feed_data(data):
+ batch_loss, batch_acc = exe.run(
+ program,
+ fetch_list=[loss, acc],
+ feed={"emb_x": datas,
+ "label_y": labels})
+ len_samples = len(datas)
+        all_loss += batch_loss * len_samples
+        all_acc += batch_acc * len_samples
+ all_samples += len_samples
+ count += 1
+ print("pass:{}, epoch:{}, loss:{}, acc:{}".format(stage, epoch, batch_loss,
+ all_acc / (len_samples)))
+
+
+def train_lr_model(args, data):
+ """
+ The main function to run the lr model.
+ """
+ data_nums = len(data)
+ train_data_nums = int(0.8 * data_nums)
+ train_data = data[:train_data_nums]
+ test_data = data[train_data_nums:]
+
+ place = fluid.CPUPlace()
+
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+
+ with fluid.program_guard(train_program, startup_program):
+ loss, acc = build_lr_model(args)
+ test_program = train_program.clone(for_test=True)
+
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=args.lr)
+ adam.minimize(loss)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ for epoch in range(0, args.epoch):
+ run_epoch(exe, train_data, train_program, "train", epoch, loss, acc)
+ print('-------------------')
+ run_epoch(exe, test_data, test_program, "valid", epoch, loss, acc)
diff --git a/examples/strucvec/data_loader.py b/examples/strucvec/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..98cb932a8b873e1df506640527b4a797e771a5ef
--- /dev/null
+++ b/examples/strucvec/data_loader.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+data_loader.py
+"""
+from pgl import graph
+import numpy as np
+
+
+class EdgeDataset():
+ """
+    The data loader reads the edge file and reindexes the source and destination nodes.
+ """
+
+ def __init__(self, undirected=True, data_dir=""):
+ self._undirected = undirected
+ self._data_dir = data_dir
+ self._load_edge_data()
+
+ def _load_edge_data(self):
+ node_sets = set()
+ edges = []
+ with open(self._data_dir, "r") as f:
+ node_dict = dict()
+ for line in f:
+ src, dist = [
+ int(data) for data in line.strip("\n\r").split(" ")
+ ]
+ if src not in node_dict:
+ node_dict[src] = len(node_dict) + 1
+ src = node_dict[src]
+ if dist not in node_dict:
+ node_dict[dist] = len(node_dict) + 1
+ dist = node_dict[dist]
+ node_sets.add(src)
+ node_sets.add(dist)
+ edges.append((src, dist))
+ if self._undirected:
+ edges.append((dist, src))
+
+ num_nodes = len(node_sets)
+ self.graph = graph.Graph(num_nodes=num_nodes + 1, edges=edges)
+ self.nodes = np.array(list(node_sets))
+ self.node_dict = node_dict
diff --git a/examples/strucvec/requirements.txt b/examples/strucvec/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a3e7c50d255fe9a8dba5ba7054f9637f9a8665f8
--- /dev/null
+++ b/examples/strucvec/requirements.txt
@@ -0,0 +1,3 @@
+gensim
+pathos
+fastdtw
diff --git a/examples/strucvec/sklearn_classify.py b/examples/strucvec/sklearn_classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2dde24b9af2f5d9e8455a4efb114ea7e51b3d14
--- /dev/null
+++ b/examples/strucvec/sklearn_classify.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import sklearn
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+
+random_seed = 67
+
+
+def train_lr_l2_model(args, data):
+ """
+ The main function to train lr model with l2 regularization.
+ """
+ acc_list = []
+ data = np.array(data)
+ data = data[data[:, 0].argsort()]
+ x_data = data[:, 1:-1]
+ y_data = data[:, -1]
+ for random_num in range(0, 10):
+ X_train, X_test, y_train, y_test = train_test_split(
+ x_data,
+ y_data,
+ test_size=0.2,
+ random_state=random_num + random_seed)
+
+ # use the one vs rest to train the lr model with l2
+ pred_test = []
+ for i in range(0, args.num_class):
+ y_train_relabel = np.where(y_train == i, 1, 0)
+ y_test_relabel = np.where(y_test == i, 1, 0)
+ lr = LogisticRegression(C=10.0, random_state=0, max_iter=100)
+ lr.fit(X_train, y_train_relabel)
+ pred = lr.predict_proba(X_test)
+ pred_test.append(pred[:, -1].tolist())
+ pred_test = np.array(pred_test)
+ pred_test = np.transpose(pred_test)
+ c_index = np.argmax(pred_test, axis=1)
+ acc = accuracy_score(y_test.flatten(), c_index)
+ acc_list.append(acc)
+ print("pass:{}-acc:{}".format(random_num, acc))
+ print("the avg acc is {}".format(np.mean(acc_list)))
diff --git a/examples/strucvec/struc2vec.py b/examples/strucvec/struc2vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c87073d972436ec1223a3ddaa13793a62ff717
--- /dev/null
+++ b/examples/strucvec/struc2vec.py
@@ -0,0 +1,545 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+struc2vec.py
+"""
+import argparse
+import math
+import random
+import numpy as np
+import pgl
+from pgl import graph
+from pgl.graph_kernel import alias_sample_build_table
+from pgl.sample import alias_sample
+from data_loader import EdgeDataset
+from classify import train_lr_model
+from sklearn_classify import train_lr_l2_model
+
+
+def selectDegrees(degree_root, index_left, index_right, degree_left,
+ degree_right):
+ """
+    Select which neighbouring degree to expand in the next step.
+ """
+
+ if index_left == -1:
+ degree_now = degree_right
+ elif index_right == -1:
+ degree_now = degree_left
+ elif (abs(degree_left - degree_root) < abs(degree_right - degree_root)):
+ degree_now = degree_left
+ else:
+ degree_now = degree_right
+
+ return degree_now
+
+
+class StrucVecGraph():
+ """
+    This class wraps the PGL graph and implements the functions of the struc2vec algorithm.
+ """
+
+ def __init__(self, graph, nodes, opt1, opt2, opt3, depth, num_walks,
+ walk_depth):
+ self.graph = graph
+ self.nodes = nodes
+ self.opt1 = opt1
+ self.opt2 = opt2
+ self.opt3 = opt3
+ self.num_walks = num_walks
+ self.walk_depth = walk_depth
+ self.tag = args.tag
+ self.degree_list = dict()
+ self.degree2nodes = dict()
+ self.node2degree = dict()
+ self.distance = dict()
+ self.degrees_sorted = None
+ self.layer_distance = dict()
+ self.layer_message = dict()
+ self.layer_norm_distance = dict()
+ self.sample_alias = dict()
+ self.sample_events = dict()
+ self.layer_node_weight_count = dict()
+ if opt3 == True:
+ self.depth = depth
+ else:
+ self.depth = 1000
+
+ def distance_func(self, a, b):
+ """
+        The basic cost between two degree values, used as the element-wise DTW distance.
+ """
+ ep = 0.5
+ m = max(a, b) + ep
+ mi = min(a, b) + ep
+ return ((m / mi) - 1)
+
+ def distance_opt1_func(self, a, b):
+ """
+        The optimization-1 cost between two (degree, count) pairs of the compressed degree sequences.
+ """
+ ep = 0.5
+ m = max(a[0], b[0]) + ep
+ mi = min(a[0], b[0]) + ep
+ return ((m / mi) - 1) * max(a[1], b[1])
+
+ def add_degree_todict(self, node_id, degree, depth, opt1):
+ """
+        Store the degree sequence of each node at the given depth in a dict.
+ """
+ if node_id not in self.degree_list:
+ self.degree_list[node_id] = dict()
+ if depth not in self.degree_list[node_id]:
+ self.degree_list[node_id][depth] = None
+ if opt1:
+ degree = np.array(np.unique(degree, return_counts=True)).T
+ self.degree_list[node_id][depth] = degree
+
+ def output_degree_with_depth(self, depth, opt1):
+ """
+        Use BFS to collect the degree sequence of every layer for each node.
+ """
+ degree_dict = dict()
+
+ for node in self.nodes:
+ start_node = node
+ cur_node = node
+ cur_dep = 0
+ flag_visit = set()
+ while cur_node is not None and cur_dep < depth:
+ if not isinstance(cur_node, list):
+ cur_node = [cur_node]
+ filter_node = []
+ for node in cur_node:
+ if node not in flag_visit:
+ flag_visit.add(node)
+ filter_node.append(node)
+ cur_node = filter_node
+ if len(cur_node) == 0:
+ break
+ outdegree = self.graph.outdegree(cur_node)
+ mask = (outdegree != 0)
+ if np.any(mask):
+ outdegree = np.sort(outdegree[mask])
+ else:
+ break
+ # save the layer degree message to dict
+ self.add_degree_todict(start_node, outdegree[mask], cur_dep,
+ opt1)
+ succes = self.graph.successor(cur_node)
+ cur_node = []
+ for succ in succes:
+ if isinstance(succ, np.ndarray):
+ cur_node.extend(succ.flatten().tolist())
+ elif isinstance(succ, int):
+ cur_node.append(succ)
+ cur_node = list(set(cur_node))
+ cur_dep += 1
+
+ def get_sim_neighbours(self, node, selected_num):
+ """
+        Select the neighbours by degree similarity.
+ """
+ degree = self.node2degree[node]
+ select_count = 0
+ node_nbh_list = list()
+ for node_nbh in self.degree2nodes[degree]:
+ if node != node_nbh:
+ node_nbh_list.append(node_nbh)
+ select_count += 1
+ if select_count > selected_num:
+ return node_nbh_list
+ degree_vec_len = len(self.degrees_sorted)
+ index_degree = self.degrees_sorted.index(degree)
+
+ index_left = -1
+ index_right = -1
+ degree_left = -1
+ degree_right = -1
+
+ if index_degree != -1 and index_degree >= 1:
+ index_left = index_degree - 1
+ if index_degree != -1 and index_degree <= degree_vec_len - 2:
+ index_right = index_degree + 1
+ if index_left == -1 and index_right == -1:
+ return node_nbh_list
+ if index_left != -1:
+ degree_left = self.degrees_sorted[index_left]
+ if index_right != -1:
+ degree_right = self.degrees_sorted[index_right]
+ select_degree = selectDegrees(degree, index_left, index_right,
+ degree_left, degree_right)
+ while True:
+ for node_nbh in self.degree2nodes[select_degree]:
+ if node_nbh != node:
+ node_nbh_list.append(node_nbh)
+ select_count += 1
+ if select_count > selected_num:
+ return node_nbh_list
+
+ if select_degree == degree_left:
+ if index_left >= 1:
+ index_left = index_left - 1
+ else:
+ index_left = -1
+
+ else:
+ if index_right <= degree_vec_len - 2:
+ index_right += 1
+ else:
+ index_right = -1
+
+ if index_left == -1 and index_right == -1:
+ return node_nbh_list
+
+ if index_left != -1:
+ degree_left = self.degrees_sorted[index_left]
+ if index_right != -1:
+ degree_right = self.degrees_sorted[index_right]
+ select_degree = selectDegrees(degree, index_left, index_right,
+ degree_left, degree_right)
+ return node_nbh_list
+
+ def calc_node_with_neighbor_dtw_opt2(self, src):
+ """
+        Compute DTW distances only to the neighbours selected by optimization 2.
+ """
+ from fastdtw import fastdtw
+ node_nbh_list = self.get_sim_neighbours(src, self.selected_nbh_nums)
+ distance = {}
+ for dist in node_nbh_list:
+ calc_layer_len = min(len(self.degree_list[src]), \
+ len(self.degree_list[dist]))
+ distance_iteration = 0.0
+ distance[src, dist] = {}
+ for layer in range(0, calc_layer_len):
+ src_layer = self.degree_list[src][layer]
+ dist_layer = self.degree_list[dist][layer]
+ weight, path = fastdtw(
+ src_layer,
+ dist_layer,
+ radius=1,
+ dist=self.distance_calc_func)
+ distance_iteration += weight
+ distance[src, dist][layer] = distance_iteration
+ return distance
+
+ def calc_node_with_neighbor_dtw(self, src_index):
+ """
+        Without optimization 2: compute the DTW distance to all remaining nodes.
+ """
+ from fastdtw import fastdtw
+ distance = {}
+ for dist_index in range(src_index + 1, self.graph.num_nodes - 1):
+ src = self.nodes[src_index]
+ dist = self.nodes[dist_index]
+ calc_layer_len = min(len(self.degree_list[src]), \
+ len(self.degree_list[dist]))
+ distance_iteration = 0.0
+ distance[src, dist] = {}
+ for layer in range(0, calc_layer_len):
+ src_layer = self.degree_list[src][layer]
+ dist_layer = self.degree_list[dist][layer]
+ weight, path = fastdtw(
+ src_layer,
+ dist_layer,
+ radius=1,
+ dist=self.distance_calc_func)
+ distance_iteration += weight
+ distance[src, dist][layer] = distance_iteration
+ return distance
+
+ def calc_distances_between_nodes(self):
+ """
+ Use the dtw algorithm to calculate the distance between nodes.
+ """
+ from fastdtw import fastdtw
+ from pathos.multiprocessing import Pool
+ # decide use which algo to use
+ if self.opt1 == True:
+ self.distance_calc_func = self.distance_opt1_func
+ else:
+ self.distance_calc_func = self.distance_func
+
+ dtws = []
+ if self.opt2:
+ depth = 0
+ for node in self.nodes:
+ if node in self.degree_list:
+ if depth in self.degree_list[node]:
+ degree = self.degree_list[node][depth]
+ if args.opt1:
+ degree = degree[0][0]
+ else:
+ degree = degree[0]
+ if degree not in self.degree2nodes:
+ self.degree2nodes[degree] = []
+ if node not in self.node2degree:
+ self.node2degree[node] = degree
+ self.degree2nodes[degree].append(node)
+ # select the log(n) node to select data
+ degree_keys = self.degree2nodes.keys()
+ degree_keys = np.array(list(degree_keys), dtype='int')
+ self.degrees_sorted = list(np.sort(degree_keys))
+ selected_nbh_nums = 2 * math.log(self.graph.num_nodes - 1, 2)
+ self.selected_nbh_nums = selected_nbh_nums
+
+ pool = Pool(10)
+ dtws = pool.map(self.calc_node_with_neighbor_dtw_opt2, self.nodes)
+ pool.close()
+ pool.join()
+ else:
+ src_indices = range(0, self.graph.num_nodes - 2)
+
+ pool = Pool(10)
+ dtws = pool.map(self.calc_node_with_neighbor_dtw, src_indices)
+ pool.close()
+ pool.join()
+ print('calc the dtw done.')
+ for dtw in dtws:
+ self.distance.update(dtw)
+
+ def normlization_layer_weight(self):
+ """
+        Normalize the layer weights between nodes: w_i = exp(-d_i) / sum_j exp(-d_j).
+ """
+ for sd_keys, layer_weight in self.distance.items():
+ src, dist = sd_keys
+ layers, weights = layer_weight.keys(), layer_weight.values()
+ for layer, weight in zip(layers, weights):
+ if layer not in self.layer_distance:
+ self.layer_distance[layer] = {}
+ if layer not in self.layer_message:
+ self.layer_message[layer] = {}
+ self.layer_distance[layer][src, dist] = weight
+
+ if src not in self.layer_message[layer]:
+ self.layer_message[layer][src] = []
+ if dist not in self.layer_message[layer]:
+ self.layer_message[layer][dist] = []
+ self.layer_message[layer][src].append(dist)
+ self.layer_message[layer][dist].append(src)
+
+ # normalization the layer weight
+ for i in range(0, self.depth):
+ layer_weight = 0.0
+ layer_count = 0
+ if i not in self.layer_norm_distance:
+ self.layer_norm_distance[i] = {}
+ if i not in self.sample_alias:
+ self.sample_alias[i] = {}
+ if i not in self.sample_events:
+ self.sample_events[i] = {}
+ if i not in self.layer_message:
+ continue
+ for node in self.nodes:
+ if node not in self.layer_message[i]:
+ continue
+ nbhs = self.layer_message[i][node]
+ weights = []
+ sum_weight = 0.0
+ for dist in nbhs:
+ if (node, dist) in self.layer_distance[i]:
+ weight = self.layer_distance[i][node, dist]
+ else:
+ weight = self.layer_distance[i][dist, node]
+ weight = np.exp(-float(weight))
+ weights.append(weight)
+ # norm the weight
+ sum_weight = sum(weights)
+ if sum_weight == 0.0:
+ sum_weight = 1.0
+ weight_list = [weight / sum_weight for weight in weights]
+ self.layer_norm_distance[i][node] = weight_list
+ alias, events = alias_sample_build_table(np.array(weight_list))
+ self.sample_alias[i][node] = alias
+ self.sample_events[i][node] = events
+ layer_weight += 1.0
+ #layer_weight += sum(weight_list)
+ layer_count += len(weights)
+ layer_avg_weight = layer_weight / (1.0 * layer_count)
+
+ self.layer_node_weight_count[i] = dict()
+ for node in self.nodes:
+ if node not in self.layer_norm_distance[i]:
+ continue
+ weight_list = self.layer_norm_distance[i][node]
+ node_cnt = 0
+ for weight in weight_list:
+ if weight > layer_avg_weight:
+ node_cnt += 1
+ self.layer_node_weight_count[i][node] = node_cnt
+
+ def choose_neighbor_alias_method(self, node, layer):
+ """
+        Choose a neighbour by alias sampling.
+ """
+ weight_list = self.layer_norm_distance[layer][node]
+ neighbors = self.layer_message[layer][node]
+ select_idx = alias_sample(1, self.sample_alias[layer][node],
+ self.sample_events[layer][node])
+ return neighbors[select_idx[0]]
+
+ def choose_layer_to_walk(self, node, layer):
+ """
+        Choose the layer for the next random-walk step.
+ """
+ random_value = random.random()
+ higher_neigbours_nums = self.layer_node_weight_count[layer][node]
+ prob = math.log(higher_neigbours_nums + math.e)
+ prob = prob / (1.0 + prob)
+ if random_value > prob:
+ if layer > 0:
+ layer = layer - 1
+ else:
+ if layer + 1 in self.layer_message and \
+ node in self.layer_message[layer + 1]:
+ layer = layer + 1
+ return layer
+
+ def executor_random_walk(self, walk_process_id):
+ """
+        The main function to execute the structural random walk.
+ """
+ nodes = self.nodes
+ random.shuffle(nodes)
+ walk_path_all_nodes = []
+ for node in nodes:
+ walk_path = []
+ walk_path.append(node)
+ layer = 0
+ while len(walk_path) < self.walk_depth:
+ prop = random.random()
+ if prop < 0.3:
+ node = self.choose_neighbor_alias_method(node, layer)
+ walk_path.append(node)
+ else:
+ layer = self.choose_layer_to_walk(node, layer)
+ walk_path_all_nodes.append(walk_path)
+ return walk_path_all_nodes
+
+ def random_walk_structual_sim(self):
+ """
+        Run the biased random walks guided by structural distance.
+ """
+ from pathos.multiprocessing import Pool
+ print('start process struc2vec random walk.')
+ walks_process_ids = [i for i in range(0, self.num_walks)]
+ pool = Pool(10)
+ walks = pool.map(self.executor_random_walk, walks_process_ids)
+ pool.close()
+ pool.join()
+
+ #save the final walk result
+ file_result = open(args.tag + "_walk_path", "w")
+ for walk in walks:
+ for walk_node in walk:
+ walk_node_str = " ".join([str(node) for node in walk_node])
+ file_result.write(walk_node_str + "\n")
+ file_result.close()
+ print('process struc2vec random walk done.')
+
+
+def learning_embedding_from_struc2vec(args):
+ """
+    Learn word2vec embeddings from the random walk paths.
+ """
+ from gensim.models import Word2Vec
+ from gensim.models.word2vec import LineSentence
+ struc_walks = LineSentence(args.tag + "_walk_path")
+ model = Word2Vec(struc_walks, size=args.w2v_emb_size, window=args.w2v_window_size, iter=args.w2v_epoch, \
+ min_count=0, hs=1, sg=1, workers=5)
+ model.wv.save_word2vec_format(args.emb_file)
+
+
+def main(args):
+ """
+    The main function to run the struc2vec algorithm.
+ """
+ if args.train:
+ dataset = EdgeDataset(
+ undirected=args.undirected, data_dir=args.edge_file)
+ graph = StrucVecGraph(dataset.graph, dataset.nodes, args.opt1, args.opt2, args.opt3, args.depth,\
+ args.num_walks, args.walk_depth)
+ graph.output_degree_with_depth(args.depth, args.opt1)
+ graph.calc_distances_between_nodes()
+ graph.normlization_layer_weight()
+ graph.random_walk_structual_sim()
+ learning_embedding_from_struc2vec(args)
+ file_label = open(args.label_file)
+ file_label_reindex = open(args.label_file + "_reindex", "w")
+ for line in file_label:
+ items = line.strip("\n\r").split(" ")
+ try:
+ items = [int(item) for item in items]
+ except:
+ continue
+ if items[0] not in dataset.node_dict:
+ continue
+ reindex = dataset.node_dict[items[0]]
+ file_label_reindex.write(str(reindex) + " " + str(items[1]) + "\n")
+ file_label_reindex.close()
+
+ if args.valid:
+ emb_file = open(args.emb_file)
+ file_label_reindex = open(args.label_file + "_reindex")
+ label_dict = dict()
+ for line in file_label_reindex:
+ items = line.strip("\n\r").split(" ")
+ try:
+ label_dict[int(items[0])] = int(items[1])
+ except:
+ continue
+
+ data_for_train_valid = []
+ for line in emb_file:
+ items = line.strip("\n\r").split(" ")
+ if len(items) <= 2:
+ continue
+ index = int(items[0])
+ label = int(label_dict[index])
+ sample = []
+ sample.append(index)
+ feature_emb = items[1:]
+ feature_emb = [float(feature) for feature in feature_emb]
+ sample.extend(feature_emb)
+ sample.append(label)
+ data_for_train_valid.append(sample)
+ train_lr_l2_model(args, data_for_train_valid)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='struc2vec')
+ parser.add_argument("--edge_file", type=str, default="")
+ parser.add_argument("--label_file", type=str, default="")
+ parser.add_argument("--emb_file", type=str, default="w2v_emb")
+ parser.add_argument("--undirected", type=bool, default=True)
+ parser.add_argument("--depth", type=int, default=8)
+ parser.add_argument("--num_walks", type=int, default=10)
+ parser.add_argument("--walk_depth", type=int, default=80)
+ parser.add_argument("--opt1", type=bool, default=False)
+ parser.add_argument("--opt2", type=bool, default=False)
+ parser.add_argument("--opt3", type=bool, default=False)
+ parser.add_argument("--w2v_emb_size", type=int, default=128)
+ parser.add_argument("--w2v_window_size", type=int, default=10)
+ parser.add_argument("--w2v_epoch", type=int, default=5)
+ parser.add_argument("--train", type=bool, default=False)
+ parser.add_argument("--valid", type=bool, default=False)
+ parser.add_argument("--lr", type=float, default=0.0001)
+ parser.add_argument("--num_class", type=int, default=4)
+ parser.add_argument("--epoch", type=int, default=2000)
+ parser.add_argument("--tag", type=str, default="")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/examples/unsup_graphsage/model.py b/examples/unsup_graphsage/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..dea1f13d9c29a70a883fc44cd94ed434f5de6583
--- /dev/null
+++ b/examples/unsup_graphsage/model.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""model.py"""
+import paddle
+import paddle.fluid as fluid
+
+
+def copy_send(src_feat, dst_feat, edge_feat):
+ """copy_send"""
+ return src_feat["h"]
+
+
+def mean_recv(feat):
+ """mean_recv"""
+ return fluid.layers.sequence_pool(feat, pool_type="average")
+
+
+def sum_recv(feat):
+ """sum_recv"""
+ return fluid.layers.sequence_pool(feat, pool_type="sum")
+
+
+def max_recv(feat):
+ """max_recv"""
+ return fluid.layers.sequence_pool(feat, pool_type="max")
+
+
+def lstm_recv(feat):
+ """lstm_recv"""
+ hidden_dim = 128
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=feat, size=hidden_dim * 4, use_peepholes=False)
+ output = fluid.layers.sequence_last_step(forward)
+ return output
+
+
+def graphsage_mean(gw, feature, hidden_size, act, name):
+ """graphsage_mean"""
+ msg = gw.send(copy_send, nfeat_list=[("h", feature)])
+ neigh_feature = gw.recv(msg, mean_recv)
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_meanpool(gw,
+ feature,
+ hidden_size,
+ act,
+ name,
+ inner_hidden_size=512):
+ """graphsage_meanpool"""
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+ msg = gw.send(copy_send, nfeat_list=[("h", neigh_feature)])
+ neigh_feature = gw.recv(msg, mean_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_maxpool(gw,
+ feature,
+ hidden_size,
+ act,
+ name,
+ inner_hidden_size=512):
+ """graphsage_maxpool"""
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+ msg = gw.send(copy_send, nfeat_list=[("h", neigh_feature)])
+ neigh_feature = gw.recv(msg, max_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
+
+
+def graphsage_lstm(gw, feature, hidden_size, act, name):
+ """graphsage_lstm"""
+ inner_hidden_size = 128
+ neigh_feature = fluid.layers.fc(feature, inner_hidden_size, act="relu")
+
+ hidden_dim = 128
+ forward_proj = fluid.layers.fc(input=neigh_feature,
+ size=hidden_dim * 4,
+ bias_attr=False,
+ name="lstm_proj")
+ msg = gw.send(copy_send, nfeat_list=[("h", forward_proj)])
+ neigh_feature = gw.recv(msg, lstm_recv)
+ neigh_feature = fluid.layers.fc(neigh_feature,
+ hidden_size,
+ act=act,
+ name=name + '_r')
+
+ self_feature = feature
+ self_feature = fluid.layers.fc(self_feature,
+ hidden_size,
+ act=act,
+ name=name + '_l')
+ output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+ output = fluid.layers.l2_normalize(output, axis=1)
+ return output
diff --git a/examples/unsup_graphsage/reader.py b/examples/unsup_graphsage/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0c8e4a17026ab1315b8dd0ea63e5c7f59d1deb5
--- /dev/null
+++ b/examples/unsup_graphsage/reader.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""reader.py"""
+import os
+import numpy as np
+import pickle as pkl
+import paddle
+import paddle.fluid as fluid
+import pgl
+import time
+from pgl.utils.logger import log
+from pgl.utils import mp_reader
+
+
+def batch_iter(data, batch_size):
+ """batch_iter"""
+ src, dst, eid = data
+ perm = np.arange(len(eid))
+ np.random.shuffle(perm)
+ start = 0
+ while start < len(src):
+ index = perm[start:start + batch_size]
+ start += batch_size
+ yield src[index], dst[index], eid[index]
+
+
+def traverse(item):
+ """traverse"""
+ if isinstance(item, list) or isinstance(item, np.ndarray):
+ for i in iter(item):
+ for j in traverse(i):
+ yield j
+ else:
+ yield item
+
+
+def flat_node_and_edge(nodes, eids):
+ """flat_node_and_edge"""
+ nodes = list(set(traverse(nodes)))
+ eids = list(set(traverse(eids)))
+ return nodes, eids
+
+
+def graph_reader(num_layers,
+ graph_wrappers,
+ data,
+ batch_size,
+ samples,
+ num_workers,
+ feed_name_list,
+ use_pyreader=False,
+ graph=None,
+ predict=False):
+ """graph_reader
+ """
+ assert num_layers == len(samples), "Must be unified number of layers!"
+ if num_workers > 1:
+ return multiprocess_graph_reader(
+ num_layers,
+ graph_wrappers,
+ data,
+ batch_size,
+ samples,
+ num_workers,
+ feed_name_list,
+ use_pyreader,
+ graph=graph,
+ predict=predict)
+
+ batch_info = list(batch_iter(data, batch_size=batch_size))
+ work = worker(
+ num_layers,
+ batch_info,
+ graph_wrappers,
+ samples,
+ feed_name_list,
+ use_pyreader,
+ graph=graph,
+ predict=predict)
+
+ def reader():
+ """reader"""
+ for batch in work():
+ yield batch
+
+ return reader
+ #return paddle.reader.buffered(reader, 100)
+
+
+def worker(num_layers, batch_info, graph_wrappers, samples, feed_name_list,
+ use_pyreader, graph, predict):
+ """worker
+ """
+ pid = os.getppid()
+ np.random.seed((int(time.time() * 10000) + pid) % 65535)
+
+ graphs = [graph, graph]
+
+ def work():
+ """work
+ """
+ feed_dict = {}
+ ind = 0
+ perm = np.arange(0, len(batch_info))
+ np.random.shuffle(perm)
+ for p in perm:
+ batch_src, batch_dst, batch_eid = batch_info[p]
+ ind += 1
+ ind_start = time.time()
+ try:
+ nodes = start_nodes = np.concatenate([batch_src, batch_dst], 0)
+ eids = []
+ layer_nodes, layer_eids = [], []
+ for layer_idx in reversed(range(num_layers)):
+ if len(start_nodes) == 0:
+ layer_nodes = [nodes] + layer_nodes
+ layer_eids = [eids] + layer_eids
+ continue
+ pred_nodes, pred_eids = graphs[
+ layer_idx].sample_predecessor(
+ start_nodes, samples[layer_idx], return_eids=True)
+ last_nodes = nodes
+ nodes, eids = flat_node_and_edge([nodes, pred_nodes],
+ [eids, pred_eids])
+ layer_nodes = [nodes] + layer_nodes
+ layer_eids = [eids] + layer_eids
+ # Find new nodes
+ start_nodes = list(set(nodes) - set(last_nodes))
+ if predict is False:
+ eids = (batch_eid * 2 + 1).tolist() + (batch_eid * 2
+ ).tolist()
+ layer_eids[0] = list(set(layer_eids[0]) - set(eids))
+
+ # layer_nodes[0]: use first layer nodes as all subgraphs' nodes
+ subgraph = graphs[0].subgraph(
+ nodes=layer_nodes[0], eid=layer_eids[0])
+ node_feat = np.array(layer_nodes[0], dtype="int64")
+ subgraph.node_feat["index"] = node_feat
+
+ except Exception as e:
+ print(e)
+ if len(feed_dict) > 0:
+ yield feed_dict
+ continue
+ feed_dict = graph_wrappers[0].to_feed(subgraph)
+
+ # only reindex from first subgraph
+ sub_src_idx = subgraph.reindex_from_parrent_nodes(batch_src)
+ sub_dst_idx = subgraph.reindex_from_parrent_nodes(batch_dst)
+
+ feed_dict["src_index"] = sub_src_idx.astype("int64")
+ feed_dict["dst_index"] = sub_dst_idx.astype("int64")
+ if predict:
+ feed_dict["node_id"] = batch_src.astype("int64")
+
+ if use_pyreader:
+ yield [feed_dict[name] for name in feed_name_list]
+ else:
+ yield feed_dict
+
+ return work
+
+
+def multiprocess_graph_reader(num_layers, graph_wrappers, data, batch_size,
+ samples, num_workers, feed_name_list,
+ use_pyreader, graph, predict):
+    """ multiprocess_graph_reader
+
+    Parallel version of graph_reader: the batches are partitioned over
+    num_workers processes and their outputs are merged with mp_reader.
+    """
+
+ def parse_to_subgraph(rd):
+ """ parse_to_subgraph
+ """
+
+ def work():
+ """ work
+ """
+ for data in rd():
+ yield data
+
+ return work
+
+ def reader():
+ """ reader
+ """
+ batch_info = list(batch_iter(data, batch_size=batch_size))
+        log.info("Number of batches: %d" % len(batch_info))
+ block_size = int(len(batch_info) / num_workers + 1)
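+        # Partition the batches into num_workers contiguous blocks; each worker
+        # process builds subgraphs for its own block, and mp_reader merges the
+        # per-worker sample streams back into a single reader.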
+ reader_pool = []
+ for i in range(num_workers):
+ reader_pool.append(
+ worker(num_layers, batch_info[block_size * i:block_size * (
+ i + 1)], graph_wrappers, samples, feed_name_list,
+ use_pyreader, graph, predict))
+ use_pipe = True
+ multi_process_sample = mp_reader.multiprocess_reader(
+ reader_pool, use_pipe=use_pipe)
+ r = parse_to_subgraph(multi_process_sample)
+ if use_pipe:
+ return paddle.reader.buffered(r, 5 * num_workers)
+ else:
+ return r
+
+ return reader()
diff --git a/examples/unsup_graphsage/sample.txt b/examples/unsup_graphsage/sample.txt
new file mode 100644
index 0000000000000000000000000000000000000000..731d556477db19ca19a0c3a7b18f584ed1a2dc90
--- /dev/null
+++ b/examples/unsup_graphsage/sample.txt
@@ -0,0 +1,2000 @@
+265 1599
+979 1790
+650 1488
+638 1310
+962 1916
+239 1958
+103 1763
+918 1874
+599 1924
+47 1691
+272 1978
+550 1583
+163 1142
+561 1458
+211 1447
+188 1529
+983 1039
+68 1923
+715 1900
+657 1555
+338 1937
+379 1409
+19 1978
+224 1420
+755 1499
+618 1172
+766 1294
+401 1188
+89 1257
+149 1048
+835 1526
+358 1858
+218 1187
+227 1022
+530 1643
+197 1255
+529 1672
+960 1558
+519 1176
+433 1093
+347 1495
+572 1877
+505 1047
+988 1587
+125 1249
+555 1942
+614 1586
+836 1681
+628 1076
+28 1693
+519 1398
+133 1136
+883 1493
+158 1441
+568 1928
+723 1585
+488 1331
+719 1471
+265 1113
+174 1799
+722 1226
+744 1467
+807 1075
+839 1393
+664 1380
+689 1552
+36 1864
+211 1611
+90 1444
+819 1428
+241 1551
+746 1599
+72 1098
+712 1787
+54 1575
+677 1485
+289 1007
+289 1079
+907 1144
+7 1983
+655 1272
+638 1047
+849 1957
+492 1278
+453 1304
+657 1807
+367 1002
+141 1346
+688 1450
+984 1749
+255 1240
+156 1625
+731 1051
+211 1922
+165 1805
+765 1054
+794 1555
+709 1747
+822 1099
+805 1774
+422 1240
+728 1679
+55 1299
+314 1808
+781 1689
+558 1605
+707 1110
+510 1705
+956 1064
+568 1132
+267 1257
+868 1269
+690 1453
+858 1602
+826 1373
+338 1650
+335 1453
+458 1340
+0 1818
+729 1694
+25 1816
+679 1109
+323 1609
+614 1457
+342 1028
+436 1081
+932 1139
+190 1821
+808 1623
+717 1267
+950 1265
+177 1956
+97 1380
+500 1744
+232 1582
+119 1015
+656 1462
+730 1007
+860 1142
+771 1989
+784 1623
+976 1084
+770 1642
+527 1515
+784 1943
+527 1578
+718 1396
+942 1089
+661 1705
+787 1800
+893 1932
+849 1395
+758 1482
+424 1148
+873 1470
+896 1333
+465 1021
+137 1507
+718 1027
+7 1045
+285 1932
+371 1468
+51 1692
+249 1358
+898 1858
+688 1213
+419 1289
+328 1326
+764 1786
+142 1399
+905 1738
+976 1295
+715 1537
+994 1393
+479 1291
+165 1560
+308 1446
+691 1728
+779 1162
+320 1989
+745 1579
+586 1426
+142 1517
+45 1317
+657 1339
+191 1780
+801 1216
+124 1414
+344 1717
+682 1383
+216 1891
+24 1759
+207 1080
+707 1699
+212 1606
+902 1435
+525 1174
+349 1299
+380 1840
+265 1294
+352 1390
+439 1410
+984 1481
+423 1499
+261 1484
+70 1033
+192 1909
+36 1960
+823 1109
+132 1418
+992 1257
+126 1548
+872 1488
+287 1645
+108 1836
+990 1314
+450 1119
+132 1549
+0 1003
+748 1373
+841 1475
+75 1987
+880 1458
+447 1443
+122 1385
+209 1022
+74 1724
+355 1688
+742 1892
+900 1092
+48 1220
+525 1221
+817 1010
+957 1212
+713 1558
+504 1851
+84 1860
+695 1187
+326 1524
+33 1647
+864 1637
+905 1637
+280 1617
+47 1034
+781 1137
+792 1319
+901 1850
+183 1511
+571 1725
+111 1957
+222 1030
+794 1169
+147 1973
+588 1789
+24 1581
+597 1471
+106 1786
+432 1146
+447 1325
+521 1444
+968 1417
+13 1075
+521 1478
+853 1294
+550 1550
+673 1426
+150 1684
+369 1737
+994 1038
+601 1397
+616 1400
+958 1028
+279 1177
+920 1180
+878 1584
+661 1852
+225 1631
+793 1401
+507 1289
+177 1818
+551 1836
+473 1065
+723 1383
+337 1938
+81 1601
+62 1139
+928 1853
+122 1946
+260 1289
+541 1378
+934 1069
+52 1311
+689 1420
+307 1862
+811 1691
+636 1885
+405 1883
+337 1132
+645 1261
+969 1224
+823 1106
+727 1066
+763 1126
+54 1168
+677 1750
+699 1223
+744 1183
+343 1883
+152 1440
+534 1665
+79 1853
+272 1581
+92 1309
+756 1884
+460 1305
+595 1868
+469 1904
+552 1067
+422 1318
+673 1843
+403 1174
+224 1445
+181 1566
+389 1618
+936 1479
+80 1002
+291 1611
+776 1201
+57 1495
+397 1053
+807 1810
+763 1374
+648 1054
+869 1432
+169 1083
+891 1318
+270 1200
+833 1663
+970 1653
+363 1637
+188 1192
+116 1751
+110 1035
+204 1216
+524 1995
+914 1426
+289 1814
+357 1521
+366 1808
+176 1775
+650 1959
+775 1062
+781 1712
+396 1798
+725 1577
+864 1497
+540 1188
+321 1623
+995 1622
+719 1299
+72 1656
+348 1728
+141 1547
+722 1095
+64 1689
+747 1143
+892 1758
+381 1463
+693 1199
+89 1555
+576 1313
+253 1809
+878 1466
+954 1776
+365 1366
+716 1351
+707 1441
+325 1167
+63 1385
+430 1225
+479 1159
+13 1185
+731 1653
+373 1529
+271 1904
+631 1111
+114 1758
+502 1983
+685 1261
+719 1932
+1 1646
+738 1698
+432 1294
+197 1463
+293 1626
+434 1457
+315 1481
+552 1877
+100 1103
+294 1569
+689 1377
+84 1142
+631 1935
+87 1508
+560 1358
+5 1787
+65 1877
+114 1948
+536 1435
+223 1753
+494 1230
+139 1335
+55 1306
+481 1253
+326 1662
+7 1171
+663 1992
+353 1586
+693 1397
+70 1498
+902 1897
+729 1627
+838 1296
+9 1528
+633 1988
+216 1535
+813 1534
+528 1061
+130 1705
+889 1019
+278 1810
+937 1399
+286 1498
+166 1574
+725 1506
+202 1018
+306 1420
+553 1717
+755 1731
+561 1619
+147 1981
+862 1065
+349 1219
+573 1137
+336 1871
+473 1511
+342 1051
+983 1181
+798 1663
+197 1930
+164 1477
+954 1083
+695 1879
+964 1046
+638 1817
+404 1886
+927 1211
+554 1115
+88 1417
+345 1165
+383 1551
+412 1484
+305 1532
+57 1380
+171 1550
+15 1082
+941 1507
+199 1774
+787 1953
+125 1398
+336 1958
+640 1851
+251 1127
+740 1306
+302 1217
+786 1014
+706 1811
+835 1851
+978 1262
+629 1944
+429 1202
+714 1954
+153 1381
+103 1759
+268 1286
+346 1808
+420 1343
+947 1467
+668 1857
+833 1736
+600 1008
+137 1649
+452 1985
+480 1545
+212 1182
+150 1726
+784 1217
+362 1595
+763 1365
+68 1395
+195 1041
+92 1599
+314 1397
+971 1003
+606 1914
+711 1706
+699 1056
+119 1593
+367 1476
+725 1098
+432 1234
+684 1255
+469 1606
+440 1086
+200 1848
+294 1144
+449 1888
+376 1225
+796 1352
+767 1447
+713 1845
+223 1333
+119 1797
+752 1927
+627 1464
+279 1488
+40 1562
+62 1149
+771 1058
+600 1911
+625 1164
+366 1416
+714 1530
+513 1935
+419 1485
+963 1665
+459 1648
+977 1522
+890 1521
+931 1566
+622 1838
+158 1958
+848 1520
+357 1275
+43 1440
+404 1772
+788 1930
+841 1832
+845 1281
+516 1121
+423 1130
+86 1619
+863 1928
+195 1789
+167 1944
+589 1093
+146 1206
+74 1133
+819 1445
+678 1004
+752 1725
+366 1604
+903 1738
+882 1858
+561 1195
+436 1980
+77 1894
+353 1879
+561 1166
+989 1964
+624 1013
+572 1704
+272 1077
+509 1242
+770 1001
+279 1392
+621 1924
+542 1766
+555 1951
+577 1598
+531 1148
+806 1401
+497 1115
+872 1309
+387 1880
+430 1485
+295 1175
+400 1774
+941 1522
+336 1032
+806 1873
+576 1422
+566 1974
+241 1847
+215 1645
+670 1804
+831 1834
+734 1091
+16 1641
+952 1975
+299 1587
+442 1032
+702 1341
+570 1405
+633 1651
+444 1731
+980 1774
+381 1729
+900 1661
+875 1274
+968 1095
+894 1805
+683 1961
+130 1549
+963 1350
+817 1864
+190 1281
+91 1657
+208 1194
+621 1911
+447 1338
+538 1343
+234 1534
+765 1920
+632 1263
+96 1090
+121 1659
+47 1975
+856 1354
+601 1061
+480 1236
+808 1487
+866 1999
+861 1892
+667 1124
+425 1307
+90 1002
+725 1337
+134 1749
+272 1587
+567 1276
+43 1332
+715 1084
+967 1477
+62 1731
+244 1540
+317 1112
+893 1108
+242 1443
+688 1544
+937 1475
+761 1912
+994 1219
+827 1193
+420 1966
+109 1691
+482 1767
+564 1146
+372 1215
+954 1348
+422 1045
+987 1040
+471 1247
+919 1824
+190 1615
+874 1879
+251 1198
+611 1575
+121 1733
+596 1950
+791 1492
+504 1201
+153 1680
+719 1967
+964 1095
+889 1106
+732 1770
+967 1631
+351 1061
+912 1835
+911 1925
+501 1502
+810 1406
+948 1718
+928 1080
+384 1940
+330 1301
+143 1081
+412 1649
+686 1840
+178 1544
+266 1121
+528 1714
+296 1156
+220 1753
+726 1679
+126 1416
+364 1424
+625 1539
+721 1708
+805 1639
+384 1157
+553 1693
+570 1877
+511 1984
+774 1254
+354 1949
+823 1162
+281 1204
+657 1774
+578 1943
+902 1764
+859 1063
+543 1845
+815 1052
+430 1118
+22 1210
+477 1586
+872 1692
+478 1943
+630 1850
+928 1247
+893 1126
+757 1774
+133 1275
+740 1101
+117 1200
+931 1120
+259 1184
+16 1782
+447 1131
+637 1498
+472 1859
+760 1877
+303 1511
+903 1074
+795 1227
+398 1450
+28 1339
+428 1891
+476 1680
+934 1409
+78 1737
+467 1075
+126 1830
+0 1421
+783 1357
+584 1061
+139 1166
+122 1768
+735 1219
+202 1684
+867 1405
+619 1176
+843 1833
+553 1239
+287 1080
+373 1780
+65 1816
+227 1871
+45 1701
+38 1281
+46 1077
+911 1708
+137 1478
+20 1550
+822 1631
+831 1527
+13 1001
+509 1096
+31 1751
+196 1123
+379 1614
+777 1288
+364 1222
+478 1070
+460 1580
+986 1340
+696 1498
+679 1139
+713 1343
+91 1691
+602 1696
+377 1770
+253 1021
+957 1179
+500 1423
+487 1281
+821 1652
+180 1122
+443 1247
+583 1289
+676 1258
+781 1693
+718 1500
+832 1662
+555 1029
+575 1595
+145 1801
+471 1769
+491 1388
+269 1241
+159 1428
+631 1698
+478 1268
+925 1141
+583 1096
+759 1592
+967 1352
+862 1444
+119 1991
+534 1602
+526 1226
+880 1614
+236 1615
+448 1600
+752 1041
+25 1127
+445 1853
+414 1058
+127 1913
+512 1080
+158 1522
+787 1287
+664 1744
+914 1335
+899 1630
+187 1279
+951 1942
+884 1777
+529 1937
+395 1590
+478 1066
+790 1518
+286 1614
+640 1528
+882 1707
+102 1303
+716 1794
+919 1605
+859 1759
+236 1321
+858 1608
+732 1506
+435 1263
+93 1508
+813 1260
+640 1668
+607 1185
+402 1039
+943 1569
+523 1415
+511 1786
+637 1934
+10 1885
+507 1375
+544 1988
+709 1537
+342 1717
+324 1393
+216 1090
+788 1753
+362 1308
+64 1576
+811 1726
+555 1636
+944 1715
+259 1251
+141 1888
+48 1290
+570 1331
+957 1104
+223 1233
+494 1531
+423 1433
+151 1266
+704 1002
+694 1685
+740 1001
+174 1537
+947 1359
+49 1891
+875 1386
+274 1621
+918 1610
+631 1564
+961 1960
+702 1642
+871 1489
+384 1642
+932 1559
+886 1097
+842 1143
+950 1971
+83 1986
+944 1135
+168 1923
+900 1611
+684 1389
+540 1749
+123 1265
+673 1617
+952 1921
+767 1401
+696 1941
+868 1536
+515 1953
+438 1757
+430 1411
+661 1193
+527 1882
+147 1145
+225 1101
+710 1671
+579 1255
+30 1920
+906 1298
+333 1635
+214 1127
+362 1189
+878 1530
+808 1842
+419 1559
+861 1291
+743 1043
+333 1257
+186 1604
+141 1957
+751 1236
+573 1937
+908 1460
+627 1155
+726 1885
+332 1888
+267 1040
+28 1660
+194 1200
+971 1788
+861 1122
+582 1397
+176 1091
+397 1678
+730 1307
+309 1860
+881 1255
+701 1068
+750 1103
+755 1843
+834 1786
+900 1837
+433 1601
+897 1464
+593 1661
+451 1638
+953 1101
+122 1123
+220 1792
+35 1933
+726 1751
+715 1411
+662 1307
+197 1322
+125 1658
+478 1700
+772 1881
+547 1822
+910 1280
+924 1933
+79 1740
+466 1567
+53 1768
+500 1502
+572 1048
+751 1194
+18 1187
+374 1480
+158 1135
+712 1686
+171 1466
+25 1036
+144 1847
+664 1937
+301 1129
+641 1880
+147 1709
+885 1911
+631 1910
+338 1914
+628 1257
+909 1333
+970 1790
+971 1691
+260 1724
+693 1946
+857 1056
+918 1053
+612 1838
+479 1407
+626 1359
+273 1709
+633 1008
+364 1434
+393 1873
+294 1300
+657 1988
+355 1639
+635 1468
+914 1350
+916 1148
+305 1381
+131 1748
+756 1484
+758 1203
+825 1062
+152 1209
+441 1164
+63 1885
+864 1797
+165 1036
+124 1548
+246 1053
+810 1398
+127 1091
+277 1028
+860 1069
+700 1933
+338 1962
+211 1770
+809 1483
+489 1507
+123 1382
+669 1030
+180 1996
+972 1922
+723 1670
+647 1683
+422 1440
+391 1204
+178 1071
+421 1598
+729 1466
+339 1403
+419 1326
+407 1011
+479 1867
+722 1076
+662 1802
+110 1438
+759 1868
+22 1458
+725 1648
+958 1753
+814 1656
+673 1044
+962 1020
+475 1523
+882 1513
+802 1227
+863 1121
+772 1677
+714 1072
+112 1047
+422 1664
+419 1718
+60 1864
+570 1683
+536 1673
+581 1789
+894 1074
+739 1311
+805 1863
+861 1750
+55 1748
+47 1833
+101 1108
+872 1008
+926 1907
+909 1021
+53 1233
+617 1349
+674 1909
+507 1567
+855 1723
+690 1171
+973 1859
+686 1210
+49 1435
+146 1915
+357 1620
+208 1724
+76 1583
+133 1191
+619 1426
+190 1497
+228 1868
+365 1144
+360 1770
+329 1142
+672 1408
+91 1997
+986 1299
+654 1333
+93 1475
+146 1307
+62 1772
+502 1058
+382 1427
+181 1739
+74 1104
+170 1684
+466 1861
+147 1747
+162 1027
+499 1903
+813 1621
+591 1379
+227 1518
+110 1999
+781 1791
+415 1744
+257 1846
+942 1601
+628 1696
+317 1001
+27 1681
+80 1078
+794 1279
+330 1237
+830 1994
+728 1673
+204 1943
+295 1422
+159 1499
+207 1019
+110 1497
+439 1526
+201 1323
+620 1723
+501 1157
+305 1604
+878 1784
+483 1653
+262 1539
+21 1967
+191 1836
+199 1821
+500 1910
+232 1499
+104 1750
+868 1607
+288 1013
+434 1368
+874 1055
+870 1257
+219 1143
+990 1924
+70 1764
+207 1575
+1 1364
+405 1498
+414 1507
+65 1704
+868 1415
+256 1962
+886 1425
+834 1587
+770 1842
+74 1070
+778 1750
+550 1592
+484 1948
+669 1401
+610 1909
+480 1784
+182 1147
+842 1670
+272 1923
+371 1407
+574 1985
+978 1300
+369 1286
+884 1459
+322 1261
+456 1418
+261 1718
+330 1708
+83 1249
+473 1188
+542 1281
+551 1262
+801 1288
+372 1574
+676 1927
+44 1222
+190 1020
+284 1513
+866 1845
+828 1977
+620 1854
+288 1086
+367 1606
+71 1770
+114 1316
+571 1850
+224 1272
+406 1095
+902 1571
+576 1886
+576 1562
+767 1443
+644 1201
+295 1009
+944 1751
+90 1708
+663 1042
+283 1708
+758 1027
+851 1684
+537 1204
+271 1697
+541 1885
+973 1218
+694 1904
+822 1999
+194 1872
+276 1297
+909 1886
+312 1706
+516 1473
+844 1236
+62 1617
+366 1866
+127 1474
+743 1215
+286 1096
+87 1795
+69 1711
+757 1530
+333 1844
+257 1796
+515 1491
+66 1851
+117 1510
+18 1967
+553 1979
+267 1060
+99 1321
+861 1155
+506 1067
+944 1727
+964 1171
+329 1159
+856 1018
+858 1931
+765 1617
+951 1457
+903 1184
+241 1717
+285 1533
+320 1286
+409 1400
+924 1999
+719 1501
+14 1550
+866 1246
+86 1987
+868 1551
+620 1495
+285 1918
+810 1733
+754 1871
+755 1418
+394 1528
+839 1856
+927 1964
+321 1381
+758 1337
+635 1986
+404 1038
+854 1124
+600 1507
+342 1517
+756 1567
+498 1350
+944 1048
+481 1899
+904 1335
+412 1492
+218 1021
+636 1556
+417 1354
+116 1960
+173 1267
+525 1086
+312 1389
+973 1064
+619 1103
+987 1394
+447 1188
+862 1969
+930 1485
+419 1157
+756 1787
+860 1821
+58 1662
+353 1437
+345 1290
+753 1889
+412 1688
+37 1319
+753 1201
+136 1253
+949 1592
+459 1756
+976 1522
+450 1868
+936 1384
+393 1653
+385 1936
+704 1840
+616 1709
+786 1438
+291 1830
+848 1112
+975 1595
+967 1231
+741 1672
+160 1217
+254 1634
+530 1610
+0 1445
+170 1236
+164 1316
+127 1330
+302 1627
+953 1449
+156 1583
+784 1210
+226 1551
+397 1325
+564 1825
+42 1027
+725 1612
+114 1802
+483 1384
+684 1352
+463 1908
+978 1226
+445 1217
+800 1969
+556 1274
+49 1049
+777 1808
+732 1982
+749 1590
+574 1433
+462 1515
+637 1702
+344 1224
+489 1586
+45 1242
+755 1144
+716 1293
+319 1595
+831 1657
+154 1562
+396 1814
+657 1704
+442 1405
+898 1698
+970 1287
+967 1068
+25 1761
+211 1183
+691 1905
+466 1116
+99 1521
+834 1871
+408 1809
+8 1007
+483 1336
+485 1896
+849 1467
+192 1341
+779 1801
+678 1596
+276 1051
+709 1252
+759 1656
+27 1621
+273 1911
+697 1898
+450 1995
+688 1717
+52 1966
+920 1957
+437 1549
+533 1627
+130 1315
+392 1676
+73 1886
+650 1254
+352 1079
+165 1930
+388 1236
+426 1370
+625 1648
+457 1858
+17 1109
+926 1431
+853 1530
+90 1766
+586 1275
+894 1244
+331 1469
+447 1183
+132 1167
+230 1198
+501 1240
+440 1100
+58 1665
+85 1864
+913 1448
+738 1041
+486 1012
+162 1767
+877 1060
+10 1485
+514 1807
+224 1453
+781 1340
+311 1645
+720 1837
+259 1252
+54 1174
+788 1926
+375 1440
+23 1880
+977 1632
+389 1445
+38 1508
+517 1927
+798 1598
+483 1391
+541 1788
+46 1329
+816 1758
+158 1317
+900 1577
+369 1255
+227 1795
+37 1630
+813 1565
+965 1663
+953 1963
+503 1221
+223 1064
+161 1498
+717 1855
+527 1349
+773 1813
+522 1630
+767 1275
+582 1305
+541 1563
+79 1403
+794 1544
+74 1161
+548 1543
+18 1739
+516 1516
+697 1422
+259 1840
+195 1273
+412 1222
+571 1301
+203 1914
+420 1256
+327 1277
+894 1315
+929 1302
+773 1429
+302 1309
+488 1728
+403 1256
+549 1342
+940 1764
+524 1226
+409 1076
+233 1421
+753 1667
+664 1257
+359 1079
+291 1973
+199 1373
+654 1498
+645 1074
+481 1607
+432 1852
+692 1206
+498 1726
+586 1249
+555 1338
+107 1563
+473 1300
+51 1031
+345 1236
+757 1907
+548 1088
+680 1430
+349 1468
+435 1451
+884 1301
+683 1645
+280 1388
+84 1393
+585 1561
+86 1338
+261 1972
+941 1523
+306 1697
+718 1192
+930 1121
+726 1639
+617 1399
+939 1184
+511 1084
+832 1662
+377 1881
+371 1725
+393 1653
+415 1528
+254 1572
+927 1447
+848 1355
+797 1983
+613 1417
+127 1835
+715 1471
+974 1999
+355 1178
+675 1820
+415 1601
+593 1186
+648 1907
+922 1931
+859 1828
+110 1809
+547 1809
+944 1841
+106 1446
+635 1762
+866 1431
+199 1373
+595 1454
+991 1626
+903 1720
+989 1465
+509 1506
+168 1653
+742 1892
+644 1457
+972 1046
+87 1807
+79 1596
+24 1470
+313 1732
+772 1976
+226 1146
+835 1835
+107 1057
+430 1719
+203 1810
+643 1477
+30 1918
+889 1216
+750 1501
+180 1660
+71 1463
+966 1588
+261 1858
+829 1804
+774 1379
+342 1765
+328 1943
+296 1939
+937 1444
+628 1407
+0 1977
+233 1097
+359 1438
+910 1911
+963 1026
+942 1483
+706 1997
+682 1974
+900 1513
+298 1463
+893 1855
+322 1360
+604 1122
+948 1091
+828 1158
+682 1198
+466 1781
+661 1031
+884 1744
+891 1299
+688 1266
+89 1325
+3 1026
+299 1861
+413 1062
+775 1812
+560 1926
+799 1473
+936 1445
+537 1718
+591 1680
+202 1140
+906 1163
+977 1709
+482 1904
+345 1181
+486 1502
+445 1292
+305 1328
+87 1851
+803 1197
+94 1937
+574 1546
+643 1302
+704 1633
+536 1238
+329 1663
+737 1969
+663 1278
+335 1416
+873 1390
+705 1607
+139 1436
+740 1904
+974 1321
+338 1350
+694 1456
+779 1035
+639 1238
+603 1768
+245 1363
+390 1329
+141 1680
+483 1613
+226 1632
+820 1303
+424 1655
+54 1618
+399 1297
+130 1295
+169 1996
+78 1455
+525 1409
+741 1860
+887 1664
+347 1878
+391 1343
+66 1243
+287 1876
+35 1750
+492 1261
+789 1404
+917 1041
+937 1756
+69 1239
+218 1981
+142 1382
+882 1052
+757 1290
+178 1593
+962 1504
+781 1090
+648 1912
+207 1551
+472 1372
+937 1427
+37 1270
+511 1721
+208 1491
+299 1193
+167 1718
+781 1100
+689 1177
+732 1202
+852 1665
+556 1152
+256 1908
+261 1473
+918 1941
+755 1786
+77 1062
+208 1633
+451 1502
+181 1513
+311 1571
+240 1404
+470 1720
+913 1239
+947 1553
+706 1158
+215 1968
+912 1213
+684 1117
+560 1825
+787 1083
+764 1654
+566 1252
+238 1959
+953 1954
+985 1437
+835 1434
+88 1896
+469 1447
+655 1672
+760 1631
+919 1516
+683 1698
+811 1123
+911 1961
+302 1273
+344 1399
+89 1289
+936 1236
+395 1575
+417 1981
+10 1115
+878 1839
+213 1171
+484 1475
+460 1901
+708 1299
+320 1544
+965 1375
+451 1144
+116 1959
+143 1384
+843 1051
+368 1953
+994 1141
+704 1641
+385 1729
+240 1851
+967 1306
+719 1878
+726 1439
+550 1613
+261 1660
+550 1511
+154 1782
+12 1087
+328 1120
+618 1763
+422 1667
+519 1854
+639 1719
+942 1705
+814 1893
+576 1491
+139 1499
+422 1956
+95 1082
+676 1262
+287 1965
+60 1867
+713 1444
+435 1021
+606 1042
+86 1891
+58 1035
+311 1320
+140 1463
+82 1415
+756 1991
+505 1140
+510 1982
+701 1579
+428 1787
+388 1279
+446 1709
+222 1060
+550 1363
+798 1691
+219 1181
+137 1225
+828 1955
+721 1417
+82 1675
+854 1649
+203 1355
+352 1560
+582 1633
+118 1858
+771 1304
+321 1251
+392 1206
+958 1070
+684 1713
+939 1999
+592 1726
+56 1867
+592 1988
+736 1842
+958 1559
+989 1906
+183 1749
+462 1407
+294 1890
+771 1725
+1 1897
+49 1062
+124 1558
+575 1327
+506 1243
+154 1403
+672 1573
+423 1160
+222 1950
+67 1904
+664 1802
+585 1438
+327 1353
+284 1803
+369 1251
+291 1294
+61 1509
+551 1861
+938 1061
+765 1678
+509 1323
+145 1822
+887 1975
+768 1646
+610 1140
+690 1793
+763 1262
+96 1287
+837 1876
+632 1819
+747 1141
+71 1442
+561 1709
+290 1050
+514 1106
+87 1416
+762 1666
+83 1070
+467 1271
+7 1152
+472 1509
+861 1016
+913 1109
+934 1154
+288 1197
+175 1244
+588 1960
+316 1946
+543 1882
+359 1614
+465 1779
+892 1726
+695 1531
+542 1461
+288 1190
+966 1558
+736 1064
+997 1750
+885 1427
+888 1064
+342 1553
+77 1234
+845 1636
+407 1181
+354 1114
+670 1836
+69 1065
+12 1432
+982 1944
+837 1518
+231 1274
+2 1155
+423 1136
+377 1012
+353 1203
+257 1205
+350 1753
+479 1238
+324 1619
+705 1382
+236 1249
+695 1195
+213 1906
+231 1368
+819 1392
+509 1785
+661 1546
+210 1123
+873 1301
+363 1029
+216 1998
+240 1351
+667 1195
+515 1136
+230 1779
+385 1750
+574 1432
+435 1830
+804 1902
+249 1360
+303 1158
+969 1732
+249 1526
+159 1575
+139 1833
+347 1342
+661 1731
+887 1859
+19 1001
+748 1763
+829 1878
+828 1086
+835 1791
+895 1387
+326 1003
+568 1049
+485 1750
+760 1171
+414 1394
+987 1379
+851 1857
+8 1594
+76 1655
+363 1189
+90 1630
+976 1005
+57 1457
+886 1166
+29 1658
+543 1710
+379 1142
+499 1112
+177 1843
+746 1808
+454 1523
+676 1465
+762 1980
+309 1286
+74 1330
+359 1949
+781 1590
+874 1658
+455 1770
+790 1487
+651 1249
+855 1143
+386 1439
+298 1007
+2 1028
+217 1428
+318 1191
+968 1588
+5 1329
+625 1475
+140 1718
+401 1543
+936 1260
+311 1625
+711 1886
+832 1395
+114 1259
+782 1156
+434 1891
+539 1855
+448 1748
+199 1518
+735 1380
+908 1798
+301 1759
+876 1155
+63 1637
+739 1461
+558 1305
+533 1177
+801 1914
+97 1422
+423 1377
+920 1775
+215 1512
+691 1628
+905 1824
+540 1573
+567 1285
+573 1665
diff --git a/examples/unsup_graphsage/train.py b/examples/unsup_graphsage/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5655d61ea32b60f4a8abfa5f283595c8a236ac6
--- /dev/null
+++ b/examples/unsup_graphsage/train.py
@@ -0,0 +1,389 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""train.py
+"""
+import argparse
+import time
+import glob
+import os
+
+import numpy as np
+
+import pgl
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle
+import paddle.fluid as fluid
+import tqdm
+
+import reader
+import model
+
+
+def get_layer(layer_type, gw, feature, hidden_size, act, name, is_test=False):
+ """get_layer"""
+ return getattr(model, layer_type)(gw, feature, hidden_size, act, name)
+
+
+def load_pos_neg(data_path):
+ """load_pos_neg"""
+ train_eid = []
+ train_src = []
+ train_dst = []
+ with open(data_path) as f:
+ eid = 0
+ for idx, line in tqdm.tqdm(enumerate(f)):
+            # whitespace-separated "src dst" pairs (see sample.txt)
+            src, dst = line.strip().split()
+ train_src.append(int(src))
+ train_dst.append(int(dst))
+ train_eid.append(int(eid))
+ eid += 1
+    # pack the edge endpoints and edge ids into numpy arrays
+ train_eid = np.array(train_eid, dtype="int64")
+ train_src = np.array(train_src, dtype="int64")
+ train_dst = np.array(train_dst, dtype="int64")
+
+ returns = {"train_data": (train_src, train_dst, train_eid), }
+ return returns
+
+
+def binary_op(u_embed, v_embed, binary_op_type):
+ """binary_op"""
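+    # A minimal usage sketch (u_embed / v_embed are assumed to be fluid tensors
+    # of the same shape, e.g. gathered src / dst node embeddings):
+    #   edge_embed = binary_op(src_feat, dst_feat, "Hadamard")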
+ if binary_op_type == "Average":
+ edge_embed = (u_embed + v_embed) / 2
+ elif binary_op_type == "Hadamard":
+ edge_embed = u_embed * v_embed
+ elif binary_op_type == "Weighted-L1":
+ edge_embed = fluid.layers.abs(u_embed - v_embed)
+ elif binary_op_type == "Weighted-L2":
+ edge_embed = (u_embed - v_embed) * (u_embed - v_embed)
+ else:
+        raise ValueError(binary_op_type + " binary_op_type doesn't exist")
+ return edge_embed
+
+
+class RetDict(object):
+ """RetDict"""
+ pass
+
+
+def build_graph_model(args):
+ """build_graph_model"""
+ node_feature_info = [('index', [None], np.dtype('int64'))]
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ graph_wrappers = []
+ feed_list = []
+
+ graph_wrappers.append(
+ pgl.graph_wrapper.GraphWrapper(
+ "layer_0", fluid.CPUPlace(), node_feat=node_feature_info))
+ #edge_feat=[("f", [None, 1], "float32")]))
+
+ num_embed = args.num_nodes
+
+ num_layers = args.num_layers
+
+ src_index = fluid.layers.data(
+ "src_index", shape=[None], dtype="int64", append_batch_size=False)
+
+ dst_index = fluid.layers.data(
+ "dst_index", shape=[None], dtype="int64", append_batch_size=False)
+
+ feature = fluid.layers.embedding(
+ input=fluid.layers.reshape(graph_wrappers[0].node_feat['index'],
+ [-1, 1]),
+ size=(num_embed + 1, args.hidden_size),
+ is_sparse=args.is_sparse,
+ is_distributed=args.is_distributed)
+
+ features = [feature]
+ ret_dict = RetDict()
+ ret_dict.graph_wrappers = graph_wrappers
+ edge_data = [src_index, dst_index]
+ feed_list.extend(edge_data)
+ ret_dict.feed_list = feed_list
+
+ for i in range(num_layers):
+ if i == num_layers - 1:
+ act = None
+ else:
+ act = "leaky_relu"
+ feature = get_layer(
+ args.layer_type,
+ graph_wrappers[0],
+ feature,
+ args.hidden_size,
+ act,
+ name="%s_%s" % (args.layer_type, i))
+ features.append(feature)
+
+ src_feat = fluid.layers.gather(features[-1], src_index)
+ src_feat = fluid.layers.fc(src_feat,
+ args.hidden_size,
+ bias_attr=None,
+ param_attr=fluid.ParamAttr(name="feat"))
+ dst_feat = fluid.layers.gather(features[-1], dst_index)
+ dst_feat = fluid.layers.fc(dst_feat,
+ args.hidden_size,
+ bias_attr=None,
+ param_attr=fluid.ParamAttr(name="feat"))
+ if args.phase == "predict":
+ node_id = fluid.layers.data(
+ "node_id", shape=[None, 1], dtype="int64", append_batch_size=False)
+ ret_dict.src_feat = src_feat
+ ret_dict.dst_feat = dst_feat
+ ret_dict.id = node_id
+ return ret_dict
+
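+    # Descriptive note on the loss below: cos_theta is the
+    # [batch_size, batch_size] score matrix between every src and every dst in
+    # the batch. The one-hot labels mark the diagonal (the true pairs) as
+    # positives and treat the other batch_size - 1 dst nodes of each row as
+    # in-batch negatives; batch_loss_weight rescales them so positives and
+    # negatives contribute equally to the loss.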
+ batch_size = args.batch_size
+ batch_negative_label = fluid.layers.reshape(
+ fluid.layers.range(0, batch_size, 1, "int64"), [-1, 1])
+ batch_negative_label = fluid.layers.one_hot(batch_negative_label,
+ batch_size)
+ batch_loss_weight = (batch_negative_label *
+ (batch_size - 2) + 1.0) / (batch_size - 1)
+ batch_loss_weight.stop_gradient = True
+ batch_negative_label = fluid.layers.cast(
+ batch_negative_label, dtype="float32")
+ batch_negative_label.stop_gradient = True
+
+ cos_theta = fluid.layers.matmul(src_feat, dst_feat, transpose_y=True)
+
+ # Calc Loss
+ loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+ x=cos_theta, label=batch_negative_label)
+ loss = loss * batch_loss_weight
+ #loss = fluid.layers.reduce_sum(loss, -1)
+ loss = fluid.layers.mean(loss)
+
+ # Calc AUC
+ proba = fluid.layers.sigmoid(cos_theta)
+ proba = fluid.layers.reshape(proba, [-1, 1])
+ proba = fluid.layers.concat([proba * -1 + 1, proba], axis=1)
+ gold_label = fluid.layers.reshape(batch_negative_label, [-1, 1])
+ gold_label = fluid.layers.cast(gold_label, "int64")
+ auc, batch_auc_out, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] = \
+ fluid.layers.auc(input=proba, label=gold_label, curve='ROC', )
+
+ ret_dict.loss = loss
+ ret_dict.auc = batch_auc_out
+ return ret_dict
+
+
+def run_epoch(
+ py_reader,
+ exe,
+ program,
+ prefix,
+ model_dict,
+ epoch,
+ batch_size,
+ log_per_step=100,
+ save_per_step=10000, ):
+ """run_epoch"""
+ batch = 0
+ start = time.time()
+
+ batch_end = time.time()
+
+ for batch_feed_dict in py_reader():
+ if prefix == "train":
+ if batch_feed_dict["src_index"].shape[0] != batch_size:
+                log.warning(
+                    'batch_feed_dict["src_index"].shape[0] != %d, skip this batch'
+                    % batch_size)
+ continue
+ batch_start = time.time()
+ batch += 1
+ batch_loss, batch_auc = exe.run(
+ program,
+ feed=batch_feed_dict,
+ fetch_list=[model_dict.loss.name, model_dict.auc.name])
+
+ batch_end = time.time()
+ if batch % log_per_step == 0:
+ log.info(
+ "Batch %s %s-Loss %s \t %s-Auc %s \t Speed(per batch) %.5lf sec"
+ % (batch, prefix, np.mean(batch_loss), prefix,
+ np.mean(batch_auc), batch_end - batch_start))
+ if batch != 0 and batch % save_per_step == 0:
+ fluid.io.save_params(
+ exe, dirname='checkpoint', main_program=program)
+ fluid.io.save_params(exe, dirname='checkpoint', main_program=program)
+
+
+def run_predict_epoch(py_reader,
+ exe,
+ program,
+ prefix,
+ model_dict,
+ num_nodes,
+ hidden_size,
+ log_per_step=100):
+ """run_predict_epoch"""
+ batch = 0
+ start = time.time()
+    # TODO: could use the parallel executor to speed this up
+ batch_end = time.time()
+ all_feat = np.zeros((num_nodes, hidden_size), dtype="float32")
+
+ for batch_feed_dict in tqdm.tqdm(py_reader()):
+ batch_start = time.time()
+ batch += 1
+ batch_src_feat, batch_id = exe.run(
+ program,
+ feed=batch_feed_dict,
+ fetch_list=[model_dict.src_feat.name, model_dict.id.name])
+
+        for ind, nid in enumerate(batch_id):
+            all_feat[nid] = batch_src_feat[ind]
+ np.save("emb.npy", all_feat)
+
+
+def main(args):
+ """main"""
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+
+ with fluid.program_guard(train_program, startup_program):
+ ret_dict = build_graph_model(args=args)
+
+ val_program = train_program.clone(for_test=True)
+ if args.phase == "train":
+ with fluid.program_guard(train_program, startup_program):
+ adam = fluid.optimizer.Adam(learning_rate=args.lr)
+ adam.minimize(ret_dict.loss)
+ # reset the place according to role of parameter server
+ exe.run(startup_program)
+
+ with open(args.data_path) as f:
+ log.info("Begin Load Graph")
+ src = []
+ dst = []
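+        # Each input pair is inserted in both directions, so the graph is
+        # effectively undirected and edge ids 2*i / 2*i + 1 map back to input
+        # line i (reader.py relies on this when it removes training edges).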
+ for idx, line in tqdm.tqdm(enumerate(f)):
+ s, d = line.strip().split()
+ src.append(s)
+ dst.append(d)
+ dst.append(s)
+ src.append(d)
+ src = np.array(src, dtype="int64").reshape(-1, 1)
+ dst = np.array(dst, dtype="int64").reshape(-1, 1)
+ edges = np.hstack([src, dst])
+
+ log.info("Begin Build Index")
+ ret_dict.graph = pgl.graph.Graph(num_nodes=args.num_nodes, edges=edges)
+ ret_dict.graph.indegree()
+ log.info("End Build Index")
+
+ if args.phase == "train":
+ #just the worker, load the sample
+ data = load_pos_neg(args.data_path)
+
+ feed_name_list = [var.name for var in ret_dict.feed_list]
+ train_iter = reader.graph_reader(
+ args.num_layers,
+ ret_dict.graph_wrappers,
+ batch_size=args.batch_size,
+ data=data['train_data'],
+ samples=args.samples,
+ num_workers=args.sample_workers,
+ feed_name_list=feed_name_list,
+ use_pyreader=args.use_pyreader,
+ graph=ret_dict.graph)
+
+ # get PyReader
+ for epoch in range(args.epoch):
+ epoch_start = time.time()
+ try:
+ run_epoch(
+ train_iter,
+ program=train_program,
+ exe=exe,
+ prefix="train",
+ model_dict=ret_dict,
+ epoch=epoch,
+ batch_size=args.batch_size,
+ log_per_step=10)
+ epoch_end = time.time()
+                print("Epoch: {0}, Train total time: {1:.2f}s".format(
+                    epoch, epoch_end - epoch_start))
+ except Exception as e:
+ log.info("Run Epoch Error %s" % e)
+ fluid.io.save_params(
+ exe,
+ dirname=args.checkpoint + '_%s' % epoch,
+ main_program=train_program)
+
+ log.info("EPOCH END")
+
+ log.info("RUN FINISH")
+ elif args.phase == "predict":
+ fluid.io.load_params(
+ exe,
+ dirname=args.checkpoint + '_%s' % args.epoch,
+ main_program=val_program)
+ test_src = np.arange(0, args.num_nodes, dtype="int64")
+ feed_name_list = [var.name for var in ret_dict.feed_list]
+ predict_iter = reader.graph_reader(
+ args.num_layers,
+ ret_dict.graph_wrappers,
+ batch_size=args.batch_size,
+            data=(test_src, test_src, test_src),
+ samples=args.samples,
+ num_workers=args.sample_workers,
+ feed_name_list=feed_name_list,
+ use_pyreader=args.use_pyreader,
+ graph=ret_dict.graph,
+ predict=True)
+ run_predict_epoch(
+ predict_iter,
+ program=val_program,
+ exe=exe,
+ prefix="predict",
+ hidden_size=args.hidden_size,
+ model_dict=ret_dict,
+ num_nodes=args.num_nodes,
+ log_per_step=100)
+ log.info("EPOCH END")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='graphsage')
+ parser.add_argument(
+ "--use_cuda", action='store_true', help="use_cuda", default=False)
+ parser.add_argument("--layer_type", type=str, default="graphsage_mean")
+ parser.add_argument("--epoch", type=int, default=1)
+ parser.add_argument("--hidden_size", type=int, default=128)
+ parser.add_argument("--batch_size", type=int, default=1024)
+ parser.add_argument("--lr", type=float, default=0.001)
+ parser.add_argument("--num_layers", type=int, default=2)
+ parser.add_argument("--data_path", type=str, required=True)
+ parser.add_argument("--checkpoint", type=str, default="model_ckpt")
+ parser.add_argument("--cache_path", type=str, default="./tmp")
+ parser.add_argument("--phase", type=str, default="train")
+ parser.add_argument("--digraph", action='store_true', default=False)
+ parser.add_argument('--samples', nargs='+', type=int, default=[10, 10])
+ parser.add_argument("--sample_workers", type=int, default=10)
+ parser.add_argument("--num_nodes", type=int, required=True)
+ parser.add_argument("--is_sparse", action='store_true', default=False)
+ parser.add_argument("--is_distributed", action='store_true', default=False)
+ parser.add_argument("--real_graph", action='store_true', default=True)
+ parser.add_argument("--use_pyreader", action='store_true', default=False)
+ args = parser.parse_args()
+ log.info(args)
+ main(args)
diff --git a/pgl/__init__.py b/pgl/__init__.py
index ea39debf0ee35f7e73097352b760682495a5c3a3..15c23cc633540a53b2b445a3012f2669319ba5d4 100644
--- a/pgl/__init__.py
+++ b/pgl/__init__.py
@@ -13,8 +13,9 @@
# limitations under the License.
"""Generate pgl apis
"""
-__version__ = "0.1.0.beta"
+__version__ = "1.0.0"
from pgl import layers
from pgl import graph_wrapper
from pgl import graph
from pgl import data_loader
+from pgl import contrib
diff --git a/pgl/contrib/__init__.py b/pgl/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dcd918aebbc3ce1cc9ce77e3e742985d223c39f
--- /dev/null
+++ b/pgl/contrib/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generate Contrib api
+"""
+
+from pgl.contrib import heter_graph
+from pgl.contrib import heter_graph_wrapper
diff --git a/pgl/contrib/heter_graph.py b/pgl/contrib/heter_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..b82bf46b729ad6f0df56b25db4053b09bf9ef361
--- /dev/null
+++ b/pgl/contrib/heter_graph.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    This package implements the heterogeneous graph structure for handling heterogeneous graph data.
+"""
+import numpy as np
+import pickle as pkl
+import time
+import pgl.graph_kernel as graph_kernel
+from pgl import graph
+
+__all__ = ['HeterGraph']
+
+
+def _hide_num_nodes(shape):
+ """Set the first dimension as unknown
+ """
+ shape = list(shape)
+ shape[0] = None
+ return shape
+
+
+class HeterGraph(object):
+    """Implementation of heterogeneous graph structure in pgl
+
+    This is a simple implementation of heterogeneous graph structure in pgl.
+
+ Args:
+ num_nodes_every_type: dict, number of nodes for every node type
+
+ edges_every_type: dict, every element is a list of (u, v) tuples.
+
+ node_feat_every_type: features for every node type.
+
+ Examples:
+ .. code-block:: python
+
+ import numpy as np
+ num_nodes_every_type = {'type1':3,'type2':4, 'type3':2}
+ edges_every_type = {
+ ('type1','type2', 'edges_type1'): [(0,1), (1,2)],
+ ('type1', 'type3', 'edges_type2'): [(1,2), (3,1)],
+ }
+ node_feat_every_type = {
+ 'type1': {'features1': np.random.randn(3, 4),
+ 'features2': np.random.randn(3, 4)},
+ 'type2': {'features3': np.random.randn(4, 4)},
+ 'type3': {'features1': np.random.randn(2, 4),
+ 'features2': np.random.randn(2, 4)}
+ }
+ edges_feat_every_type = {
+ ('type1','type2','edges_type1'): {'h': np.random.randn(2, 4)},
+ ('type1', 'type3', 'edges_type2'): {'h':np.random.randn(2, 4)},
+ }
+
+ g = heter_graph.HeterGraph(
+ num_nodes_every_type=num_nodes_every_type,
+ edges_every_type=edges_every_type,
+ node_feat_every_type=node_feat_every_type,
+ edge_feat_every_type=edges_feat_every_type)
+
+ """
+
+ def __init__(self,
+ num_nodes_every_type,
+ edges_every_type,
+ node_feat_every_type=None,
+ edge_feat_every_type=None):
+
+ self._num_nodes_dict = num_nodes_every_type
+ self._edges_dict = edges_every_type
+ if node_feat_every_type is not None:
+ self._node_feat = node_feat_every_type
+ else:
+ self._node_feat = {}
+
+ if edge_feat_every_type is not None:
+ self._edge_feat = edge_feat_every_type
+ else:
+ self._edge_feat = {}
+
+ self._multi_graph = {}
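+        # one homogeneous pgl.graph.Graph is built per
+        # (src_type, dst_type, edge_type) key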
+ for key, value in self._edges_dict.items():
+ if not self._node_feat:
+ node_feat = None
+ else:
+ node_feat = self._node_feat[key[0]]
+
+ if not self._edge_feat:
+ edge_feat = None
+ else:
+ edge_feat = self._edge_feat[key]
+
+ self._multi_graph[key] = graph.Graph(
+ num_nodes=self._num_nodes_dict[key[1]],
+ edges=value,
+ node_feat=node_feat,
+ edge_feat=edge_feat)
+
+ def __getitem__(self, edge_type):
+ """__getitem__
+ """
+ return self._multi_graph[edge_type]
+
+ def meta_path_random_walk(self, start_node, edge_types, meta_path,
+ max_depth):
+ """Meta path random walk sampling.
+
+ Args:
+            start_node: int, the node to start the random walk from.
+            edge_types: list, the edge type names to walk along, one per hop of the meta path.
+            meta_path: str, a meta path pattern such as 'user-item-user'.
+ max_depth: the max length of every walk.
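+
+        Example (an illustrative sketch; the node type and edge type names
+        below are assumptions, not part of this class):
+
+            .. code-block:: python
+
+                walk = g.meta_path_random_walk(
+                    start_node=0,
+                    edge_types=['clicks', 'clicked_by'],
+                    meta_path='user-item-user',
+                    max_depth=3)
+                # walk is a list of node ids along the sampled meta path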
+ """
+ edges_type_list = []
+ node_type_list = meta_path.split('-')
+ for i in range(1, len(node_type_list)):
+ edges_type_list.append(
+ (node_type_list[i - 1], node_type_list[i], edge_types[i - 1]))
+
+ no_neighbors_flag = False
+ walk = [start_node]
+ for i in range(max_depth):
+ for e_type in edges_type_list:
+ cur_node = [walk[-1]]
+ nxt_node = self._multi_graph[e_type].sample_successor(
+ cur_node, max_degree=1) # list of np.array
+ nxt_node = nxt_node[0]
+ if len(nxt_node) == 0:
+ no_neighbors_flag = True
+ break
+ else:
+ walk.append(nxt_node.tolist()[0])
+
+ if no_neighbors_flag:
+ break
+
+ return walk
+
+ def node_feat_info(self):
+ """Return the information of node feature for HeterGraphWrapper.
+
+        This function returns the node feature information of all node types,
+        and is used to help construct HeterGraphWrapper.
+
+        Return:
+            A dict mapping each node type to a list of (name, shape, dtype) tuples of its node features.
+
+ """
+ node_feat_info = {}
+ for node_type_name, feat_dict in self._node_feat.items():
+ tmp_node_feat_info = []
+ for feat_name, feat in feat_dict.items():
+ full_name = feat_name
+ tmp_node_feat_info.append(
+ (full_name, _hide_num_nodes(feat.shape), feat.dtype))
+ node_feat_info[node_type_name] = tmp_node_feat_info
+
+ return node_feat_info
+
+ def edge_feat_info(self):
+ """Return the information of edge feature for HeterGraphWrapper.
+
+        This function returns the edge feature information of all edge types,
+        and is used to help construct HeterGraphWrapper.
+
+        Return:
+            A dict mapping each edge type to a list of (name, shape, dtype) tuples of its edge features.
+
+ """
+ edge_feat_info = {}
+ for edge_type_name, feat_dict in self._edge_feat.items():
+ tmp_edge_feat_info = []
+ for feat_name, feat in feat_dict.items():
+ full_name = feat_name
+ tmp_edge_feat_info.append(
+ (full_name, _hide_num_nodes(feat.shape), feat.dtype))
+ edge_feat_info[edge_type_name] = tmp_edge_feat_info
+ return edge_feat_info
+
+ def edge_types_info(self):
+ """Return the information of all edge types.
+
+ Return:
+ A list of tuple ('srctype','dsttype', 'edges_type') for all edge types.
+
+ """
+ edge_types_info = []
+ for key, _ in self._edges_dict.items():
+ edge_types_info.append(key)
+
+ return edge_types_info
diff --git a/pgl/contrib/heter_graph_wrapper.py b/pgl/contrib/heter_graph_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..27937c11c2f1610ff49d670bf24a76e1aa2cb54d
--- /dev/null
+++ b/pgl/contrib/heter_graph_wrapper.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This package provides interfaces to help build the static computational graph
+for PaddlePaddle.
+"""
+
+import warnings
+import numpy as np
+import paddle.fluid as fluid
+
+from pgl.utils import op
+from pgl.utils import paddle_helper
+from pgl.utils.logger import log
+from pgl.graph_wrapper import GraphWrapper
+
+ALL = "__ALL__"
+
+
+def is_all(arg):
+ """is_all
+ """
+ return isinstance(arg, str) and arg == ALL
+
+
+class BipartiteGraphWrapper(GraphWrapper):
+    """Implement a bipartite graph wrapper that creates graph data holders.
+ """
+
+ def __init__(self, name, place, node_feat=[], edge_feat=[]):
+ super(BipartiteGraphWrapper, self).__init__(name, place, node_feat,
+ edge_feat)
+
+ def send(self,
+ message_func,
+ src_nfeat_list=None,
+ dst_nfeat_list=None,
+ efeat_list=None):
+ """Send message from all src nodes to dst nodes.
+
+        The UDF message function should have the following format.
+
+ .. code-block:: python
+
+ def message_func(src_feat, dst_feat, edge_feat):
+ '''
+ Args:
+ src_feat: the node feat dict attached to the src nodes.
+ dst_feat: the node feat dict attached to the dst nodes.
+ edge_feat: the edge feat dict attached to the
+ corresponding (src, dst) edges.
+
+ Return:
+ It should return a tensor or a dictionary of tensor. And each tensor
+ should have a shape of (num_edges, dims).
+ '''
+ pass
+
+ Args:
+ message_func: UDF function.
+ src_nfeat_list: a list of tuple (name, tensor) for src nodes
+ dst_nfeat_list: a list of tuple (name, tensor) for dst nodes
+ efeat_list: a list of names or tuple (name, tensor)
+
+ Return:
+ A dictionary of tensor representing the message. Each of the values
+ in the dictionary has a shape (num_edges, dim) which should be collected
+ by :code:`recv` function.
+ """
+ if efeat_list is None:
+ efeat_list = {}
+ if src_nfeat_list is None:
+ src_nfeat_list = {}
+ if dst_nfeat_list is None:
+ dst_nfeat_list = {}
+
+ src, dst = self.edges
+ src_feat = {}
+ for feat in src_nfeat_list:
+ if isinstance(feat, str):
+ src_feat[feat] = self.node_feat[feat]
+ else:
+ name, tensor = feat
+ src_feat[name] = tensor
+
+ dst_feat = {}
+ for feat in dst_nfeat_list:
+ if isinstance(feat, str):
+ dst_feat[feat] = self.node_feat[feat]
+ else:
+ name, tensor = feat
+ dst_feat[name] = tensor
+
+ efeat = {}
+ for feat in efeat_list:
+ if isinstance(feat, str):
+ efeat[feat] = self.edge_feat[feat]
+ else:
+ name, tensor = feat
+ efeat[name] = tensor
+
+ src_feat = op.read_rows(src_feat, src)
+ dst_feat = op.read_rows(dst_feat, dst)
+ msg = message_func(src_feat, dst_feat, efeat)
+
+ return msg
+
+
+class HeterGraphWrapper(object):
+    """Implement a heterogeneous graph wrapper that creates the graph data holders
+    for the attributes and features of a heterogeneous graph.
+    We also provide the interface :code:`to_feed` to help convert :code:`HeterGraph`
+ data into :code:`feed_dict`.
+
+ Args:
+ name: The heterogeneous graph data prefix
+
+ place: fluid.CPUPlace or fluid.GPUPlace(n) indicating the
+ device to hold the graph data.
+
+        node_feat: A dict of list of tuples that describe the details of the node
+                   feature tensors. Each tuple must be (name, shape, dtype),
+                   and the first dimension of the shape must be set unknown
+                   (-1 or None); alternatively, use :code:`HeterGraph.node_feat_info()`
+                   to get the node_feat settings.
+
+        edge_feat: A dict of list of tuples that describe the details of the edge
+                   feature tensors. Each tuple must be (name, shape, dtype),
+                   and the first dimension of the shape must be set unknown
+                   (-1 or None); alternatively, use :code:`HeterGraph.edge_feat_info()`
+                   to get the edge_feat settings.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ import numpy as np
+ num_nodes_every_type = {'type1':3,'type2':4, 'type3':2}
+ edges_every_type = {
+ ('type1','type2', 'edges_type1'): [(0,1), (1,2)],
+ ('type1', 'type3', 'edges_type2'): [(1,2), (3,1)],
+ }
+ node_feat_every_type = {
+ 'type1': {'features1': np.random.randn(3, 4),
+ 'features2': np.random.randn(3, 4)},
+ 'type2': {'features3': np.random.randn(4, 4)},
+ 'type3': {'features1': np.random.randn(2, 4),
+ 'features2': np.random.randn(2, 4)}
+ }
+ edges_feat_every_type = {
+ ('type1','type2','edges_type1'): {'h': np.random.randn(2, 4)},
+ ('type1', 'type3', 'edges_type2'): {'h':np.random.randn(2, 4)},
+ }
+
+ g = heter_graph.HeterGraph(
+ num_nodes_every_type=num_nodes_every_type,
+ edges_every_type=edges_every_type,
+ node_feat_every_type=node_feat_every_type,
+ edge_feat_every_type=edges_feat_every_type)
+
+
+ place = fluid.CPUPlace()
+
+ gw = pgl.heter_graph_wrapper.HeterGraphWrapper(
+ name='heter_graph',
+ place = place,
+ edge_types = g.edge_types_info(),
+ node_feat=g.node_feat_info(),
+ edge_feat=g.edge_feat_info())
+ """
+
+ def __init__(self, name, place, edge_types, node_feat={}, edge_feat={}):
+ self.__data_name_prefix = name
+ self._place = place
+ self._edge_types = edge_types
+ self._multi_gw = {}
+ for edge_type in self._edge_types:
+ type_name = self.__data_name_prefix + '/' + edge_type[
+ 0] + '_' + edge_type[1]
+ if node_feat:
+ n_feat = node_feat[edge_type[0]]
+ else:
+ n_feat = {}
+
+ if edge_feat:
+ e_feat = edge_feat[edge_type]
+ else:
+ e_feat = {}
+
+ self._multi_gw[edge_type] = BipartiteGraphWrapper(
+ name=type_name,
+ place=self._place,
+ node_feat=n_feat,
+ edge_feat=e_feat)
+
+ def to_feed(self, heterGraph, edge_types_list=ALL):
+ """Convert the graph into feed_dict.
+
+ This function helps to convert graph data into feed dict
+        for :code:`fluid.Executor` to run the model.
+
+ Args:
+ heterGraph: the :code:`HeterGraph` data object
+ edge_types_list: the edge types list to be fed
+
+ Return:
+            A dictionary containing data holder names and their corresponding data.
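+
+        Example (a minimal sketch, reusing the :code:`g` and :code:`gw` objects
+        built in the class docstring above):
+
+            .. code-block:: python
+
+                feed_dict = gw.to_feed(g)
+                # feed_dict can then be fed to fluid.Executor.run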
+ """
+ multi_graphs = heterGraph._multi_graph
+ if is_all(edge_types_list):
+ edge_types_list = self._edge_types
+
+ feed_dict = {}
+ for edge_type in edge_types_list:
+ feed_d = self._multi_gw[edge_type].to_feed(multi_graphs[edge_type])
+ feed_dict.update(feed_d)
+
+ return feed_dict
+
+ def __getitem__(self, edge_type):
+ """__getitem__
+ """
+ return self._multi_gw[edge_type]
diff --git a/pgl/data_loader.py b/pgl/data_loader.py
index 93d89e5e70fef27fd07306636df761dd0716b135..fb0e107203efc91a9cd8de313b1c15f9dc756366 100644
--- a/pgl/data_loader.py
+++ b/pgl/data_loader.py
@@ -20,7 +20,6 @@ import io
import sys
import numpy as np
import pickle as pkl
-import networkx as nx
from pgl import graph
from pgl.utils.logger import log
@@ -91,6 +90,7 @@ class CitationDataset(object):
def _load_data(self):
"""Load data
"""
+ import networkx as nx
objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
objects = []
for i in range(len(objnames)):
@@ -98,7 +98,7 @@ class CitationDataset(object):
'rb') as f:
objects.append(_pickle_load(f))
- x, y, tx, ty, allx, ally, _graph = tuple(objects)
+ x, y, tx, ty, allx, ally, _graph = objects
test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(
self.path, self.name))
test_idx_range = np.sort(test_idx_reorder)
diff --git a/pgl/graph.py b/pgl/graph.py
index f341363ef751db012ae49f118a06a96e2450ef16..8af2038f6378bef688f3530757bef3bd2a11b0c2 100644
--- a/pgl/graph.py
+++ b/pgl/graph.py
@@ -114,25 +114,41 @@ class Graph(object):
self._edge_feat = {}
if isinstance(edges, np.ndarray):
- if edges.dtype != "int32":
- edges = edges.astype("int32")
+ if edges.dtype != "int64":
+ edges = edges.astype("int64")
else:
- edges = np.array(edges, dtype="int32")
+ edges = np.array(edges, dtype="int64")
self._edges = edges
self._num_nodes = num_nodes
if len(edges) == 0:
- # check emtpy edges
- src, dst = np.array([], dtype="int32"), np.array([], dtype="int32")
- else:
- src = edges[:, 0]
- dst = edges[:, 1]
+            raise ValueError("The Graph has no edges.")
+
+ self._adj_src_index = None
+ self._adj_dst_index = None
+
+ @property
+ def adj_src_index(self):
+ """Return an EdgeIndex object for src.
+ """
+ if self._adj_src_index is None:
+ self._adj_src_index = EdgeIndex(
+ u=self._edges[:, 0],
+ v=self._edges[:, 1],
+ num_nodes=self._num_nodes)
+ return self._adj_src_index
- self._adj_src_index = EdgeIndex(
- u=src, v=dst, num_nodes=self._num_nodes)
- self._adj_dst_index = EdgeIndex(
- u=dst, v=src, num_nodes=self._num_nodes)
+ @property
+ def adj_dst_index(self):
+ """Return an EdgeIndex object for dst.
+ """
+ if self._adj_dst_index is None:
+ self._adj_dst_index = EdgeIndex(
+ u=self._edges[:, 1],
+ v=self._edges[:, 0],
+ num_nodes=self._num_nodes)
+ return self._adj_dst_index
@property
def edge_feat(self):
@@ -180,16 +196,16 @@ class Graph(object):
if sort_by not in ["src", "dst"]:
raise ValueError("sort_by should be in 'src' or 'dst'.")
if sort_by == 'src':
- src, dst, eid = self._adj_src_index.triples()
+ src, dst, eid = self.adj_src_index.triples()
else:
- dst, src, eid = self._adj_dst_index.triples()
+ dst, src, eid = self.adj_dst_index.triples()
return src, dst, eid
@property
def nodes(self):
"""Return all nodes id from 0 to :code:`num_nodes - 1`
"""
- return np.arange(self._num_nodes, dtype="int32")
+ return np.arange(self._num_nodes, dtype="int64")
def indegree(self, nodes=None):
"""Return the indegree of the given nodes
@@ -204,9 +220,9 @@ class Graph(object):
A numpy.ndarray as the given nodes' indegree.
"""
if nodes is None:
- return self._adj_dst_index.degree
+ return self.adj_dst_index.degree
else:
- return self._adj_dst_index.degree[nodes]
+ return self.adj_dst_index.degree[nodes]
def outdegree(self, nodes=None):
"""Return the outdegree of the given nodes.
@@ -221,9 +237,9 @@ class Graph(object):
A numpy.array as the given nodes' outdegree.
"""
if nodes is None:
- return self._adj_src_index.degree
+ return self.adj_src_index.degree
else:
- return self._adj_src_index.degree[nodes]
+ return self.adj_src_index.degree[nodes]
def successor(self, nodes=None, return_eids=False):
"""Find successor of given nodes.
@@ -273,17 +289,21 @@ class Graph(object):
"""
if nodes is None:
if return_eids:
- return self._adj_src_index.v, self._adj_src_index.eid
+ return self.adj_src_index.v, self.adj_src_index.eid
else:
- return self._adj_src_index.v
+ return self.adj_src_index.v
else:
if return_eids:
- return self._adj_src_index.v[nodes], self._adj_src_index.eid[
+ return self.adj_src_index.v[nodes], self.adj_src_index.eid[
nodes]
else:
- return self._adj_src_index.v[nodes]
+ return self.adj_src_index.v[nodes]
- def sample_successor(self, nodes, max_degree, return_eids=False):
+ def sample_successor(self,
+ nodes,
+ max_degree,
+ return_eids=False,
+ shuffle=False):
"""Sample successors of given nodes.
Args:
@@ -304,26 +324,20 @@ class Graph(object):
node_succ = self.successor(nodes, return_eids=return_eids)
if return_eids:
node_succ, node_succ_eid = node_succ
+
if nodes is None:
nodes = self.nodes
- sample_succ, sample_succ_eid = [], []
- for i in range(len(nodes)):
- max_size = min(max_degree, len(node_succ[i]))
- if max_size == 0:
- sample_succ.append([])
- if return_eids:
- sample_succ_eid.append([])
- else:
- ind = np.random.choice(
- len(node_succ[i]), max_size, replace=False)
- sample_succ.append(node_succ[i][ind])
- if return_eids:
- sample_succ_eid.append(node_succ_eid[i][ind])
+ node_succ = node_succ.tolist()
+
if return_eids:
- return sample_succ, sample_succ_eid
+ node_succ_eid = node_succ_eid.tolist()
+
+ if return_eids:
+ return graph_kernel.sample_subset_with_eid(
+ node_succ, node_succ_eid, max_degree, shuffle)
else:
- return sample_succ
+ return graph_kernel.sample_subset(node_succ, max_degree, shuffle)
def predecessor(self, nodes=None, return_eids=False):
"""Find predecessor of given nodes.
@@ -373,17 +387,21 @@ class Graph(object):
"""
if nodes is None:
if return_eids:
- return self._adj_dst_index.v, self._adj_dst_index.eid
+ return self.adj_dst_index.v, self.adj_dst_index.eid
else:
- return self._adj_dst_index.v
+ return self.adj_dst_index.v
else:
if return_eids:
- return self._adj_dst_index.v[nodes], self._adj_dst_index.eid[
+ return self.adj_dst_index.v[nodes], self.adj_dst_index.eid[
nodes]
else:
- return self._adj_dst_index.v[nodes]
+ return self.adj_dst_index.v[nodes]
- def sample_predecessor(self, nodes, max_degree, return_eids=False):
+ def sample_predecessor(self,
+ nodes,
+ max_degree,
+ return_eids=False,
+ shuffle=False):
"""Sample predecessor of given nodes.
Args:
@@ -407,24 +425,16 @@ class Graph(object):
if nodes is None:
nodes = self.nodes
- sample_pred, sample_pred_eid = [], []
- for i in range(len(nodes)):
- max_size = min(max_degree, len(node_pred[i]))
- if max_size == 0:
- sample_pred.append([])
- if return_eids:
- sample_pred_eid.append([])
- else:
- ind = np.random.choice(
- len(node_pred[i]), max_size, replace=False)
- sample_pred.append(node_pred[i][ind])
- if return_eids:
- sample_pred_eid.append(node_pred_eid[i][ind])
+ node_pred = node_pred.tolist()
+
+ if return_eids:
+ node_pred_eid = node_pred_eid.tolist()
if return_eids:
- return sample_pred, sample_pred_eid
+ return graph_kernel.sample_subset_with_eid(
+ node_pred, node_pred_eid, max_degree, shuffle)
else:
- return sample_pred
+ return graph_kernel.sample_subset(node_pred, max_degree, shuffle)
def node_feat_info(self):
"""Return the information of node feature for GraphWrapper.
@@ -500,19 +510,21 @@ class Graph(object):
(key, _hide_num_nodes(value.shape), value.dtype))
return edge_feat_info
- def subgraph(self, nodes, eid):
+ def subgraph(self, nodes, eid=None, edges=None):
"""Generate subgraph with nodes and edge ids.
This function will generate a :code:`pgl.graph.Subgraph` object and
copy all corresponding node and edge features. Nodes and edges will
- be reindex from 0.
+        be reindexed from 0. eid and edges can't both be None.
WARNING: ALL NODES IN EID MUST BE INCLUDED BY NODES
Args:
nodes: Node ids which will be included in the subgraph.
- eid: Edge ids which will be included in the subgraph.
+ eid (optional): Edge ids which will be included in the subgraph.
+
+ edges (optional): Edge(src, dst) list which will be included in the subgraph.
Return:
A :code:`pgl.graph.Subgraph` object.
@@ -522,11 +534,22 @@ class Graph(object):
for ind, node in enumerate(nodes):
reindex[node] = ind
- eid = np.array(eid, dtype="int32")
- sub_edges = graph_kernel.map_edges(eid, self._edges, reindex)
+ if eid is None and edges is None:
+ raise ValueError("Eid and edges can't be None at the same time.")
+
+ if edges is None:
+ edges = self._edges[eid]
+ else:
+ edges = np.array(edges, dtype="int64")
+
+ sub_edges = graph_kernel.map_edges(
+ np.arange(
+ len(edges), dtype="int64"), edges, reindex)
sub_edge_feat = {}
for key, value in self._edge_feat.items():
+ if eid is None:
+ raise ValueError("Eid can not be None with edge features.")
sub_edge_feat[key] = value[eid]
sub_node_feat = {}
@@ -554,7 +577,7 @@ class Graph(object):
Return:
Batch iterator
"""
- perm = np.arange(self._num_nodes, dtype="int32")
+ perm = np.arange(self._num_nodes, dtype="int64")
if shuffle:
np.random.shuffle(perm)
start = 0
@@ -644,7 +667,7 @@ class Graph(object):
break
succ = self.successor(cur_nodes)
sample_index = np.floor(
- np.random.rand(outdegree.shape[0]) * outdegree).astype("int32")
+ np.random.rand(outdegree.shape[0]) * outdegree).astype("int64")
nxt_cur_nodes = []
for s, ind, walk_id in zip(succ, sample_index, cur_walk_ids):
@@ -677,8 +700,8 @@ class Graph(object):
cur_walk_ids = np.arange(0, len(nodes))
cur_nodes = np.array(nodes)
- prev_nodes = np.array([-1] * len(nodes), dtype="int32")
- prev_succs = np.array([[]] * len(nodes), dtype="int32")
+ prev_nodes = np.array([-1] * len(nodes), dtype="int64")
+ prev_succs = np.array([[]] * len(nodes), dtype="int64")
for l in range(max_depth):
# select the walks not end
outdegree = self.outdegree(cur_nodes)
@@ -693,7 +716,7 @@ class Graph(object):
break
cur_succs = self.successor(cur_nodes)
num_nodes = cur_nodes.shape[0]
- nxt_nodes = np.zeros(num_nodes, dtype="int32")
+ nxt_nodes = np.zeros(num_nodes, dtype="int64")
for idx, (succ, prev_succ, walk_id, prev_node) in enumerate(
zip(cur_succs, prev_succs, cur_walk_ids, prev_nodes)):
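
A usage sketch for the updated Graph API above, on a toy in-memory graph (all ids and sizes are illustrative, not taken from this diff):

.. code-block:: python

    import numpy as np
    from pgl.graph import Graph

    g = Graph(num_nodes=4, edges=[(0, 1), (1, 2), (2, 3), (3, 0)])

    # Predecessor sampling now goes through the int64 graph_kernel helpers.
    pred = g.sample_predecessor(np.array([1, 2], dtype="int64"), max_degree=1)

    # subgraph() accepts an explicit (src, dst) edge list instead of edge ids.
    sub = g.subgraph(nodes=[0, 1, 2], edges=[(0, 1), (1, 2)])
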
diff --git a/pgl/graph_kernel.pyx b/pgl/graph_kernel.pyx
index 58095c3031bd77cd630152c5a5430f40f3ae0d2c..a911debe6bf3828838f33ea45b2efb90df98a070 100644
--- a/pgl/graph_kernel.pyx
+++ b/pgl/graph_kernel.pyx
@@ -26,20 +26,20 @@ from libc.stdlib cimport rand, RAND_MAX
@cython.boundscheck(False)
@cython.wraparound(False)
-def build_index(np.ndarray[np.int32_t, ndim=1] u,
- np.ndarray[np.int32_t, ndim=1] v,
- int num_nodes):
+def build_index(np.ndarray[np.int64_t, ndim=1] u,
+ np.ndarray[np.int64_t, ndim=1] v,
+ long long num_nodes):
"""Building Edge Index
"""
- cdef int i
- cdef int h=len(u)
- cdef int n_size = num_nodes
- cdef np.ndarray[np.int32_t, ndim=1] degree = np.zeros([n_size], dtype=np.int32)
- cdef np.ndarray[np.int32_t, ndim=1] count = np.zeros([n_size], dtype=np.int32)
- cdef np.ndarray[np.int32_t, ndim=1] _tmp_v = np.zeros([h], dtype=np.int32)
- cdef np.ndarray[np.int32_t, ndim=1] _tmp_u = np.zeros([h], dtype=np.int32)
- cdef np.ndarray[np.int32_t, ndim=1] _tmp_eid = np.zeros([h], dtype=np.int32)
- cdef np.ndarray[np.int32_t, ndim=1] indptr = np.zeros([n_size + 1], dtype=np.int32)
+ cdef long long i
+ cdef long long h=len(u)
+ cdef long long n_size = num_nodes
+ cdef np.ndarray[np.int64_t, ndim=1] degree = np.zeros([n_size], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] count = np.zeros([n_size], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] _tmp_v = np.zeros([h], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] _tmp_u = np.zeros([h], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] _tmp_eid = np.zeros([h], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] indptr = np.zeros([n_size + 1], dtype=np.int64)
with nogil:
for i in xrange(h):
@@ -64,16 +64,16 @@ def build_index(np.ndarray[np.int32_t, ndim=1] u,
@cython.boundscheck(False)
@cython.wraparound(False)
-def map_edges(np.ndarray[np.int32_t, ndim=1] eid,
- np.ndarray[np.int32_t, ndim=2] edges,
+def map_edges(np.ndarray[np.int64_t, ndim=1] eid,
+ np.ndarray[np.int64_t, ndim=2] edges,
reindex):
"""Mapping edges by given dictionary
"""
- cdef unordered_map[int, int] m = reindex
- cdef int i = 0
- cdef int h = len(eid)
- cdef np.ndarray[np.int32_t, ndim=2] r_edges = np.zeros([h, 2], dtype=np.int32)
- cdef int j
+ cdef unordered_map[long long, long long] m = reindex
+ cdef long long i = 0
+ cdef long long h = len(eid)
+ cdef np.ndarray[np.int64_t, ndim=2] r_edges = np.zeros([h, 2], dtype=np.int64)
+ cdef long long j
with nogil:
for i in xrange(h):
j = eid[i]
@@ -86,31 +86,33 @@ def map_edges(np.ndarray[np.int32_t, ndim=1] eid,
def map_nodes(nodes, reindex):
"""Mapping nodes by given dictionary
"""
- cdef unordered_map[int, int] m = reindex
- cdef int i = 0
- cdef int h = len(nodes)
- cdef np.ndarray[np.int32_t, ndim=1] new_nodes = np.zeros([h], dtype=np.int32)
- cdef int j
- for i in xrange(h):
- j = nodes[i]
- new_nodes[i] = m[j]
+ cdef np.ndarray[np.int64_t, ndim=1] t_nodes = np.array(nodes, dtype=np.int64)
+ cdef unordered_map[long long, long long] m = reindex
+ cdef long long i = 0
+ cdef long long h = len(nodes)
+ cdef np.ndarray[np.int64_t, ndim=1] new_nodes = np.zeros([h], dtype=np.int64)
+ cdef long long j
+ with nogil:
+ for i in xrange(h):
+ j = t_nodes[i]
+ new_nodes[i] = m[j]
return new_nodes
@cython.boundscheck(False)
@cython.wraparound(False)
-def node2vec_sample(np.ndarray[np.int32_t, ndim=1] succ,
- np.ndarray[np.int32_t, ndim=1] prev_succ, int prev_node,
+def node2vec_sample(np.ndarray[np.int64_t, ndim=1] succ,
+ np.ndarray[np.int64_t, ndim=1] prev_succ, long long prev_node,
float p, float q):
"""Fast implement of node2vec sampling
"""
- cdef int i
+ cdef long long i
cdef succ_len = len(succ)
cdef prev_succ_len = len(prev_succ)
cdef vector[float] probs
cdef float prob_sum = 0
- cdef unordered_set[int] prev_succ_set
+ cdef unordered_set[long long] prev_succ_set
for i in xrange(prev_succ_len):
prev_succ_set.insert(prev_succ[i])
@@ -127,9 +129,177 @@ def node2vec_sample(np.ndarray[np.int32_t, ndim=1] succ,
cdef float rand_num = float(rand())/RAND_MAX * prob_sum
- cdef int sample_succ = 0
+ cdef long long sample_succ = 0
for i in xrange(succ_len):
rand_num -= probs[i]
if rand_num <= 0:
sample_succ = succ[i]
return sample_succ
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def subset_choose_index(long long s_size,
+ np.ndarray[ndim=1, dtype=np.int64_t] nid,
+ np.ndarray[ndim=1, dtype=np.int64_t] rnd,
+ np.ndarray[ndim=1, dtype=np.int64_t] buff_nid,
+ long long offset):
+ cdef long long n_size = len(nid)
+ cdef long long i
+ cdef long long j
+ cdef unordered_map[long long, long long] m
+ with nogil:
+ for i in xrange(s_size):
+ j = rnd[offset + i] % n_size
+ if j >= i:
+ buff_nid[offset + i] = nid[j] if m.find(j) == m.end() else nid[m[j]]
+ m[j] = i if m.find(i) == m.end() else m[i]
+ else:
+ buff_nid[offset + i] = buff_nid[offset + j]
+ buff_nid[offset + j] = nid[i] if m.find(i) == m.end() else nid[m[i]]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def subset_choose_index_eid(long long s_size,
+ np.ndarray[ndim=1, dtype=np.int64_t] nid,
+ np.ndarray[ndim=1, dtype=np.int64_t] eid,
+ np.ndarray[ndim=1, dtype=np.int64_t] rnd,
+ np.ndarray[ndim=1, dtype=np.int64_t] buff_nid,
+ np.ndarray[ndim=1, dtype=np.int64_t] buff_eid,
+ long long offset):
+ cdef long long n_size = len(nid)
+ cdef long long i
+ cdef long long j
+ cdef unordered_map[long long, long long] m
+ with nogil:
+ for i in xrange(s_size):
+ j = rnd[offset + i] % n_size
+ if j >= i:
+ if m.find(j) == m.end():
+ buff_nid[offset + i], buff_eid[offset + i] = nid[j], eid[j]
+ else:
+ buff_nid[offset + i], buff_eid[offset + i] = nid[m[j]], eid[m[j]]
+ m[j] = i if m.find(i) == m.end() else m[i]
+ else:
+ buff_nid[offset + i], buff_eid[offset + i] = buff_nid[offset + j], buff_eid[offset + j]
+ if m.find(i) == m.end():
+ buff_nid[offset + j], buff_eid[offset + j] = nid[i], eid[i]
+ else:
+ buff_nid[offset + j], buff_eid[offset + j] = nid[m[i]], eid[m[i]]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def sample_subset(list nids, long long maxdegree, shuffle=False):
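+ """Randomly keep at most maxdegree entries of each int64 array in nids;
+ arrays already within the limit are returned unchanged unless shuffle is set.
+ """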
+ cdef np.ndarray[ndim=1, dtype=np.int64_t] buff_index
+ cdef long long buff_size, sample_size
+ cdef long long total_buff_size = 0
+ cdef long long inc = 0
+ cdef list output = []
+ for inc in xrange(len(nids)):
+ buff_size = len(nids[inc])
+ if buff_size > maxdegree:
+ total_buff_size += maxdegree
+ elif shuffle:
+ total_buff_size += buff_size
+ cdef np.ndarray[ndim=1, dtype=np.int64_t] buff_nid = np.zeros([total_buff_size], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(0, np.iinfo(np.int64).max,
+ dtype=np.int64, size=total_buff_size)
+
+ cdef long long offset = 0
+ for inc in xrange(len(nids)):
+ buff_size = len(nids[inc])
+ if not shuffle and buff_size <= maxdegree:
+ output.append(nids[inc])
+ else:
+ sample_size = buff_size if buff_size <= maxdegree else maxdegree
+ subset_choose_index(sample_size, nids[inc], rnd, buff_nid, offset)
+ output.append(buff_nid[offset:offset+sample_size])
+ offset += sample_size
+ return output
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=False):
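+ """Same as sample_subset, but node ids and edge ids are sampled jointly
+ so the two returned lists stay aligned.
+ """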
+ cdef np.ndarray[ndim=1, dtype=np.int64_t] buff_index
+ cdef long long buff_size, sample_size
+ cdef long long total_buff_size = 0
+ cdef long long inc = 0
+ cdef list output = []
+ cdef list output_eid = []
+ for inc in xrange(len(nids)):
+ buff_size = len(nids[inc])
+ if buff_size > maxdegree:
+ total_buff_size += maxdegree
+ elif shuffle:
+ total_buff_size += buff_size
+ cdef np.ndarray[ndim=1, dtype=np.int64_t] buff_nid = np.zeros([total_buff_size], dtype=np.int64)
+ cdef np.ndarray[ndim=1, dtype=np.int64_t] buff_eid = np.zeros([total_buff_size], dtype=np.int64)
+ cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(0, np.iinfo(np.int64).max,
+ dtype=np.int64, size=total_buff_size)
+
+ cdef long long offset = 0
+ for inc in xrange(len(nids)):
+ buff_size = len(nids[inc])
+ if not shuffle and buff_size <= maxdegree:
+ output.append(nids[inc])
+ output_eid.append(eids[inc])
+ else:
+ sample_size = buff_size if buff_size <= maxdegree else maxdegree
+ subset_choose_index_eid(sample_size, nids[inc], eids[inc], rnd, buff_nid, buff_eid, offset)
+ output.append(buff_nid[offset:offset+sample_size])
+ output_eid.append(buff_eid[offset:offset+sample_size])
+ offset += sample_size
+ return output, output_eid
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def skip_gram_gen_pair(vector[long long] walk, long win_size=5):
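+ """Generate skip-gram (src, dst) pairs from a walk, drawing the window
+ size uniformly from [1, win_size] for every position.
+ """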
+ cdef vector[long long] src
+ cdef vector[long long] dst
+ cdef long long l = len(walk)
+ cdef long long real_win_size, left, right, i, j
+ cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(1, win_size+1,
+ dtype=np.int64, size=l)
+ with nogil:
+ for i in xrange(l):
+ real_win_size = rnd[i]
+ left = i - real_win_size
+ if left < 0:
+ left = 0
+ right = i + real_win_size
+ if right >= l:
+ right = l - 1
+ for j in xrange(left, right+1):
+ if walk[i] == walk[j]:
+ continue
+ src.push_back(walk[i])
+ dst.push_back(walk[j])
+ return src, dst
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def alias_sample_build_table(np.ndarray[np.float64_t, ndim=1] probs):
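+ """Build the (alias, events) tables for O(1) alias sampling from probs.
+ """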
+ cdef long long l = len(probs)
+ cdef np.ndarray[np.float64_t, ndim=1] alias = probs * l
+ cdef np.ndarray[np.int64_t, ndim=1] events = np.zeros(l, dtype=np.int64)
+
+ cdef vector[long long] larger_num, smaller_num
+ cdef long long i, s_i, l_i
+ with nogil:
+ for i in xrange(l):
+ if alias[i] > 1:
+ larger_num.push_back(i)
+ elif alias[i] < 1:
+ smaller_num.push_back(i)
+
+ while smaller_num.size() > 0 and larger_num.size() > 0:
+ s_i = smaller_num.back()
+ l_i = larger_num.back()
+ smaller_num.pop_back()
+ events[s_i] = l_i
+ alias[l_i] -= (1 - alias[s_i])
+ if alias[l_i] <= 1:
+ larger_num.pop_back()
+ if alias[l_i] < 1:
+ smaller_num.push_back(l_i)
+ return alias, events
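
The new kernel routines can also be called directly; a small sketch with toy int64 inputs (values are illustrative):

.. code-block:: python

    import numpy as np
    from pgl import graph_kernel

    # Alias table for O(1) weighted sampling.
    probs = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float64)
    alias, events = graph_kernel.alias_sample_build_table(probs)

    # Cap ragged int64 neighbour lists at two entries each.
    neigh = [np.array([1, 2, 3], dtype=np.int64), np.array([4], dtype=np.int64)]
    capped = graph_kernel.sample_subset(neigh, 2, shuffle=True)

    # Skip-gram pairs from a walk, window drawn from [1, win_size] per position.
    src, dst = graph_kernel.skip_gram_gen_pair([3, 1, 4, 1, 5], win_size=2)
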
diff --git a/pgl/graph_wrapper.py b/pgl/graph_wrapper.py
index 42cc6c2ec8082a188d288e79468a1a82b07d72f1..d41d09c5fcf62bfc8661f5999a720392c295a186 100644
--- a/pgl/graph_wrapper.py
+++ b/pgl/graph_wrapper.py
@@ -97,7 +97,6 @@ class BaseGraphWrapper(object):
self._indegree = None
self._edge_uniq_dst = None
self._edge_uniq_dst_count = None
- self._bucketing_index = None
self._node_ids = None
def send(self, message_func, nfeat_list=None, efeat_list=None):
@@ -188,7 +187,7 @@ class BaseGraphWrapper(object):
output = recv(
dst=self._edges_dst,
uniq_dst=self._edge_uniq_dst,
- bucketing_index=self._bucketing_index,
+ bucketing_index=self._edge_uniq_dst_count,
msg=msg,
reduce_function=reduce_function,
node_ids=self._node_ids)
@@ -200,7 +199,7 @@ class BaseGraphWrapper(object):
Return:
A tuple of Tensor (src, dst). Src and dst are both
- tensor with shape (num_edges, ) and dtype int32.
+ tensor with shape (num_edges, ) and dtype int64.
"""
return self._edges_src, self._edges_dst
@@ -209,7 +208,7 @@ class BaseGraphWrapper(object):
"""Return a variable of number of nodes
Return:
- A variable with shape (1,) as the number of nodes in int32.
+ A variable with shape (1,) as the number of nodes in int64.
"""
return self._num_nodes
@@ -237,7 +236,7 @@ class BaseGraphWrapper(object):
"""Return the indegree tensor for all nodes.
Return:
- A tensor of shape (num_nodes, ) in int32.
+ A tensor of shape (num_nodes, ) in int64.
"""
return self._indegree
@@ -312,6 +311,8 @@ class StaticGraphWrapper(BaseGraphWrapper):
nodes = graph.nodes
uniq_dst = nodes[indegree > 0]
uniq_dst_count = indegree[indegree > 0]
+ uniq_dst_count = np.cumsum(uniq_dst_count, dtype='int32')
+ uniq_dst_count = np.insert(uniq_dst_count, 0, 0)
edge_feat = {}
@@ -323,56 +324,46 @@ class StaticGraphWrapper(BaseGraphWrapper):
self.__create_graph_edge_feat(edge_feat, self._initializers)
self._edges_src, init = paddle_helper.constant(
- dtype="int32",
+ dtype="int64",
value=src,
- name=self.__data_name_prefix + '_edges_src')
+ name=self.__data_name_prefix + '/edges_src')
self._initializers.append(init)
self._edges_dst, init = paddle_helper.constant(
- dtype="int32",
+ dtype="int64",
value=dst,
- name=self.__data_name_prefix + '_edges_dst')
+ name=self.__data_name_prefix + '/edges_dst')
self._initializers.append(init)
self._num_nodes, init = paddle_helper.constant(
- dtype="int32",
+ dtype="int64",
hide_batch_size=False,
value=np.array([graph.num_nodes]),
- name=self.__data_name_prefix + '_num_nodes')
+ name=self.__data_name_prefix + '/num_nodes')
self._initializers.append(init)
self._edge_uniq_dst, init = paddle_helper.constant(
- name=self.__data_name_prefix + "_uniq_dst",
- dtype="int32",
+ name=self.__data_name_prefix + "/uniq_dst",
+ dtype="int64",
value=uniq_dst)
self._initializers.append(init)
self._edge_uniq_dst_count, init = paddle_helper.constant(
- name=self.__data_name_prefix + "_uniq_dst_count",
+ name=self.__data_name_prefix + "/uniq_dst_count",
dtype="int32",
value=uniq_dst_count)
self._initializers.append(init)
- bucket_value = np.expand_dims(
- np.arange(
- 0, len(dst), dtype="int32"), -1)
- self._bucketing_index, init = paddle_helper.lod_constant(
- name=self.__data_name_prefix + "_bucketing_index",
- dtype="int32",
- lod=list(uniq_dst_count),
- value=bucket_value)
- self._initializers.append(init)
-
- node_ids_value = np.arange(0, graph.num_nodes, dtype="int32")
+ node_ids_value = np.arange(0, graph.num_nodes, dtype="int64")
self._node_ids, init = paddle_helper.constant(
- name=self.__data_name_prefix + "_node_ids",
- dtype="int32",
+ name=self.__data_name_prefix + "/node_ids",
+ dtype="int64",
value=node_ids_value)
self._initializers.append(init)
self._indegree, init = paddle_helper.constant(
- name=self.__data_name_prefix + "_indegree",
- dtype="int32",
+ name=self.__data_name_prefix + "/indegree",
+ dtype="int64",
value=indegree)
self._initializers.append(init)
@@ -384,7 +375,8 @@ class StaticGraphWrapper(BaseGraphWrapper):
node_feat_dtype = node_feat_value.dtype
self._node_feat_tensor_dict[
node_feat_name], init = paddle_helper.constant(
- name=self.__data_name_prefix + '_' + node_feat_name,
+ name=self.__data_name_prefix + '/node_feat/' +
+ node_feat_name,
dtype=node_feat_dtype,
value=node_feat_value)
collector.append(init)
@@ -397,7 +389,8 @@ class StaticGraphWrapper(BaseGraphWrapper):
edge_feat_dtype = edge_feat_value.dtype
self._edge_feat_tensor_dict[
edge_feat_name], init = paddle_helper.constant(
- name=self.__data_name_prefix + '_' + edge_feat_name,
+ name=self.__data_name_prefix + '/edge_feat/' +
+ edge_feat_name,
dtype=edge_feat_dtype,
value=edge_feat_value)
collector.append(init)
@@ -483,6 +476,8 @@ class GraphWrapper(BaseGraphWrapper):
def __init__(self, name, place, node_feat=[], edge_feat=[]):
super(GraphWrapper, self).__init__()
+ # collect holders for PyReader
+ self._holder_list = []
self.__data_name_prefix = name
self._place = place
self.__create_graph_attr_holders()
@@ -498,78 +493,78 @@ class GraphWrapper(BaseGraphWrapper):
"""Create data holders for graph attributes.
"""
self._edges_src = fluid.layers.data(
- self.__data_name_prefix + '_edges_src',
+ self.__data_name_prefix + '/edges_src',
shape=[None],
append_batch_size=False,
- dtype="int32",
+ dtype="int64",
stop_gradient=True)
self._edges_dst = fluid.layers.data(
- self.__data_name_prefix + '_edges_dst',
+ self.__data_name_prefix + '/edges_dst',
shape=[None],
append_batch_size=False,
- dtype="int32",
+ dtype="int64",
stop_gradient=True)
self._num_nodes = fluid.layers.data(
- self.__data_name_prefix + '_num_nodes',
+ self.__data_name_prefix + '/num_nodes',
shape=[1],
append_batch_size=False,
- dtype='int32',
+ dtype='int64',
stop_gradient=True)
self._edge_uniq_dst = fluid.layers.data(
- self.__data_name_prefix + "_uniq_dst",
+ self.__data_name_prefix + "/uniq_dst",
shape=[None],
append_batch_size=False,
- dtype="int32",
+ dtype="int64",
stop_gradient=True)
self._edge_uniq_dst_count = fluid.layers.data(
- self.__data_name_prefix + "_uniq_dst_count",
+ self.__data_name_prefix + "/uniq_dst_count",
shape=[None],
append_batch_size=False,
dtype="int32",
stop_gradient=True)
- self._bucketing_index = fluid.layers.data(
- self.__data_name_prefix + "_bucketing_index",
- shape=[None, 1],
- append_batch_size=False,
- dtype="int32",
- lod_level=1,
- stop_gradient=True)
self._node_ids = fluid.layers.data(
- self.__data_name_prefix + "_node_ids",
+ self.__data_name_prefix + "/node_ids",
shape=[None],
append_batch_size=False,
- dtype="int32",
+ dtype="int64",
stop_gradient=True)
self._indegree = fluid.layers.data(
- self.__data_name_prefix + "_indegree",
+ self.__data_name_prefix + "/indegree",
shape=[None],
append_batch_size=False,
- dtype="int32",
+ dtype="int64",
stop_gradient=True)
+ self._holder_list.extend([
+ self._edges_src, self._edges_dst, self._num_nodes,
+ self._edge_uniq_dst, self._edge_uniq_dst_count, self._node_ids,
+ self._indegree
+ ])
def __create_graph_node_feat_holders(self, node_feat_name, node_feat_shape,
node_feat_dtype):
"""Create data holders for node features.
"""
feat_holder = fluid.layers.data(
- self.__data_name_prefix + '_' + node_feat_name,
+ self.__data_name_prefix + '/node_feat/' + node_feat_name,
shape=node_feat_shape,
append_batch_size=False,
dtype=node_feat_dtype,
stop_gradient=True)
self._node_feat_tensor_dict[node_feat_name] = feat_holder
+ self._holder_list.append(feat_holder)
def __create_graph_edge_feat_holders(self, edge_feat_name, edge_feat_shape,
edge_feat_dtype):
"""Create edge holders for edge features.
"""
feat_holder = fluid.layers.data(
- self.__data_name_prefix + '_' + edge_feat_name,
+ self.__data_name_prefix + '/edge_feat/' + edge_feat_name,
shape=edge_feat_shape,
append_batch_size=False,
dtype=edge_feat_dtype,
stop_gradient=True)
self._edge_feat_tensor_dict[edge_feat_name] = feat_holder
+ self._holder_list.append(feat_holder)
def to_feed(self, graph):
"""Convert the graph into feed_dict.
@@ -590,6 +585,8 @@ class GraphWrapper(BaseGraphWrapper):
nodes = graph.nodes
uniq_dst = nodes[indegree > 0]
uniq_dst_count = indegree[indegree > 0]
+ uniq_dst_count = np.cumsum(uniq_dst_count, dtype='int32')
+ uniq_dst_count = np.insert(uniq_dst_count, 0, 0)
edge_feat = {}
@@ -597,21 +594,27 @@ class GraphWrapper(BaseGraphWrapper):
edge_feat[key] = value[eid]
node_feat = graph.node_feat
- feed_dict[self.__data_name_prefix + '_edges_src'] = src
- feed_dict[self.__data_name_prefix + '_edges_dst'] = dst
- feed_dict[self.__data_name_prefix + '_num_nodes'] = graph.num_nodes
- feed_dict[self.__data_name_prefix + '_uniq_dst'] = uniq_dst
- feed_dict[self.__data_name_prefix + '_uniq_dst_count'] = uniq_dst_count
- feed_dict[self.__data_name_prefix + '_node_ids'] = graph.nodes
- feed_dict[self.__data_name_prefix + '_indegree'] = indegree
- feed_dict[self.__data_name_prefix + '_bucketing_index'] = \
- fluid.create_lod_tensor(np.expand_dims(np.arange(0, len(dst), dtype="int32"), -1),
- [list(uniq_dst_count)], self._place)
+ feed_dict[self.__data_name_prefix + '/edges_src'] = src
+ feed_dict[self.__data_name_prefix + '/edges_dst'] = dst
+ feed_dict[self.__data_name_prefix + '/num_nodes'] = np.array(
+ graph.num_nodes)
+ feed_dict[self.__data_name_prefix + '/uniq_dst'] = uniq_dst
+ feed_dict[self.__data_name_prefix + '/uniq_dst_count'] = uniq_dst_count
+ feed_dict[self.__data_name_prefix + '/node_ids'] = graph.nodes
+ feed_dict[self.__data_name_prefix + '/indegree'] = indegree
for key in self._node_feat_tensor_dict:
- feed_dict[self.__data_name_prefix + '_' + key] = node_feat[key]
+ feed_dict[self.__data_name_prefix + '/node_feat/' +
+ key] = node_feat[key]
for key in self._edge_feat_tensor_dict:
- feed_dict[self.__data_name_prefix + '_' + key] = edge_feat[key]
+ feed_dict[self.__data_name_prefix + '/edge_feat/' +
+ key] = edge_feat[key]
return feed_dict
+
+ @property
+ def holder_list(self):
+ """Return the holder list.
+ """
+ return self._holder_list
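
A sketch of how the new holder_list property could feed a reader, assuming Paddle's fluid.io.PyReader API; the node-feature spec below is illustrative:

.. code-block:: python

    import paddle.fluid as fluid
    from pgl.graph_wrapper import GraphWrapper

    place = fluid.CPUPlace()
    gw = GraphWrapper(name="graph", place=place,
                      node_feat=[("feature", [None, 16], "float32")])

    # Every graph holder (plus node/edge feature holders) can be fed
    # through a PyReader instead of a hand-built feed_dict.
    loader = fluid.io.PyReader(feed_list=gw.holder_list, capacity=16,
                               iterable=False)
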
diff --git a/pgl/layers/conv.py b/pgl/layers/conv.py
index a1dabdde55697638d07f83446ffab034f6eebe9c..804534a06b0602b1f6ab3698efe2cfc997cb4d13 100644
--- a/pgl/layers/conv.py
+++ b/pgl/layers/conv.py
@@ -53,7 +53,7 @@ def gcn(gw, feature, hidden_size, activation, name, norm=None):
feature = fluid.layers.fc(feature,
size=hidden_size,
bias_attr=False,
- name=name)
+ param_attr=fluid.ParamAttr(name=name))
if norm is not None:
feature = feature * norm
@@ -67,7 +67,7 @@ def gcn(gw, feature, hidden_size, activation, name, norm=None):
output = fluid.layers.fc(output,
size=hidden_size,
bias_attr=False,
- name=name)
+ param_attr=fluid.ParamAttr(name=name))
if norm is not None:
output = output * norm
@@ -152,7 +152,7 @@ def gat(gw,
ft = fluid.layers.fc(feature,
hidden_size * num_heads,
bias_attr=False,
- name=name + '_weight')
+ param_attr=fluid.ParamAttr(name=name + '_weight'))
left_a = fluid.layers.create_parameter(
shape=[num_heads, hidden_size],
dtype='float32',
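
For context on the param_attr change above: name= on fluid.layers.fc names the operator, while fluid.ParamAttr fixes the parameter name itself, so repeated calls with the same ParamAttr name can share one weight. A minimal sketch (shapes are illustrative):

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name="x", shape=[None, 8],
                          append_batch_size=False, dtype="float32")

    # Both fc calls create/reuse a parameter literally named "gcn_w".
    h1 = fluid.layers.fc(x, size=4, bias_attr=False,
                         param_attr=fluid.ParamAttr(name="gcn_w"))
    h2 = fluid.layers.fc(x, size=4, bias_attr=False,
                         param_attr=fluid.ParamAttr(name="gcn_w"))
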
diff --git a/pgl/layers/set2set.py b/pgl/layers/set2set.py
new file mode 100644
index 0000000000000000000000000000000000000000..18d65fae97009a770900130ccf14d248df06ebed
--- /dev/null
+++ b/pgl/layers/set2set.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This package implements common layers to help building pooling operators.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import paddle.fluid as F
+import paddle.fluid.layers as L
+
+import pgl
+
+
+class Set2Set(object):
+ """Implementation of set2set pooling operator.
+
+ This is an implementation of the paper ORDER MATTERS: SEQUENCE TO SEQUENCE
+ FOR SETS (https://arxiv.org/pdf/1511.06391.pdf).
+ """
+
+ def __init__(self, input_dim, n_iters, n_layers):
+ """
+ Args:
+ input_dim: hidden size of input data.
+ n_iters: number of set2set iterations.
+ n_layers: number of lstm layers.
+ """
+ self.input_dim = input_dim
+ self.output_dim = 2 * input_dim
+ self.n_iters = n_iters
+
+ # n_layers is passed to the internal LSTM as its number of stacked layers
+ self.n_layers = n_layers
+
+ def forward(self, feat):
+ """
+ Args:
+ feat: input feature with shape [batch, n_edges, dim].
+
+ Return:
+ output_feat: output feature of set2set pooling with shape [batch, 2*dim].
+ """
+
+ seqlen = 1
+ h = L.fill_constant_batch_size_like(
+ feat, [1, self.n_layers, self.input_dim], "float32", 0)
+ h = L.transpose(h, [1, 0, 2])
+ c = h
+
+ # [seqlen, batch, dim]
+ q_star = L.fill_constant_batch_size_like(
+ feat, [1, seqlen, self.output_dim], "float32", 0)
+ q_star = L.transpose(q_star, [1, 0, 2])
+
+ for _ in range(self.n_iters):
+
+ # q [seqlen, batch, dim]
+ # h [layer, batch, dim]
+ q, h, c = L.lstm(
+ q_star,
+ h,
+ c,
+ seqlen,
+ self.input_dim,
+ self.n_layers,
+ is_bidirec=False)
+
+ # e [batch, seqlen, n_edges]
+ e = L.matmul(L.transpose(q, [1, 0, 2]), feat, transpose_y=True)
+ # alpha [batch, seqlen, n_edges]
+ alpha = L.softmax(e)
+
+ # readout [batch, seqlen, dim]
+ readout = L.matmul(alpha, feat)
+ readout = L.transpose(readout, [1, 0, 2])
+
+ # q_star [seqlen, batch, dim + dim]
+ q_star = L.concat([q, readout], -1)
+
+ return L.squeeze(q_star, [0])
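
A build-time sketch for the layer above (shapes are illustrative; L.lstm is the cuDNN LSTM, so actually running the program needs a GPU, as in pgl/tests/test_set2set.py later in this diff):

.. code-block:: python

    import paddle.fluid.layers as L
    from pgl.layers.set2set import Set2Set

    s2s = Set2Set(input_dim=64, n_iters=3, n_layers=1)
    feat = L.data(name="feat", shape=[32, 100, 64], dtype="float32",
                  append_batch_size=False)      # [batch, n_edges, dim]
    pooled = s2s.forward(feat)                  # [batch, 2 * dim] -> [32, 128]
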
diff --git a/pgl/redis_graph.py b/pgl/redis_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae6ae1a8b1cd19a9a3eddfab34cd1c5e525f9f47
--- /dev/null
+++ b/pgl/redis_graph.py
@@ -0,0 +1,478 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""redis_graph"""
+
+import pgl
+import redis
+from redis import BlockingConnectionPool, StrictRedis
+from redis._compat import b, unicode, bytes, long, basestring
+from rediscluster.nodemanager import NodeManager
+from rediscluster.crc import crc16
+from collections import OrderedDict
+import threading
+import numpy as np
+import time
+import json
+import pgl.graph as pgraph
+import pickle as pkl
+from pgl.utils.logger import log
+import pgl.graph_kernel as graph_kernel
+
+
+def encode(value):
+ """
+ Return a bytestring representation of the value.
+ This method is copied from Redis' connection.py:Connection.encode
+ """
+ if isinstance(value, bytes):
+ return value
+ elif isinstance(value, (int, long)):
+ value = b(str(value))
+ elif isinstance(value, float):
+ value = b(repr(value))
+ elif not isinstance(value, basestring):
+ value = unicode(value)
+ if isinstance(value, unicode):
+ value = value.encode('utf-8')
+ return value
+
+
+def crc16_hash(data):
+ """crc16_hash"""
+ return crc16(encode(data))
+
+
+LUA_SCRIPT = """
+math.randomseed(tonumber(ARGV[1]))
+
+local function permute(tab, count, bucket_size)
+ local n = #tab / bucket_size
+ local o_ret = {}
+ local o_dict = {}
+ for i = 1, count do
+ local j = math.random(i, n)
+ o_ret[i] = string.sub(tab, (i - 1) * bucket_size + 1, i * bucket_size)
+ if j > count then
+ if o_dict[j] ~= nil then
+ o_ret[i], o_dict[j] = o_dict[j], o_ret[i]
+ else
+ o_dict[j], o_ret[i] = o_ret[i], string.sub(tab, (j - 1) * bucket_size + 1, j * bucket_size)
+ end
+ end
+ end
+ return table.concat(o_ret)
+end
+
+local bucket_size = 16
+local ret = {}
+local sample_size = tonumber(ARGV[2])
+for i=1, #ARGV - 2 do
+ local tab = redis.call("HGET", KEYS[1], ARGV[i + 2])
+ if tab then
+ if #tab / bucket_size <= sample_size then
+ ret[i] = tab
+ else
+ ret[i] = permute(tab, sample_size, bucket_size)
+ end
+ else
+ ret[i] = tab
+ end
+end
+return ret
+"""
+
+
+class RedisCluster(object):
+ """RedisCluster"""
+
+ def __init__(self, startup_nodes):
+ self.nodemanager = NodeManager(startup_nodes=startup_nodes)
+ self.nodemanager.initialize()
+ self.redis_worker = {}
+ for node, config in self.nodemanager.nodes.items():
+ rdp = BlockingConnectionPool(
+ host=config["host"], port=config["port"])
+ self.redis_worker[node] = {
+ "worker": StrictRedis(
+ connection_pool=rdp, decode_responses=False),
+ "type": config["server_type"]
+ }
+
+ def get(self, key):
+ """get"""
+ slot = self.nodemanager.keyslot(key)
+ node = np.random.choice(self.nodemanager.slots[slot])
+ worker = self.redis_worker[node['name']]
+ if worker["type"] == "slave":
+ worker["worker"].execute_command("READONLY")
+ return worker["worker"].get(key)
+
+ def hmget(self, key, fields):
+ """hmget"""
+ retry = 0
+ while True:
+ try:
+ slot = self.nodemanager.keyslot(key)
+ node = np.random.choice(self.nodemanager.slots[slot])
+ worker = self.redis_worker[node['name']]
+ if worker["type"] == "slave":
+ worker["worker"].execute_command("READONLY")
+ ret = worker["worker"].hmget(key, fields)
+ break
+ except Exception as e:
+ retry += 1
+ if retry > 5:
+ raise e
+ print("RETRY hmget after 1 sec. Retry Time %s" % retry)
+ time.sleep(1)
+ return ret
+
+ def hmget_sample(self, key, fields, sample):
+ """hmget_sample"""
+ retry = 0
+ while True:
+ try:
+ slot = self.nodemanager.keyslot(key)
+ node = np.random.choice(self.nodemanager.slots[slot])
+ worker = self.redis_worker[node['name']]
+ if worker["type"] == "slave":
+ worker["worker"].execute_command("READONLY")
+ func = worker["worker"].register_script(LUA_SCRIPT)
+ ret = func(
+ keys=[key],
+ args=[np.random.randint(4294967295), sample] + fields)
+ break
+ except Exception as e:
+ retry += 1
+ if retry > 5:
+ raise e
+ print("RETRY hmget_sample after 1 sec. Retry Time %s" % retry)
+ time.sleep(1)
+ return ret
+
+
+def hmget_sample_helper(rs, query, num_parts, sample_size):
+ """hmget_sample_helper"""
+ buff = [b""] * len(query)
+ part_dict = {}
+ part_ind_dict = {}
+ for ind, q in enumerate(query):
+ part = crc16_hash(q) % num_parts
+ part = "part-%s" % part
+ if part not in part_dict:
+ part_dict[part] = []
+ part_ind_dict[part] = []
+ part_dict[part].append(q)
+ part_ind_dict[part].append(ind)
+
+ def worker(_key, _value, _buff, _rs, _part_ind_dict, _sample_size):
+ """worker"""
+ response = _rs.hmget_sample(_key, _value, _sample_size)
+ for res, ind in zip(response, _part_ind_dict[_key]):
+ _buff[ind] = res
+
+ def hmget(_part_dict, _rs, _buff, _part_ind_dict, _sample_size):
+ """hmget"""
+ key_value = list(_part_dict.items())
+ np.random.shuffle(key_value)
+ for key, value in key_value:
+ worker(key, value, _buff, _rs, _part_ind_dict, _sample_size)
+
+ hmget(part_dict, rs, buff, part_ind_dict, sample_size)
+ return buff
+
+
+def hmget_helper(rs, query, num_parts):
+ """hmget_helper"""
+ buff = [b""] * len(query)
+ part_dict = {}
+ part_ind_dict = {}
+ for ind, q in enumerate(query):
+ part = crc16_hash(q) % num_parts
+ part = "part-%s" % part
+ if part not in part_dict:
+ part_dict[part] = []
+ part_ind_dict[part] = []
+ part_dict[part].append(q)
+ part_ind_dict[part].append(ind)
+
+ def worker(_key, _value, _buff, _rs, _part_ind_dict):
+ """worker"""
+ response = _rs.hmget(_key, _value)
+ for res, ind in zip(response, _part_ind_dict[_key]):
+ _buff[ind] = res
+
+ def hmget(_part_dict, _rs, _buff, _part_ind_dict):
+ """hmget"""
+ key_value = list(_part_dict.items())
+ np.random.shuffle(key_value)
+ for key, value in key_value:
+ worker(key, value, _buff, _rs, _part_ind_dict)
+
+ hmget(part_dict, rs, buff, part_ind_dict)
+ return buff
+
+
+class RedisGraph(pgraph.Graph):
+ """RedisGraph"""
+
+ def __init__(self, name, redis_config, num_parts):
+ self._rs = RedisCluster(startup_nodes=redis_config)
+ self.num_parts = num_parts
+ self._name = name
+ self._num_nodes = None
+ self._num_edges = None
+ self._node_feat_info = None
+ self._edge_feat_info = None
+ self._node_feat_dtype = None
+ self._edge_feat_dtype = None
+ self._node_feat_shape = None
+ self._edge_feat_shape = None
+
+ @property
+ def num_nodes(self):
+ """num_nodes"""
+ if self._num_nodes is None:
+ self._num_nodes = int(self._rs.get("num_nodes"))
+ return self._num_nodes
+
+ @property
+ def num_edges(self):
+ """num_edges"""
+ if self._num_edges is None:
+ self._num_edges = int(self._rs.get("num_edges"))
+ return self._num_edges
+
+ def node_feat_info(self):
+ """node_feat_info"""
+ if self._node_feat_info is None:
+ buff = self._rs.get("nf:infos")
+ self._node_feat_info = json.loads(buff.decode())
+ return self._node_feat_info
+
+ def node_feat_dtype(self, key):
+ """node_feat_dtype"""
+ if self._node_feat_dtype is None:
+ self._node_feat_dtype = {}
+ for key, _, dtype in self.node_feat_info():
+ self._node_feat_dtype[key] = dtype
+ return self._node_feat_dtype[key]
+
+ def node_feat_shape(self, key):
+ """node_feat_shape"""
+ if self._node_feat_shape is None:
+ self._node_feat_shape = {}
+ for key, shape, _ in self.node_feat_info():
+ self._node_feat_shape[key] = shape
+ return self._node_feat_shape[key]
+
+ def edge_feat_shape(self, key):
+ """edge_feat_shape"""
+ if self._edge_feat_shape is None:
+ self._edge_feat_shape = {}
+ for key, shape, _ in self.edge_feat_info():
+ self._edge_feat_shape[key] = shape
+ return self._edge_feat_shape[key]
+
+ def edge_feat_dtype(self, key):
+ """edge_feat_dtype"""
+ if self._edge_feat_dtype is None:
+ self._edge_feat_dtype = {}
+ for key, _, dtype in self.edge_feat_info():
+ self._edge_feat_dtype[key] = dtype
+ return self._edge_feat_dtype[key]
+
+ def edge_feat_info(self):
+ """edge_feat_info"""
+ if self._edge_feat_info is None:
+ buff = self._rs.get("ef:infos")
+ self._edge_feat_info = json.loads(buff.decode())
+ return self._edge_feat_info
+
+ def sample_predecessor(self, nodes, max_degree, return_eids=False):
+ """sample_predecessor"""
+ query = ["d:%s" % n for n in nodes]
+ rets = hmget_sample_helper(self._rs, query, self.num_parts, max_degree)
+ v = []
+ eid = []
+ for buff in rets:
+ if buff is None:
+ v.append(np.array([], dtype="int64"))
+ eid.append(np.array([], dtype="int64"))
+ else:
+ npret = np.frombuffer(
+ buff, dtype="int64").reshape([-1, 2]).astype("int64")
+ v.append(npret[:, 0])
+ eid.append(npret[:, 1])
+ if return_eids:
+ return np.array(v), np.array(eid)
+ else:
+ return np.array(v)
+
+ def sample_successor(self, nodes, max_degree, return_eids=False):
+ """sample_successor"""
+ query = ["s:%s" % n for n in nodes]
+ rets = hmget_sample_helper(self._rs, query, self.num_parts, max_degree)
+ v = []
+ eid = []
+ for buff in rets:
+ if buff is None:
+ v.append(np.array([], dtype="int64"))
+ eid.append(np.array([], dtype="int64"))
+ else:
+ npret = np.frombuffer(
+ buff, dtype="int64").reshape([-1, 2]).astype("int64")
+ v.append(npret[:, 0])
+ eid.append(npret[:, 1])
+ if return_eids:
+ return np.array(v), np.array(eid)
+ else:
+ return np.array(v)
+
+ def predecessor(self, nodes, return_eids=False):
+ """predecessor"""
+ query = ["d:%s" % n for n in nodes]
+ ret = hmget_helper(self._rs, query, self.num_parts)
+ v = []
+ eid = []
+ for buff in ret:
+ if buff is not None:
+ npret = np.frombuffer(
+ buff, dtype="int64").reshape([-1, 2]).astype("int64")
+ v.append(npret[:, 0])
+ eid.append(npret[:, 1])
+ else:
+ v.append(np.array([], dtype="int64"))
+ eid.append(np.array([], dtype="int64"))
+ if return_eids:
+ return np.array(v), np.array(eid)
+ else:
+ return np.array(v)
+
+ def successor(self, nodes, return_eids=False):
+ """successor"""
+ query = ["s:%s" % n for n in nodes]
+ ret = hmget_helper(self._rs, query, self.num_parts)
+ v = []
+ eid = []
+ for buff in ret:
+ if buff is not None:
+ npret = np.frombuffer(
+ buff, dtype="int64").reshape([-1, 2]).astype("int64")
+ v.append(npret[:, 0])
+ eid.append(npret[:, 1])
+ else:
+ v.append(np.array([], dtype="int64"))
+ eid.append(np.array([], dtype="int64"))
+ if return_eids:
+ return np.array(v), np.array(eid)
+ else:
+ return np.array(v)
+
+ def get_edges_by_id(self, eids):
+ """get_edges_by_id"""
+ queries = ["e:%s" % e for e in eids]
+ ret = hmget_helper(self._rs, queries, self.num_parts)
+ o = np.asarray(ret, dtype="int64")
+ dst = o % self.num_nodes
+ src = o // self.num_nodes
+ data = np.hstack(
+ [src.reshape([-1, 1]), dst.reshape([-1, 1])]).astype("int64")
+ return data
+
+ def get_node_feat_by_id(self, key, nodes):
+ """get_node_feat_by_id"""
+ queries = ["nf:%s:%i" % (key, nid) for nid in nodes]
+ ret = hmget_helper(self._rs, queries, self.num_parts)
+ ret = b"".join(ret)
+ data = np.frombuffer(ret, dtype=self.node_feat_dtype(key))
+ data = data.reshape(self.node_feat_shape(key))
+ return data
+
+ def get_edge_feat_by_id(self, key, eids):
+ """get_edge_feat_by_id"""
+ queries = ["ef:%s:%i" % (key, e) for e in eids]
+ ret = hmget_helper(self._rs, queries, self.num_parts)
+ ret = b"".join(ret)
+ data = np.frombuffer(ret, dtype=self.edge_feat_dtype(key))
+ data = data.reshape(self.edge_feat_shape(key))
+ return data
+
+ def subgraph(self, nodes, eid, edges=None):
+ """Generate subgraph with nodes and edge ids.
+
+ This function will generate a :code:`pgl.graph.Subgraph` object and
+ copy all corresponding node and edge features. Nodes and edges will
+ be reindexed from 0.
+
+ WARNING: ALL NODES IN EID MUST BE INCLUDED BY NODES
+
+ Args:
+ nodes: Node ids which will be included in the subgraph.
+
+ eid: Edge ids which will be included in the subgraph.
+
+ edges (optional): List of (src, dst) edge pairs which will be included in the subgraph.
+
+ Return:
+ A :code:`pgl.graph.Subgraph` object.
+ """
+ reindex = {}
+
+ for ind, node in enumerate(nodes):
+ reindex[node] = ind
+
+ if edges is None:
+ edges = self.get_edges_by_id(eid)
+ else:
+ edges = np.array(edges, dtype="int64")
+
+ sub_edges = graph_kernel.map_edges(
+ np.arange(
+ len(edges), dtype="int64"), edges, reindex)
+
+ sub_edge_feat = {}
+ for key, _, _ in self.edge_feat_info():
+ sub_edge_feat[key] = self.get_edge_feat_by_id(key, eid)
+
+ sub_node_feat = {}
+ for key, _, _ in self.node_feat_info():
+ sub_node_feat[key] = self.get_node_feat_by_id(key, nodes)
+
+ subgraph = pgraph.SubGraph(
+ num_nodes=len(nodes),
+ edges=sub_edges,
+ node_feat=sub_node_feat,
+ edge_feat=sub_edge_feat,
+ reindex=reindex)
+ return subgraph
+
+ def node_batch_iter(self, batch_size, shuffle=True):
+ """Node batch iterator
+
+ Iterate all node by batch.
+
+ Args:
+ batch_size: The batch size of each batch of nodes.
+
+ shuffle: Whether shuffle the nodes.
+
+ Return:
+ Batch iterator
+ """
+ perm = np.arange(self.num_nodes, dtype="int64")
+ if shuffle:
+ np.random.shuffle(perm)
+ start = 0
+ while start < self._num_nodes:
+ yield perm[start:start + batch_size]
+ start += batch_size
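
A connection sketch for the class above; the host, port, graph name and partition count are placeholders, and the graph must already be loaded into the Redis cluster in the key layout this module expects (num_nodes, d:*, s:*, e:*, nf:*, ef:*):

.. code-block:: python

    import numpy as np
    from pgl.redis_graph import RedisGraph

    redis_configs = [{"host": "127.0.0.1", "port": "7003"}]
    graph = RedisGraph("my-graph", redis_configs, num_parts=64)

    nodes = np.arange(128, dtype="int64")
    succ, succ_eid = graph.sample_successor(nodes, max_degree=10,
                                            return_eids=True)
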
diff --git a/pgl/sample.py b/pgl/sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..176c32b9c85e7819646de494966120583c2ddb98
--- /dev/null
+++ b/pgl/sample.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ This package implements graph sampling algorithms.
+"""
+import time
+import copy
+
+import numpy as np
+import pgl
+from pgl.utils.logger import log
+from pgl import graph_kernel
+
+__all__ = ['graphsage_sample', 'node2vec_sample', 'deepwalk_sample']
+
+
+def edge_hash(src, dst):
+ """edge_hash
+ """
+ return src * 100000007 + dst
+
+
+def graphsage_sample(graph, nodes, samples, ignore_edges=[]):
+ """Implement of graphsage sample.
+
+ Reference paper: Inductive Representation Learning on Large Graphs (https://arxiv.org/abs/1706.02216).
+
+ Args:
+ graph: A pgl graph instance
+ nodes: Sample starting from nodes
+ samples: A list, number of neighbors in each layer
+ ignore_edges: A list of (src, dst) edges that will be ignored.
+
+ Return:
+ A list of subgraphs
+ """
+ start = time.time()
+ num_layers = len(samples)
+ start_nodes = nodes
+ nodes = list(start_nodes)
+ eids, edges = [], []
+ nodes_set = set(nodes)
+ layer_nodes, layer_eids, layer_edges = [], [], []
+ ignore_edge_set = set([edge_hash(src, dst) for src, dst in ignore_edges])
+
+ for layer_idx in reversed(range(num_layers)):
+ if len(start_nodes) == 0:
+ layer_nodes = [nodes] + layer_nodes
+ layer_eids = [eids] + layer_eids
+ layer_edges = [edges] + layer_edges
+ continue
+ batch_pred_nodes, batch_pred_eids = graph.sample_predecessor(
+ start_nodes, samples[layer_idx], return_eids=True)
+ log.debug("sample_predecessor time: %s" % (time.time() - start))
+ start = time.time()
+ last_nodes_set = nodes_set
+
+ nodes, eids = copy.copy(nodes), copy.copy(eids)
+ edges = copy.copy(edges)
+ nodes_set, eids_set = set(nodes), set(eids)
+ for srcs, dst, pred_eids in zip(batch_pred_nodes, start_nodes,
+ batch_pred_eids):
+ for src, eid in zip(srcs, pred_eids):
+ if edge_hash(src, dst) in ignore_edge_set:
+ continue
+ if eid not in eids_set:
+ eids.append(eid)
+ edges.append([src, dst])
+ eids_set.add(eid)
+ if src not in nodes_set:
+ nodes.append(src)
+ nodes_set.add(src)
+ layer_edges = [edges] + layer_edges
+ start_nodes = list(nodes_set - last_nodes_set)
+ layer_nodes = [nodes] + layer_nodes
+ layer_eids = [eids] + layer_eids
+ log.debug("flat time: %s" % (time.time() - start))
+ start = time.time()
+ # Find new nodes
+
+ feed_dict = {}
+
+ subgraphs = []
+ for i in range(num_layers):
+ subgraphs.append(
+ graph.subgraph(
+ nodes=layer_nodes[0], eid=layer_eids[i], edges=layer_edges[i]))
+ # only for this task
+ subgraphs[i].node_feat["index"] = np.array(
+ layer_nodes[0], dtype="int64")
+ log.debug("subgraph time: %s" % (time.time() - start))
+
+ return subgraphs
+
+
+def alias_sample(size, alias, events):
+ """Implement of alias sample.
+ Args:
+ size: Output shape.
+ alias: The alias table built by `alias_sample_build_table`.
+ events: The events table built by `alias_sample_build_table`.
+
+ Return:
+ samples: The generated random samples.
+ """
+ rand_num = np.random.uniform(0.0, len(alias), size)
+ idx = rand_num.astype("int64")
+ uni = rand_num - idx
+ flags = (uni >= alias[idx])
+ idx[flags] = events[idx][flags]
+ return idx
+
+
+def graph_alias_sample_table(graph, edge_weight_name):
+ """Build alias sample table for weighted deepwalk.
+ Args:
+ graph: The input graph
+ edge_weight_name: The name of edge weight in edge_feat.
+
+ Return:
+ Alias sample tables for each node.
+ """
+ edge_weight = graph.edge_feat[edge_weight_name]
+ _, eids_array = graph.successor(return_eids=True)
+ alias_array, events_array = [], []
+ for eids in eids_array:
+ probs = edge_weight[eids]
+ probs /= np.sum(probs)
+ alias, events = graph_kernel.alias_sample_build_table(probs)
+ alias_array.append(alias), events_array.append(events)
+ alias_array, events_array = np.array(alias_array), np.array(events_array)
+ return alias_array, events_array
+
+
+def deepwalk_sample(graph, nodes, max_depth, alias_name=None,
+ events_name=None):
+ """Implement of random walk.
+
+ This function generates random walk paths for the given nodes and depth.
+
+ Args:
+ nodes: Walk starting from nodes
+ max_depth: Max walking depth
+
+ Return:
+ A list of walks.
+ """
+ walk = []
+ # init
+ for node in nodes:
+ walk.append([node])
+
+ cur_walk_ids = np.arange(0, len(nodes))
+ cur_nodes = np.array(nodes)
+ for l in range(max_depth):
+ # select the walks not end
+ cur_succs = graph.successor(cur_nodes)
+ mask = [len(succ) > 0 for succ in cur_succs]
+
+ if np.any(mask):
+ cur_walk_ids = cur_walk_ids[mask]
+ cur_nodes = cur_nodes[mask]
+ cur_succs = cur_succs[mask]
+ else:
+ # stop when all nodes have no successor
+ break
+
+ if alias_name is not None and events_name is not None:
+ sample_index = [
+ alias_sample([1], graph.node_feat[alias_name][node],
+ graph.node_feat[events_name][node])[0]
+ for node in cur_nodes
+ ]
+ else:
+ outdegree = [len(cur_succ) for cur_succ in cur_succs]
+ sample_index = np.floor(
+ np.random.rand(cur_succs.shape[0]) * outdegree).astype("int64")
+
+ nxt_cur_nodes = []
+ for s, ind, walk_id in zip(cur_succs, sample_index, cur_walk_ids):
+ walk[walk_id].append(s[ind])
+ nxt_cur_nodes.append(s[ind])
+ cur_nodes = np.array(nxt_cur_nodes)
+ return walk
+
+
+def node2vec_sample(graph, nodes, max_depth, p=1.0, q=1.0):
+ """Implement of node2vec random walk.
+
+ Reference paper: https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf.
+
+ Args:
+ graph: A pgl graph instance
+ nodes: Walk starting from nodes
+ max_depth: Max walking depth
+ p: Return parameter
+ q: In-out parameter
+
+ Return:
+ A list of walks.
+ """
+ if p == 1.0 and q == 1.0:
+ return deepwalk_sample(graph, nodes, max_depth)
+
+ walk = []
+ # init
+ for node in nodes:
+ walk.append([node])
+
+ cur_walk_ids = np.arange(0, len(nodes))
+ cur_nodes = np.array(nodes)
+ prev_nodes = np.array([-1] * len(nodes), dtype="int64")
+ prev_succs = np.array([[]] * len(nodes), dtype="int64")
+ for l in range(max_depth):
+ # select the walks not end
+ cur_succs = graph.successor(cur_nodes)
+
+ mask = [len(succ) > 0 for succ in cur_succs]
+ if np.any(mask):
+ cur_walk_ids = cur_walk_ids[mask]
+ cur_nodes = cur_nodes[mask]
+ prev_nodes = prev_nodes[mask]
+ prev_succs = prev_succs[mask]
+ cur_succs = cur_succs[mask]
+ else:
+ # stop when all nodes have no successor
+ break
+ num_nodes = cur_nodes.shape[0]
+ nxt_nodes = np.zeros(num_nodes, dtype="int64")
+
+ for idx, (
+ succ, prev_succ, walk_id, prev_node
+ ) in enumerate(zip(cur_succs, prev_succs, cur_walk_ids, prev_nodes)):
+
+ sampled_succ = graph_kernel.node2vec_sample(succ, prev_succ,
+ prev_node, p, q)
+ walk[walk_id].append(sampled_succ)
+ nxt_nodes[idx] = sampled_succ
+
+ prev_nodes, prev_succs = cur_nodes, cur_succs
+ cur_nodes = nxt_nodes
+ return walk
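
A small sketch of the samplers above on a toy in-memory graph (fan-outs and depths are illustrative):

.. code-block:: python

    from pgl.graph import Graph
    from pgl.sample import graphsage_sample, deepwalk_sample

    g = Graph(num_nodes=5, edges=[(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)])

    # Two-hop GraphSAGE sampling: up to 10 then 5 predecessors per node.
    subgraphs = graphsage_sample(g, nodes=[0, 1, 2], samples=[10, 5])

    # Uniform random walks of depth 3 from every node (deepwalk style).
    walks = deepwalk_sample(g, nodes=list(range(5)), max_depth=3)
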
diff --git a/pgl/tests/deepwalk/test_alias_sample.py b/pgl/tests/deepwalk/test_alias_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..c14ba5a3c15f6ca3a62531e5cf63c00e3c1bc967
--- /dev/null
+++ b/pgl/tests/deepwalk/test_alias_sample.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""test_alias_sample"""
+import argparse
+import time
+import unittest
+from collections import Counter
+
+import numpy as np
+
+from pgl.graph_kernel import alias_sample_build_table
+from pgl.sample import alias_sample
+
+
+class AliasSampleTest(unittest.TestCase):
+ """AliasSampleTest
+ """
+
+ def setUp(self):
+ pass
+
+ def test_speed(self):
+ """test_speed
+ """
+
+ num = 1000
+ size = [10240, 1, 5]
+ probs = np.random.uniform(0.0, 1.0, [num])
+ probs /= np.sum(probs)
+
+ start = time.time()
+ alias, events = alias_sample_build_table(probs)
+ for i in range(100):
+ alias_sample(size, alias, events)
+ alias_sample_time = time.time() - start
+
+ start = time.time()
+ for i in range(100):
+ np.random.choice(num, size, p=probs)
+ np_sample_time = time.time() - start
+ self.assertTrue(alias_sample_time < np_sample_time)
+
+ def test_result(self):
+ """test_result
+ """
+ size = [450000]
+ num = 10
+ probs = np.arange(1, num).astype(np.float64)
+ probs /= np.sum(probs)
+ alias, events = alias_sample_build_table(probs)
+ ret = alias_sample(size, alias, events)
+ cnt = Counter(ret)
+ sort_cnt_keys = [x[1] for x in sorted(zip(cnt.values(), cnt.keys()))]
+ self.assertEqual(sort_cnt_keys, np.arange(0, num - 1).tolist())
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pgl/tests/test_redis_graph.py b/pgl/tests/test_redis_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d2601de26207b45806052887312bef3fdbd32b8
--- /dev/null
+++ b/pgl/tests/test_redis_graph.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""test_redis_graph"""
+import time
+import unittest
+import json
+import os
+
+import numpy as np
+from pgl.redis_graph import RedisGraph
+
+
+class RedisGraphTest(unittest.TestCase):
+ """RedisGraphTest
+ """
+
+ def setUp(self):
+ config_path = os.path.join(
+ os.path.abspath(os.path.dirname(__file__)),
+ 'test_redis_graph_conf.json')
+ with open(config_path) as inf:
+ config = json.load(inf)
+ redis_configs = [config["redis"], ]
+ self.graph = RedisGraph(
+ "reddit-graph", redis_configs, num_parts=config["num_parts"])
+
+ def test_random_seed(self):
+ """test_random_seed
+ """
+ np.random.seed(1)
+ data1 = self.graph.sample_predecessor(range(1000), max_degree=5)
+ data1 = [nid for nodes in data1 for nid in nodes]
+ np.random.seed(1)
+ data2 = self.graph.sample_predecessor(range(1000), max_degree=5)
+ data2 = [nid for nodes in data2 for nid in nodes]
+ np.random.seed(3)
+ data3 = self.graph.sample_predecessor(range(1000), max_degree=5)
+ data3 = [nid for nodes in data3 for nid in nodes]
+
+ self.assertEqual(data1, data2)
+ self.assertNotEqual(data2, data3)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pgl/tests/test_redis_graph_conf.json b/pgl/tests/test_redis_graph_conf.json
new file mode 100644
index 0000000000000000000000000000000000000000..c334656132b75f1bf2a55d802bf4d9a9363239f3
--- /dev/null
+++ b/pgl/tests/test_redis_graph_conf.json
@@ -0,0 +1,8 @@
+{
+ "redis":
+ {
+ "host": "10.86.54.13",
+ "port": "7003"
+ },
+ "num_parts": 64
+}
diff --git a/pgl/tests/test_sample.py b/pgl/tests/test_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eadb652160b5c7ad917357be52ee1fd8d348aae
--- /dev/null
+++ b/pgl/tests/test_sample.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Unit tests for the graph sampling algorithms in pgl.sample.
+"""
+import unittest
+import os
+import json
+
+import numpy as np
+from pgl.redis_graph import RedisGraph
+from pgl.sample import graphsage_sample
+from pgl.sample import node2vec_sample
+
+
+class SampleTest(unittest.TestCase):
+ """SampleTest
+ """
+
+ def setUp(self):
+ config_path = os.path.join(
+ os.path.abspath(os.path.dirname(__file__)),
+ 'test_redis_graph_conf.json')
+ with open(config_path) as inf:
+ config = json.load(inf)
+ redis_configs = [config["redis"], ]
+ self.graph = RedisGraph(
+ "reddit-graph", redis_configs, num_parts=config["num_parts"])
+
+ def test_graphsage_sample(self):
+ """test_graphsage_sample
+ """
+ eids = np.random.choice(self.graph.num_edges, 1000)
+ edges = self.graph.get_edges_by_id(eids)
+ nodes = [n for edge in edges for n in edge]
+ ignore_edges = edges.tolist() + edges[:, [1, 0]].tolist()
+
+ np.random.seed(1)
+ subgraphs = graphsage_sample(self.graph, nodes, [10, 10], [])
+
+ np.random.seed(1)
+ subgraphs_ignored = graphsage_sample(self.graph, nodes, [10, 10],
+ ignore_edges)
+
+ self.assertEqual(subgraphs[0].num_nodes,
+ subgraphs_ignored[0].num_nodes)
+ self.assertGreaterEqual(subgraphs[0].num_edges,
+ subgraphs_ignored[0].num_edges)
+
+ def test_node2vec_sample(self):
+ """test_node2vec_sample
+ """
+ walks = node2vec_sample(self.graph, range(10), 3)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pgl/tests/test_set2set.py b/pgl/tests/test_set2set.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0572c1508238602ff21295d6a353d482a7d55f2
--- /dev/null
+++ b/pgl/tests/test_set2set.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Unit tests for the Set2Set pooling layer.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+import unittest
+
+import paddle.fluid as F
+import paddle.fluid.layers as L
+
+from pgl.layers.set2set import Set2Set
+
+
+def paddle_easy_run(model_func, data):
+ prog = F.Program()
+ startup_prog = F.Program()
+ with F.program_guard(prog, startup_prog):
+ ret = model_func()
+ place = F.CUDAPlace(0)
+ exe = F.Executor(place)
+ exe.run(startup_prog)
+ return exe.run(prog, fetch_list=ret, feed=data)
+
+
+class Set2SetTest(unittest.TestCase):
+ """Set2SetTest
+ """
+
+ def test_set2set(self):
+ """test_set2set
+ """
+ import numpy as np
+
+ def model_func():
+ s2s = Set2Set(5, 1, 3)
+ h0 = L.data(
+ name='h0',
+ shape=[2, 10, 5],
+ dtype='float32',
+ append_batch_size=False)
+ h1 = s2s.forward(h0)
+ return h1,
+
+ data = {"h0": np.random.rand(2, 10, 5).astype("float32")}
+ h1, = paddle_easy_run(model_func, data)
+
+ self.assertEqual(h1.shape[0], 2)
+ self.assertEqual(h1.shape[1], 10)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pgl/utils/mp_reader.py b/pgl/utils/mp_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f26a60ddb38ea7b7044dd211cc629ba679a3616
--- /dev/null
+++ b/pgl/utils/mp_reader.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimized Multiprocessing Reader for PaddlePaddle
+"""
+import logging
+log = logging.getLogger(__name__)
+import multiprocessing
+import copy
+try:
+ import ujson as json
+except ImportError:
+ log.info("ujson is not installed, falling back to the standard json module")
+ import json
+import numpy as np
+import time
+import paddle.fluid as fluid
+
+
+def serialize_data(data):
+ """serialize_data"""
+ if data is None:
+ return None
+ return numpy_serialize_data(data)
+
+
+def numpy_serialize_data(data):
+ """serialize_data"""
+ ret_data = {}
+ for key in data:
+ if isinstance(data[key], np.ndarray):
+ ret_data[key] = (data[key].tobytes(), list(data[key].shape),
+ "%s" % data[key].dtype)
+ else:
+ ret_data[key] = data[key]
+ return ret_data
+
+
+def numpy_deserialize_data(data):
+ """deserialize_data"""
+ if data is None:
+ return None
+ for key in data:
+ if isinstance(data[key], tuple):
+ value = np.frombuffer(
+ data[key][0], dtype=data[key][2]).reshape(data[key][1])
+ data[key] = value
+ return data
+
+
+def deserialize_data(data):
+ """deserialize_data"""
+ return numpy_deserialize_data(data)
+
+
+def multiprocess_reader(readers, use_pipe=True, queue_size=1000, pipe_size=10):
+ """
+ multiprocess_reader uses Python multiprocessing to read data from the
+ given readers and merges the samples through a multiprocessing.Queue or
+ multiprocessing.Pipe. One process is started per input reader, and each
+ process runs exactly one reader.
+ multiprocessing.Queue needs read/write access to /dev/shm, which some
+ platforms do not provide.
+ The readers should be independent of each other so that every process
+ can work on its own.
+ An example:
+ .. code-block:: python
+ reader0 = reader(["file01", "file02"])
+ reader1 = reader(["file11", "file12"])
+ reader2 = reader(["file21", "file22"])
+ reader = multiprocess_reader([reader0, reader1, reader2],
+ queue_size=100, use_pipe=False)
+ """
+
+ assert type(readers) is list and len(readers) > 0
+
+ def _read_into_queue(reader, queue):
+ """read_into_queue"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None")
+ queue.put(serialize_data(sample))
+ queue.put(serialize_data(None))
+
+ def queue_reader():
+ """queue_reader"""
+ queue = multiprocessing.Queue(queue_size)
+ for reader in readers:
+ p = multiprocessing.Process(
+ target=_read_into_queue, args=(reader, queue))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ while finish_num < reader_num:
+ sample = deserialize_data(queue.get())
+ if sample is None:
+ finish_num += 1
+ else:
+ yield sample
+
+ def _read_into_pipe(reader, conn, max_pipe_size):
+ """read_into_pipe"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None!")
+ conn.send(serialize_data(sample))
+ conn.send(serialize_data(None))
+ conn.close()
+
+ def pipe_reader():
+ """pipe_reader"""
+ conns = []
+ for reader in readers:
+ parent_conn, child_conn = multiprocessing.Pipe()
+ conns.append(parent_conn)
+ p = multiprocessing.Process(
+ target=_read_into_pipe, args=(reader, child_conn, pipe_size))
+ p.start()
+
+ reader_num = len(readers)
+ finish_num = 0
+ conn_to_remove = []
+ finish_flag = np.zeros(len(conns), dtype="int32")
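+        # Poll each still-open connection with a short timeout; a None sentinel
+        # marks that reader as finished, after which its connection is closed and
+        # skipped on later passes.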
+ while finish_num < reader_num:
+ for conn_id, conn in enumerate(conns):
+ if finish_flag[conn_id] > 0:
+ continue
+ if conn.poll(0.01):
+ buff = conn.recv()
+ sample = deserialize_data(buff)
+ if sample is None:
+ finish_num += 1
+ conn.close()
+ finish_flag[conn_id] = 1
+ else:
+ yield sample
+
+ if use_pipe:
+ return pipe_reader
+ else:
+ return queue_reader
diff --git a/pgl/utils/mt_reader.py b/pgl/utils/mt_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a1aad43243ccbb99b06f1bc8914d2e744c025ad
--- /dev/null
+++ b/pgl/utils/mt_reader.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimized Multithreading Reader for PaddlePaddle
+"""
+import logging
+log = logging.getLogger(__name__)
+import threading
+import queue
+import copy
+import numpy as np
+import time
+import paddle.fluid as fluid
+
+
+def multithreading_reader(readers, queue_size=1000):
+ """
+ multithreading_reader use python multi thread to read data from readers
+ and then use queue to merge all
+ data. The process number is equal to the number of input readers, each
+ process call one reader.
+ CPU usage rate won't go over 100% with GIL.
+ you need to create multiple readers first, these readers should be independent
+ to each other so that each process can work independently.
+ An example:
+ .. code-block:: python
+ reader0 = reader(["file01", "file02"])
+ reader1 = reader(["file11", "file12"])
+ reader1 = reader(["file21", "file22"])
+ reader = multithreading_reader([reader0, reader1, reader2],
+ queue_size=100)
+ """
+
+ assert type(readers) is list and len(readers) > 0
+
+ def _read_into_queue(reader, queue):
+ """read_into_queue"""
+ for sample in reader():
+ if sample is None:
+ raise ValueError("sample has None")
+ queue.put(sample)
+ queue.put(None)
+
+ def queue_reader():
+ """queue_reader"""
+ output_queue = queue.Queue(queue_size)
+ thread_pool = []
+ thread_num = 0
+ for reader in readers:
+ p = threading.Thread(
+ target=_read_into_queue, args=(reader, output_queue))
+ p.daemon = True
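+            # Daemon threads will not keep the interpreter alive if the consumer
+            # stops iterating early.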
+ p.start()
+ thread_pool.append(p)
+ thread_num += 1
+
+ while True:
+ ret = output_queue.get()
+ if ret is not None:
+ yield ret
+ else:
+ thread_num -= 1
+ if thread_num == 0:
+ break
+
+ for thread in thread_pool:
+ thread.join()
+
+ return queue_reader
diff --git a/pgl/utils/paddle_helper.py b/pgl/utils/paddle_helper.py
index ecea36483967552fc40df61a86397b46e52d09de..f1e53aedcd25389f1994301a5213defabd62ba52 100644
--- a/pgl/utils/paddle_helper.py
+++ b/pgl/utils/paddle_helper.py
@@ -223,5 +223,25 @@ def scatter_add(input, index, updates):
Same type and shape as input.
"""
- output = fluid.layers.scatter(input, index, updates, overwrite=False)
+ output = fluid.layers.scatter(input, index, updates, mode='add')
+ return output
+
+
+def scatter_max(input, index, updates):
+ """Scatter max updates to input by given index.
+
+    Applies sparse max updates to the input variable.
+
+ Args:
+ input: Input tensor to be updated
+
+ index: Slice index
+
+ updates: Must have same type as input.
+
+ Return:
+ Same type and shape as input.
+ """
+
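+    # mode='max' is expected to write the element-wise maximum of the selected
+    # input rows and the updates, mirroring the accumulating mode='add' used by
+    # scatter_add above.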
+ output = fluid.layers.scatter(input, index, updates, mode='max')
return output
diff --git a/requirements.txt b/requirements.txt
index b8b9367120ca4e91ddaab29f3dfab8bc6c1e2195..4a13b16b9df0af570752f5dac9857fa7e496d3db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,6 @@
-numpy >= 1.14.5
+numpy >= 1.16.4
cython >= 0.25.2
+
+#paddlepaddle
+
+redis-py-cluster
diff --git a/setup.py b/setup.py
index 3a720b1a30340a1a9076949c72791ec8b568f107..f0b808afacd550293ecdc47ff221a683c50ae07f 100755
--- a/setup.py
+++ b/setup.py
@@ -16,10 +16,35 @@ import os
import sys
import re
import codecs
-import numpy as np
from setuptools import setup, find_packages
-from setuptools.extension import Extension
-from Cython.Build import cythonize
+from setuptools import Extension
+from setuptools import dist
+from setuptools.command.build_ext import build_ext as _build_ext
+
+try:
+ from Cython.Build import cythonize
+except ImportError:
+
+ def cythonize(*args, **kwargs):
+ """cythonize"""
+ from Cython.Build import cythonize
+ return cythonize(*args, **kwargs)
+
+
+class CustomBuildExt(_build_ext):
+ """CustomBuildExt"""
+
+ def finalize_options(self):
+ _build_ext.finalize_options(self)
+ # Prevent numpy from thinking it is still in its setup process:
+ __builtins__.__NUMPY_SETUP__ = False
+ import numpy
+ self.include_dirs.append(numpy.get_include())
+
+
+workdir = os.path.dirname(os.path.abspath(__file__))
+with open(os.path.join(workdir, './requirements.txt')) as f:
+ requirements = f.read().splitlines()
cur_dir = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(cur_dir, 'README.md'), 'rb') as f:
@@ -58,7 +83,6 @@ extensions = [
"pgl.graph_kernel",
["pgl/graph_kernel.pyx"],
language="c++",
- include_dirs=[np.get_include()],
extra_compile_args=compile_extra_args,
extra_link_args=link_extra_args, ),
]
@@ -66,7 +90,6 @@ extensions = [
def get_package_data(path):
files = []
- print(path)
for root, dirnames, filenames in os.walk(path):
for filename in filenames:
files.append(os.path.join(root, filename))
@@ -83,9 +106,16 @@ setup(
long_description_content_type='text/markdown',
url="https://github.com/PaddlePaddle/PGL",
package_data=package_data,
+ setup_requires=[
+ 'setuptools>=18.0',
+ 'numpy>=1.16.4',
+ ],
+ install_requires=requirements,
+ cmdclass={'build_ext': CustomBuildExt},
packages=find_packages(),
include_package_data=True,
- ext_modules=cythonize(extensions),
+ #ext_modules=cythonize(extensions),
+ ext_modules=extensions,
classifiers=[
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
diff --git a/tests/scatter_add_test.py b/tests/scatter_add_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f4ef6639a5c429d7fa76761aa561471fff0171
--- /dev/null
+++ b/tests/scatter_add_test.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""scatter test cases"""
+
+import unittest
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+class ScatterAddTest(unittest.TestCase):
+ """ScatterAddTest"""
+
+ def test_scatter_add(self):
+ """test_scatter_add"""
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ input = fluid.dygraph.to_variable(
+ np.array(
+ [[1, 2], [5, 6]], dtype='float32'), )
+ index = fluid.dygraph.to_variable(np.array([1, 1], dtype=np.int32))
+ updates = fluid.dygraph.to_variable(
+ np.array(
+ [[3, 4], [3, 4]], dtype='float32'), )
+ output = fluid.layers.scatter(input, index, updates, mode='add')
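+            # Both updates target row 1, so additions accumulate:
+            # [5 + 3 + 3, 6 + 4 + 4] = [11, 14]; row 0 is left untouched.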
+ assert output.numpy().tolist() == [[1, 2], [11, 14]]
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/unique_with_counts_test.py b/tests/unique_with_counts_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..532843c27d4732715f7a325e27bbb66bffa164e6
--- /dev/null
+++ b/tests/unique_with_counts_test.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""unique with counts test"""
+
+import unittest
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+class UniqueWithCountTest(unittest.TestCase):
+ """UniqueWithCountTest"""
+
+ def _test_unique_with_counts_helper(self, input, output):
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ with fluid.program_guard(main_program, startup_program):
+ x = fluid.layers.data(
+ name='input',
+ dtype='int64',
+ shape=[-1],
+ append_batch_size=False)
+ out, index, count = fluid.layers.unique_with_counts(x)
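+            # unique_with_counts returns the unique values, the index of each input
+            # element within that unique list, and the count of every unique value.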
+
+ out, index, count = exe.run(
+ main_program,
+ feed={'input': np.array(
+ input, dtype='int64'), },
+ fetch_list=[out, index, count],
+ return_numpy=True, )
+ out, index, count = out.tolist(), index.tolist(), count.tolist()
+ assert [out, index, count] == output
+
+ def test_unique_with_counts(self):
+ """test_unique_with_counts"""
+ self._test_unique_with_counts_helper(
+ input=[1, 1, 2, 4, 4, 4, 7, 8, 8],
+ output=[
+ [1, 2, 4, 7, 8],
+ [0, 0, 1, 2, 2, 2, 3, 4, 4],
+ [2, 1, 3, 1, 2],
+ ], )
+ self._test_unique_with_counts_helper(
+ input=[1],
+ output=[
+ [1],
+ [0],
+ [1],
+ ], )
+ self._test_unique_with_counts_helper(
+ input=[1, 1],
+ output=[
+ [1],
+ [0, 0],
+ [2],
+ ], )
+
+
+if __name__ == '__main__':
+ unittest.main()