Unverified commit d347a2bb, authored by Huang Zhengjie, committed by GitHub

Merge pull request #25 from Liwb5/develop

develop PGL v1.1
@@ -21,7 +21,7 @@ import tqdm
 import numpy as np
 import logging
 import random
-from pgl.contrib import heter_graph
+from pgl import heter_graph
 import pickle as pkl
...
@@ -21,7 +21,7 @@ import logging
 import paddle.fluid as fluid
 import paddle.fluid.layers as fl
-from pgl.contrib import heter_graph_wrapper
+from pgl import heter_graph_wrapper


 class GATNE(object):
...
@@ -19,8 +19,8 @@ import pgl
 import time
 from pgl.utils import mp_reader
 from pgl.utils.logger import log
-import train
 import time
+import copy


 def node_batch_iter(nodes, node_label, batch_size):
@@ -46,12 +46,11 @@ def traverse(item):
         yield item


-def flat_node_and_edge(nodes, eids):
+def flat_node_and_edge(nodes):
     """flat_node_and_edge
     """
     nodes = list(set(traverse(nodes)))
-    eids = list(set(traverse(eids)))
-    return nodes, eids
+    return nodes
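With edge ids gone from the sampler, flat_node_and_edge now only flattens and deduplicates nested node lists. A minimal, PGL-free sketch of the behaviour (traverse is re-implemented here so the example is self-contained):

    import numpy as np

    def traverse(item):
        # Yield the leaves of an arbitrarily nested list/ndarray structure.
        if isinstance(item, (list, np.ndarray)):
            for sub in item:
                for leaf in traverse(sub):
                    yield leaf
        else:
            yield item

    def flat_node_and_edge(nodes):
        # Deduplicate and flatten; set order is arbitrary.
        return list(set(traverse(nodes)))

    print(flat_node_and_edge([[0, 1], [[1, 2], [3]]]))  # e.g. [0, 1, 2, 3]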
 def worker(batch_info, graph, graph_wrapper, samples):
@@ -61,31 +60,42 @@ def worker(batch_info, graph, graph_wrapper, samples):
     def work():
         """work
         """
-        first = True
+        _graph_wrapper = copy.copy(graph_wrapper)
+        _graph_wrapper.node_feat_tensor_dict = {}
         for batch_train_samples, batch_train_labels in batch_info:
             start_nodes = batch_train_samples
             nodes = start_nodes
-            eids = []
+            edges = []
             for max_deg in samples:
-                pred, pred_eid = graph.sample_predecessor(
-                    start_nodes, max_degree=max_deg, return_eids=True)
+                pred_nodes = graph.sample_predecessor(
+                    start_nodes, max_degree=max_deg)
+                for dst_node, src_nodes in zip(start_nodes, pred_nodes):
+                    for src_node in src_nodes:
+                        edges.append((src_node, dst_node))
                 last_nodes = nodes
-                nodes = [nodes, pred]
-                eids = [eids, pred_eid]
-                nodes, eids = flat_node_and_edge(nodes, eids)
+                nodes = [nodes, pred_nodes]
+                nodes = flat_node_and_edge(nodes)
                 # Find new nodes
                 start_nodes = list(set(nodes) - set(last_nodes))
                 if len(start_nodes) == 0:
                     break
-            subgraph = graph.subgraph(nodes=nodes, eid=eids)
+            subgraph = graph.subgraph(
+                nodes=nodes,
+                edges=edges,
+                with_node_feat=False,
+                with_edge_feat=False)
             sub_node_index = subgraph.reindex_from_parrent_nodes(
                 batch_train_samples)
-            feed_dict = graph_wrapper.to_feed(subgraph)
+            feed_dict = _graph_wrapper.to_feed(subgraph)
             feed_dict["node_label"] = np.expand_dims(
                 np.array(
                     batch_train_labels, dtype="int64"), -1)
             feed_dict["node_index"] = sub_node_index
+            feed_dict["parent_node_index"] = np.array(nodes, dtype="int64")
             yield feed_dict

     return work
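The loop now asks sample_predecessor for neighbor ids only and builds the (src, dst) edge list itself, instead of carrying edge ids through every hop. A PGL-free sketch of the k-hop expansion, with sample_predecessor replaced by a hypothetical stand-in:

    import random

    def sample_predecessor(adj, nodes, max_degree):
        # Hypothetical stand-in for graph.sample_predecessor.
        return [random.sample(adj[n], min(max_degree, len(adj[n])))
                for n in nodes]

    adj = {0: [1, 2], 1: [2, 3], 2: [3], 3: []}
    start_nodes, nodes, edges = [0], [0], []
    for max_deg in [2, 2]:  # two hops of fan-out 2
        pred_nodes = sample_predecessor(adj, start_nodes, max_deg)
        for dst, srcs in zip(start_nodes, pred_nodes):
            edges.extend((src, dst) for src in srcs)
        last_nodes = nodes
        nodes = list(set(nodes) | {n for ns in pred_nodes for n in ns})
        start_nodes = list(set(nodes) - set(last_nodes))  # newly reached nodes
        if not start_nodes:
            break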
@@ -97,23 +107,25 @@ def multiprocess_graph_reader(graph,
                               node_index,
                               batch_size,
                               node_label,
+                              with_parent_node_index=False,
                               num_workers=4):
     """multiprocess_graph_reader
     """

-    def parse_to_subgraph(rd):
+    def parse_to_subgraph(rd, prefix, node_feat, _with_parent_node_index):
         """parse_to_subgraph
         """

         def work():
             """work
             """
-            last = time.time()
             for data in rd():
-                this = time.time()
                 feed_dict = data
-                now = time.time()
-                last = now
+                for key in node_feat:
+                    feed_dict[prefix + '/node_feat/' + key] = node_feat[key][
+                        feed_dict["parent_node_index"]]
+                if not _with_parent_node_index:
+                    del feed_dict["parent_node_index"]
                 yield feed_dict

         return work
@@ -129,46 +141,17 @@ def multiprocess_graph_reader(graph,
         reader_pool.append(
             worker(batch_info[block_size * i:block_size * (i + 1)], graph,
                    graph_wrapper, samples))
-        multi_process_sample = mp_reader.multiprocess_reader(
-            reader_pool, use_pipe=True, queue_size=1000)
-        r = parse_to_subgraph(multi_process_sample)
-        return paddle.reader.buffered(r, 1000)
+        if len(reader_pool) == 1:
+            r = parse_to_subgraph(reader_pool[0],
+                                  repr(graph_wrapper), graph.node_feat,
+                                  with_parent_node_index)
+        else:
+            multi_process_sample = mp_reader.multiprocess_reader(
+                reader_pool, use_pipe=True, queue_size=1000)
+            r = parse_to_subgraph(multi_process_sample,
+                                  repr(graph_wrapper), graph.node_feat,
+                                  with_parent_node_index)
+        return paddle.reader.buffered(r, num_workers)

     return reader()
-
-
-def graph_reader(graph, graph_wrapper, samples, node_index, batch_size,
-                 node_label):
-    """graph_reader"""
-
-    def reader():
-        """reader"""
-        for batch_train_samples, batch_train_labels in node_batch_iter(
-                node_index, node_label, batch_size=batch_size):
-            start_nodes = batch_train_samples
-            nodes = start_nodes
-            eids = []
-            for max_deg in samples:
-                pred, pred_eid = graph.sample_predecessor(
-                    start_nodes, max_degree=max_deg, return_eids=True)
-                last_nodes = nodes
-                nodes = [nodes, pred]
-                eids = [eids, pred_eid]
-                nodes, eids = flat_node_and_edge(nodes, eids)
-                # Find new nodes
-                start_nodes = list(set(nodes) - set(last_nodes))
-                if len(start_nodes) == 0:
-                    break
-            subgraph = graph.subgraph(nodes=nodes, eid=eids)
-            feed_dict = graph_wrapper.to_feed(subgraph)
-            sub_node_index = subgraph.reindex_from_parrent_nodes(
-                batch_train_samples)
-            feed_dict["node_label"] = np.expand_dims(
-                np.array(
-                    batch_train_labels, dtype="int64"), -1)
-            feed_dict["node_index"] = np.array(sub_node_index, dtype="int32")
-            yield feed_dict
-
-    return paddle.reader.buffered(reader, 1000)
@@ -63,10 +63,7 @@ def load_data(normalize=True, symmetry=True):
     log.info("Feature shape %s" % (repr(feature.shape)))
     graph = pgl.graph.Graph(
-        num_nodes=feature.shape[0],
-        edges=list(zip(src, dst)),
-        node_feat={"index": np.arange(
-            0, len(feature), dtype="int64")})
+        num_nodes=feature.shape[0], edges=list(zip(src, dst)))

     return {
         "graph": graph,
@@ -89,7 +86,13 @@ def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
     node_label = fluid.layers.data(
         "node_label", shape=[None, 1], dtype="int64", append_batch_size=False)

-    feature = fluid.layers.gather(feature, graph_wrapper.node_feat['index'])
+    parent_node_index = fluid.layers.data(
+        "parent_node_index",
+        shape=[None],
+        dtype="int64",
+        append_batch_size=False)
+
+    feature = fluid.layers.gather(feature, parent_node_index)
     feature.stop_gradient = True

     for i in range(k_hop):
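On the model side, features are no longer selected through an "index" node feature; the reader feeds parent_node_index and the model gathers rows from the full feature matrix. In numpy terms, the gather is plain row selection:

    import numpy as np

    feature = np.random.rand(1000, 16).astype("float32")     # full-graph features
    parent_node_index = np.array([3, 42, 7], dtype="int64")  # fed by the reader
    sub_feature = feature[parent_node_index]                 # what gather() computes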
@@ -221,59 +224,35 @@ def main(args):
     exe.run(startup_program)
     feature_init(place)

-    if args.sample_workers > 1:
-        train_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-    else:
-        train_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-
-    if args.sample_workers > 1:
-        val_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-    else:
-        val_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-
-    if args.sample_workers > 1:
-        test_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
-    else:
-        test_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
+    train_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        with_parent_node_index=True,
+        node_index=data['train_index'],
+        node_label=data["train_label"])
+
+    val_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        with_parent_node_index=True,
+        node_index=data['val_index'],
+        node_label=data["val_label"])
+
+    test_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        with_parent_node_index=True,
+        node_index=data['test_index'],
+        node_label=data["test_label"])

     for epoch in range(args.epoch):
         run_epoch(
...
@@ -262,59 +262,32 @@ def main(args):
     else:
         train_exe = exe

-    if args.sample_workers > 1:
-        train_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-    else:
-        train_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-
-    if args.sample_workers > 1:
-        val_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-    else:
-        val_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-
-    if args.sample_workers > 1:
-        test_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
-    else:
-        test_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
+    train_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['train_index'],
+        node_label=data["train_label"])
+
+    val_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['val_index'],
+        node_label=data["val_label"])
+
+    test_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['test_index'],
+        node_label=data["test_label"])

     for epoch in range(args.epoch):
         run_epoch(
...
@@ -97,11 +97,7 @@ def load_data(normalize=True, symmetry=True, scale=1):
     graph = pgl.graph.Graph(
         num_nodes=feature.shape[0],
         edges=edges,
-        node_feat={
-            "index": np.arange(
-                0, len(feature), dtype="int64"),
-            "feature": feature
-        })
+        node_feat={"feature": feature})

     return {
         "graph": graph,
@@ -244,59 +240,32 @@ def main(args):
     test_program = train_program.clone(for_test=True)

-    if args.sample_workers > 1:
-        train_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-    else:
-        train_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['train_index'],
-            node_label=data["train_label"])
-
-    if args.sample_workers > 1:
-        val_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-    else:
-        val_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['val_index'],
-            node_label=data["val_label"])
-
-    if args.sample_workers > 1:
-        test_iter = reader.multiprocess_graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            num_workers=args.sample_workers,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
-    else:
-        test_iter = reader.graph_reader(
-            data['graph'],
-            graph_wrapper,
-            samples=samples,
-            batch_size=args.batch_size,
-            node_index=data['test_index'],
-            node_label=data["test_label"])
+    train_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['train_index'],
+        node_label=data["train_label"])
+
+    val_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['val_index'],
+        node_label=data["val_label"])
+
+    test_iter = reader.multiprocess_graph_reader(
+        data['graph'],
+        graph_wrapper,
+        samples=samples,
+        num_workers=args.sample_workers,
+        batch_size=args.batch_size,
+        node_index=data['test_index'],
+        node_label=data["test_label"])

     with fluid.program_guard(train_program, startup_program):
         adam = fluid.optimizer.Adam(learning_rate=args.lr)
...
@@ -23,7 +23,7 @@ import tqdm
 import time
 import logging
 import random
-from pgl.contrib import heter_graph
+from pgl import heter_graph
 import pickle as pkl
@@ -71,8 +71,12 @@ class Dataset(object):
             if len(walk) > 1:
                 self.sentences_count += 1
                 for word in walk:
-                    self.token_count += 1
-                    word_freq[word] = word_freq.get(word, 0) + 1
+                    if int(word) >= self.config[
+                            'paper_start_index']:  # remove paper
+                        continue
+                    else:
+                        self.token_count += 1
+                        word_freq[word] = word_freq.get(word, 0) + 1
         wid = 0

         logging.info('Read %d sentences.' % self.sentences_count)
@@ -126,6 +130,10 @@ class Dataset(object):
         with open(filename) as reader:
             for line in reader:
                 words = line.strip().split()
+                words = [
+                    w for w in words
+                    if int(w) < self.config['paper_start_index']
+                ]
                 if len(words) > 1:
                     word_ids = [
                         self.word2id[w] for w in words if w in self.word2id
...
@@ -42,9 +42,10 @@ data_loader:
     walk_path: walks/*
     word2id_file: word2id.pkl
     batch_size: 32
-    win_size: 7  # default: 7
+    win_size: 5  # default: 7
     neg_num: 5
     min_count: 10
+    paper_start_index: 1697414

 model:
     type: SkipgramModel
...
@@ -28,7 +28,7 @@ import tqdm
 import time
 import logging
 import random
-from pgl.contrib import heter_graph
+from pgl import heter_graph
 from pgl.sample import metapath_randomwalk
 from utils import *
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
from pgl.utils import paddle_helper
from ogb.linkproppred import Evaluator
def send_func(src_feat, dst_feat, edge_feat):
"""send_func"""
return src_feat["h"]
def recv_func(feat):
"""recv_func"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, num_nodes, emb_dim, num_layers):
self.num_nodes = num_nodes
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.src_nodes = fluid.layers.data(
name='src_nodes',
shape=[None, 1],
dtype='int64', )
self.dst_nodes = fluid.layers.data(
name='dst_nodes',
shape=[None, 1],
dtype='int64', )
self.edge_label = fluid.layers.data(
name='edge_label',
shape=[None, 1],
dtype='float32', )
def forward(self, graph):
"""forward"""
h = fluid.layers.create_parameter(
shape=[self.num_nodes, self.emb_dim],
dtype="float32",
name=self.name + "_embedding")
# edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h)], )
h = graph.recv(msg, recv_func)
h = fluid.layers.fc(
h,
size=self.emb_dim,
bias_attr=False,
param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
h = h * graph.node_feat["norm"]
bias = fluid.layers.create_parameter(
shape=[self.emb_dim],
dtype='float32',
is_bias=True,
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
src = fluid.layers.gather(h, self.src_nodes)
dst = fluid.layers.gather(h, self.dst_nodes)
edge_embed = src * dst
pred = fluid.layers.fc(input=edge_embed,
size=1,
name=self.name + "_pred_output")
prob = fluid.layers.sigmoid(pred)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred,
self.edge_label)
loss = fluid.layers.reduce_mean(loss)
return pred, prob, loss
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbl-ppa",
help='dataset name (default: protein protein associations)')
args = parser.parse_args()
#place = fluid.CUDAPlace(0)
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
print("loadding dataset")
dataset = PglLinkPropPredDataset(name=args.dataset)
splitted_edge = dataset.get_edge_split()
print(splitted_edge['train_edge'].shape)
print(splitted_edge['train_edge_label'].shape)
print("building evaluator")
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data = dataset[0]
print("num_nodes: %d" % graph_data.num_nodes)
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
with fluid.program_guard(train_program, startup_program):
model = GNNModel(
name="gnn",
num_nodes=graph_data.num_nodes,
emb_dim=64,
num_layers=2)
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place,
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred, prob, loss = model.forward(gw)
val_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(loss)
exe = fluid.Executor(place)
exe.run(startup_program)
feed = gw.to_feed(graph_data)
for epoch in range(1, args.epochs + 1):
feed['src_nodes'] = splitted_edge["train_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["train_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["train_edge_label"].astype(
"float32").reshape(-1, 1)
res_loss, y_pred = exe.run(train_program,
feed=feed,
fetch_list=[loss, prob])
print("Loss %s" % res_loss[0])
result = {}
print("Evaluating...")
feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["valid_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["valid"] = evaluator.eval(input_dict)
feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["test_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["test_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["test"] = evaluator.eval(input_dict)
print(result)
if __name__ == "__main__":
main()
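Both OGB examples scale node vectors by indegree ** -0.5, with isolated nodes left at zero (the node-prediction script below applies it on both the send and the receive side, approximating symmetric normalization). A standalone sketch of the norm computation used above:

    import numpy as np

    indegree = np.array([0, 1, 4, 9])
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    print(norm)  # [0.  1.  0.5  0.333...]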
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
from pgl.utils import paddle_helper
from ogb.nodeproppred import Evaluator
def train():
pass
def send_func(src_feat, dst_feat, edge_feat):
return (src_feat["h"] + edge_feat["h"]) * src_feat["norm"]
class GNNModel(object):
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
def forward(self, graph):
h = fluid.layers.embedding(
graph.node_feat["x"],
size=(2, self.emb_dim)) # name=self.name + "_embedding")
edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h), ("norm", graph.node_feat["norm"])],
efeat_list=[("h", edge_attr)])
h = graph.recv(msg, "sum")
h = fluid.layers.fc(
h,
size=self.emb_dim,
bias_attr=False,
param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
h = h * graph.node_feat["norm"]
bias = fluid.layers.create_parameter(
shape=[self.emb_dim],
dtype='float32',
is_bias=True,
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
pred = fluid.layers.fc(h,
self.num_task,
act=None,
name=self.name + "_pred_output")
return pred
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbn-proteins",
help='dataset name (default: proteinfunc)')
args = parser.parse_args()
#device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
#place = fluid.CUDAPlace(0)
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglNodePropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[0]
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
"float32")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_node_index, init = paddle_helper.constant(
"train_node_index", dtype="int64", value=splitted_idx["train"])
initializer.append(init)
train_node_label, init = paddle_helper.constant(
"train_node_label",
dtype="float32",
value=label[splitted_idx["train"]].astype("float32"))
initializer.append(init)
train_pred_t = fluid.layers.gather(pred, train_node_index)
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=train_pred_t, label=train_node_label)
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
train_pred_t = fluid.layers.sigmoid(train_pred_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
gw.initialize(place)
for init in initializer:
init(place)
for epoch in range(1, args.epochs + 1):
loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
print("Loss %s" % loss[0])
print("Evaluating...")
y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]
result = {}
input_dict = {
"y_true": label[splitted_idx["train"]],
"y_pred": y_pred[splitted_idx["train"]]
}
result["train"] = evaluator.eval(input_dict)
input_dict = {
"y_true": label[splitted_idx["valid"]],
"y_pred": y_pred[splitted_idx["valid"]]
}
result["valid"] = evaluator.eval(input_dict)
input_dict = {
"y_true": label[splitted_idx["test"]],
"y_pred": y_pred[splitted_idx["test"]]
}
result["test"] = evaluator.eval(input_dict)
print(result)
if __name__ == "__main__":
main()
@@ -18,4 +18,6 @@ from pgl import layers
 from pgl import graph_wrapper
 from pgl import graph
 from pgl import data_loader
+from pgl import heter_graph
+from pgl import heter_graph_wrapper
 from pgl import contrib
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PglGraphPropPredDataset
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.graphproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglGraphPropPredDataset(object):
"""PglGraphPropPredDataset"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbg-mol-tox21
self.dir_name = "_".join(
name.split("-")
) + "_pgl" ## replace hyphen with underline, e.g., ogbg_mol_tox21_dgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(
#os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
print(self.name)
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., tox21
self.num_tasks = int(self.meta_info[self.name]["num tasks"])
self.task_type = self.meta_info[self.name]["task type"]
super(PglGraphPropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""Pre-processing"""
processed_dir = osp.join(self.root, 'processed')
raw_dir = osp.join(self.root, 'raw')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if os.path.exists(pre_processed_file_path):
# TODO: Load Preprocessed
pass
else:
### download
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete folder if there exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
### preprocess
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graphs = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
self.graphs = np.array(self.graphs)
self.labels = np.array(
pd.read_csv(
osp.join(raw_dir, "graph-label.csv.gz"),
compression="gzip",
header=None).values)
# TODO: Load Graph
### load preprocessed files
def get_idx_split(self):
"""Train/Valid/Test split"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values.T[0]
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values.T[0]
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values.T[0]
return {
"train": np.array(
train_idx, dtype="int64"),
"valid": np.array(
valid_idx, dtype="int64"),
"test": np.array(
test_idx, dtype="int64")
}
def __getitem__(self, idx):
"""Get datapoint with index"""
return self.graphs[idx], self.labels[idx]
def __len__(self):
"""Length of the dataset
Returns
-------
int
Length of Dataset
"""
return len(self.graphs)
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
splitted_index = pgl_dataset.get_idx_split()
print(pgl_dataset)
print(pgl_dataset[3:20])
#print(pgl_dataset[splitted_index["train"]])
#print(pgl_dataset[splitted_index["valid"]])
#print(pgl_dataset[splitted_index["test"]])
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,8 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Generate Contrib api
+"""__init__.py
 """
-from pgl.contrib import heter_graph
-from pgl.contrib import heter_graph_wrapper
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""pgl read_csv_graph for ogb
"""
import pandas as pd
import os.path as osp
import numpy as np
import pgl
from ogb.io.read_graph_raw import read_csv_graph_raw
def read_csv_graph_pgl(raw_dir, add_inverse_edge=False):
"""Read CSV data and build PGL Graph
"""
graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge)
pgl_graph_list = []
for graph in graph_list:
edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)
if graph["edge_feat"] is not None:
g.edge_feat["feat"] = graph["edge_feat"]
if graph["node_feat"] is not None:
g.node_feat["feat"] = graph["node_feat"]
pgl_graph_list.append(g)
return pgl_graph_list
if __name__ == "__main__":
# graph_list = read_csv_graph_dgl('dataset/proteinfunc_v2/raw', add_inverse_edge = True)
graph_list = read_csv_graph_pgl(
'dataset/ogbn_proteins_pgl/raw', add_inverse_edge=True)
print(graph_list)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LinkPropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.linkproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglLinkPropPredDataset(object):
"""PglLinkPropPredDataset
"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbl-ppa
self.dir_name = "_".join(name.split(
"-")) + "_pgl" ## replace hyphen with underline, e.g., ogbl_ppa_pgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
print(self.name)
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., ppassoc
self.task_type = self.meta_info[self.name]["task type"]
super(PglLinkPropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""pre_process downlaoding data
"""
processed_dir = osp.join(self.root, 'processed')
pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')
if osp.exists(pre_processed_file_path):
#TODO: Reload Preprocess files
pass
else:
### check download
if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete folder if there exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
raw_dir = osp.join(self.root, "raw")
### pre-process and save
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graph = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
#TODO: SAVE preprocess graph
def get_edge_split(self):
"""Train/Validation/Test split
"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values
if self.task_type == "link prediction":
target_type = np.int64
else:
target_type = np.float32
return {
"train_edge": np.array(
train_idx[:, :2], dtype="int64"),
"train_edge_label": np.array(
train_idx[:, 2], dtype=target_type),
"valid_edge": np.array(
valid_idx[:, :2], dtype="int64"),
"valid_edge_label": np.array(
valid_idx[:, 2], dtype=target_type),
"test_edge": np.array(
test_idx[:, :2], dtype="int64"),
"test_edge_label": np.array(
test_idx[:, 2], dtype=target_type)
}
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self.graph[0]
def __len__(self):
return 1
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglLinkPropPredDataset(name="ogbl-ppa")
splitted_edge = pgl_dataset.get_edge_split()
print(pgl_dataset[0])
print(splitted_edge)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NodePropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.nodeproppred import make_master_file # create master.csv
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglNodePropPredDataset(object):
"""PglNodePropPredDataset
"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbn-proteins
self.dir_name = "_".join(
name.split("-")
) + "_pgl" ## replace hyphen with underline, e.g., ogbn_proteins_pgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(
#os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., tox21
self.num_tasks = int(self.meta_info[self.name]["num tasks"])
self.task_type = self.meta_info[self.name]["task type"]
super(PglNodePropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""pre_process downlaoding data
"""
processed_dir = osp.join(self.root, 'processed')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if osp.exists(pre_processed_file_path):
# TODO: Reload Preprocess files
pass
else:
### check download
if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete folder if there exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
raw_dir = osp.join(self.root, "raw")
### pre-process and save
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graph = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
### adding prediction target
node_label = pd.read_csv(
osp.join(raw_dir, 'node-label.csv.gz'),
compression="gzip",
header=None).values
if "classification" in self.task_type:
node_label = np.array(node_label, dtype=np.int64)
else:
node_label = np.array(node_label, dtype=np.float32)
label_dict = {"labels": node_label}
# TODO: SAVE preprocess graph
self.labels = label_dict['labels']
def get_idx_split(self):
"""Train/Validation/Test split
"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values.T[0]
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values.T[0]
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values.T[0]
return {
"train": np.array(
train_idx, dtype="int64"),
"valid": np.array(
valid_idx, dtype="int64"),
"test": np.array(
test_idx, dtype="int64")
}
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self.graph[idx], self.labels
def __len__(self):
return 1
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglNodePropPredDataset(name="ogbn-proteins")
splitted_index = pgl_dataset.get_idx_split()
print(pgl_dataset[0])
print(splitted_index)
@@ -15,6 +15,7 @@
 This package implement Graph structure for handling graph data.
 """

+import os
 import numpy as np
 import pickle as pkl
 import time
@@ -77,6 +78,15 @@ class EdgeIndex(object):
         """
         return self._sorted_u, self._sorted_v, self._sorted_eid

+    def dump(self, path):
+        if not os.path.exists(path):
+            os.makedirs(path)
+        np.save(path + '/degree.npy', self._degree)
+        np.save(path + '/sorted_u.npy', self._sorted_u)
+        np.save(path + '/sorted_v.npy', self._sorted_v)
+        np.save(path + '/sorted_eid.npy', self._sorted_eid)
+        np.save(path + '/indptr.npy', self._indptr)
+
 class Graph(object):
     """Implementation of graph structure in pgl.
@@ -136,6 +146,18 @@ class Graph(object):
         self._adj_src_index = None
         self._adj_dst_index = None

+    def dump(self, path):
+        if not os.path.exists(path):
+            os.makedirs(path)
+
+        np.save(path + '/num_nodes.npy', self._num_nodes)
+        np.save(path + '/edges.npy', self._edges)
+
+        if self._adj_src_index:
+            self._adj_src_index.dump(path + '/adj_src')
+        if self._adj_dst_index:
+            self._adj_dst_index.dump(path + '/adj_dst')
+
     @property
     def adj_src_index(self):
         """Return an EdgeIndex object for src.
@@ -506,7 +528,13 @@ class Graph(object):
                 (key, _hide_num_nodes(value.shape), value.dtype))
         return edge_feat_info

-    def subgraph(self, nodes, eid=None, edges=None):
+    def subgraph(self,
+                 nodes,
+                 eid=None,
+                 edges=None,
+                 edge_feats=None,
+                 with_node_feat=True,
+                 with_edge_feat=True):
         """Generate subgraph with nodes and edge ids.

         This function will generate a :code:`pgl.graph.Subgraph` object and
@@ -521,6 +549,10 @@ class Graph(object):
             eid (optional): Edge ids which will be included in the subgraph.

             edges (optional): Edge(src, dst) list which will be included in the subgraph.

+            with_node_feat: Whether to inherit node features from parent graph.
+
+            with_edge_feat: Whether to inherit edge features from parent graph.
+
         Return:
             A :code:`pgl.graph.Subgraph` object.
@@ -543,14 +575,20 @@ class Graph(object):
                 len(edges), dtype="int64"), edges, reindex)

         sub_edge_feat = {}
-        for key, value in self._edge_feat.items():
-            if eid is None:
-                raise ValueError("Eid can not be None with edge features.")
-            sub_edge_feat[key] = value[eid]
+        if edges is None:
+            if with_edge_feat:
+                for key, value in self._edge_feat.items():
+                    if eid is None:
+                        raise ValueError(
+                            "Eid can not be None with edge features.")
+                    sub_edge_feat[key] = value[eid]
+        else:
+            sub_edge_feat = edge_feats

         sub_node_feat = {}
-        for key, value in self._node_feat.items():
-            sub_node_feat[key] = value[nodes]
+        if with_node_feat:
+            for key, value in self._node_feat.items():
+                sub_node_feat[key] = value[nodes]

         subgraph = SubGraph(
             num_nodes=len(nodes),
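A hedged usage sketch of the extended signature on a toy graph (PGL >= 1.1): build the induced subgraph from explicit (src, dst) pairs and skip the feature copies, as the GraphSAGE sampler above does:

    import numpy as np
    import pgl

    g = pgl.graph.Graph(
        num_nodes=4,
        edges=[(0, 1), (1, 2), (2, 3)],
        node_feat={"feature": np.random.rand(4, 8).astype("float32")})

    sub = g.subgraph(
        nodes=[0, 1, 2],
        edges=[(0, 1), (1, 2)],  # (src, dst) pairs instead of edge ids
        with_node_feat=False,    # features re-attached later via parent_node_index
        with_edge_feat=False)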
@@ -779,3 +817,27 @@ class SubGraph(Graph):
             A list of node ids in parent graph.
         """
         return graph_kernel.map_nodes(nodes, self._to_reindex)
+
+
+class MemmapEdgeIndex(EdgeIndex):
+    def __init__(self, path):
+        self._degree = np.load(path + '/degree.npy', mmap_mode="r")
+        self._sorted_u = np.load(path + '/sorted_u.npy', mmap_mode="r")
+        self._sorted_v = np.load(path + '/sorted_v.npy', mmap_mode="r")
+        self._sorted_eid = np.load(path + '/sorted_eid.npy', mmap_mode="r")
+        self._indptr = np.load(path + '/indptr.npy', mmap_mode="r")
+
+
+class MemmapGraph(Graph):
+    def __init__(self, path):
+        self._num_nodes = np.load(path + '/num_nodes.npy')
+        self._edges = np.load(path + '/edges.npy', mmap_mode="r")
+
+        if os.path.exists(path + '/adj_src'):
+            self._adj_src_index = MemmapEdgeIndex(path + '/adj_src')
+        else:
+            self._adj_src_index = None
+
+        if os.path.exists(path + '/adj_dst'):
+            self._adj_dst_index = MemmapEdgeIndex(path + '/adj_dst')
+        else:
+            self._adj_dst_index = None
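dump() and MemmapGraph are meant as a pair: write a graph's arrays to disk once, then reopen them memory-mapped so a large graph need not be rebuilt in every process. A minimal sketch; note dump() only saves adjacency indexes that have already been constructed:

    import pgl

    g = pgl.graph.Graph(num_nodes=4, edges=[(0, 1), (1, 2), (2, 3)])
    _ = g.adj_src_index, g.adj_dst_index  # force index construction before dump
    g.dump("/tmp/toy_graph")

    mm = pgl.graph.MemmapGraph("/tmp/toy_graph")  # arrays load with mmap_mode="r"
    print(mm.num_nodes)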
...@@ -89,8 +89,8 @@ class BaseGraphWrapper(object): ...@@ -89,8 +89,8 @@ class BaseGraphWrapper(object):
""" """
def __init__(self): def __init__(self):
self._node_feat_tensor_dict = {} self.node_feat_tensor_dict = {}
self._edge_feat_tensor_dict = {} self.edge_feat_tensor_dict = {}
self._edges_src = None self._edges_src = None
self._edges_dst = None self._edges_dst = None
self._num_nodes = None self._num_nodes = None
...@@ -98,6 +98,10 @@ class BaseGraphWrapper(object): ...@@ -98,6 +98,10 @@ class BaseGraphWrapper(object):
self._edge_uniq_dst = None self._edge_uniq_dst = None
self._edge_uniq_dst_count = None self._edge_uniq_dst_count = None
self._node_ids = None self._node_ids = None
self._data_name_prefix = ""
def __repr__(self):
return self._data_name_prefix
def send(self, message_func, nfeat_list=None, efeat_list=None): def send(self, message_func, nfeat_list=None, efeat_list=None):
"""Send message from all src nodes to dst nodes. """Send message from all src nodes to dst nodes.
...@@ -220,7 +224,7 @@ class BaseGraphWrapper(object): ...@@ -220,7 +224,7 @@ class BaseGraphWrapper(object):
A dictionary whose keys are the feature names and the values A dictionary whose keys are the feature names and the values
are feature tensor. are feature tensor.
""" """
return self._edge_feat_tensor_dict return self.edge_feat_tensor_dict
@property @property
def node_feat(self): def node_feat(self):
...@@ -230,7 +234,7 @@ class BaseGraphWrapper(object): ...@@ -230,7 +234,7 @@ class BaseGraphWrapper(object):
A dictionary whose keys are the feature names and the values A dictionary whose keys are the feature names and the values
are feature tensor. are feature tensor.
""" """
return self._node_feat_tensor_dict return self.node_feat_tensor_dict
def indegree(self): def indegree(self):
"""Return the indegree tensor for all nodes. """Return the indegree tensor for all nodes.
...@@ -298,8 +302,8 @@ class StaticGraphWrapper(BaseGraphWrapper): ...@@ -298,8 +302,8 @@ class StaticGraphWrapper(BaseGraphWrapper):
def __init__(self, name, graph, place): def __init__(self, name, graph, place):
super(StaticGraphWrapper, self).__init__() super(StaticGraphWrapper, self).__init__()
self._data_name_prefix = name
self._initializers = [] self._initializers = []
self.__data_name_prefix = name
self.__create_graph_attr(graph) self.__create_graph_attr(graph)
def __create_graph_attr(self, graph): def __create_graph_attr(self, graph):
...@@ -326,43 +330,43 @@ class StaticGraphWrapper(BaseGraphWrapper): ...@@ -326,43 +330,43 @@ class StaticGraphWrapper(BaseGraphWrapper):
self._edges_src, init = paddle_helper.constant( self._edges_src, init = paddle_helper.constant(
dtype="int64", dtype="int64",
value=src, value=src,
name=self.__data_name_prefix + '/edges_src') name=self._data_name_prefix + '/edges_src')
self._initializers.append(init) self._initializers.append(init)
self._edges_dst, init = paddle_helper.constant( self._edges_dst, init = paddle_helper.constant(
dtype="int64", dtype="int64",
value=dst, value=dst,
name=self.__data_name_prefix + '/edges_dst') name=self._data_name_prefix + '/edges_dst')
self._initializers.append(init) self._initializers.append(init)
self._num_nodes, init = paddle_helper.constant( self._num_nodes, init = paddle_helper.constant(
dtype="int64", dtype="int64",
hide_batch_size=False, hide_batch_size=False,
value=np.array([graph.num_nodes]), value=np.array([graph.num_nodes]),
name=self.__data_name_prefix + '/num_nodes') name=self._data_name_prefix + '/num_nodes')
self._initializers.append(init) self._initializers.append(init)
self._edge_uniq_dst, init = paddle_helper.constant( self._edge_uniq_dst, init = paddle_helper.constant(
name=self.__data_name_prefix + "/uniq_dst", name=self._data_name_prefix + "/uniq_dst",
dtype="int64", dtype="int64",
value=uniq_dst) value=uniq_dst)
self._initializers.append(init) self._initializers.append(init)
self._edge_uniq_dst_count, init = paddle_helper.constant( self._edge_uniq_dst_count, init = paddle_helper.constant(
name=self.__data_name_prefix + "/uniq_dst_count", name=self._data_name_prefix + "/uniq_dst_count",
dtype="int32", dtype="int32",
value=uniq_dst_count) value=uniq_dst_count)
self._initializers.append(init) self._initializers.append(init)
node_ids_value = np.arange(0, graph.num_nodes, dtype="int64") node_ids_value = np.arange(0, graph.num_nodes, dtype="int64")
self._node_ids, init = paddle_helper.constant( self._node_ids, init = paddle_helper.constant(
name=self.__data_name_prefix + "/node_ids", name=self._data_name_prefix + "/node_ids",
dtype="int64", dtype="int64",
value=node_ids_value) value=node_ids_value)
self._initializers.append(init) self._initializers.append(init)
self._indegree, init = paddle_helper.constant( self._indegree, init = paddle_helper.constant(
name=self.__data_name_prefix + "/indegree", name=self._data_name_prefix + "/indegree",
dtype="int64", dtype="int64",
value=indegree) value=indegree)
self._initializers.append(init) self._initializers.append(init)
...@@ -373,9 +377,9 @@ class StaticGraphWrapper(BaseGraphWrapper): ...@@ -373,9 +377,9 @@ class StaticGraphWrapper(BaseGraphWrapper):
for node_feat_name, node_feat_value in node_feat.items(): for node_feat_name, node_feat_value in node_feat.items():
node_feat_shape = node_feat_value.shape node_feat_shape = node_feat_value.shape
node_feat_dtype = node_feat_value.dtype node_feat_dtype = node_feat_value.dtype
self._node_feat_tensor_dict[ self.node_feat_tensor_dict[
node_feat_name], init = paddle_helper.constant( node_feat_name], init = paddle_helper.constant(
name=self.__data_name_prefix + '/node_feat/' + name=self._data_name_prefix + '/node_feat/' +
node_feat_name, node_feat_name,
dtype=node_feat_dtype, dtype=node_feat_dtype,
value=node_feat_value) value=node_feat_value)
...@@ -387,9 +391,9 @@ class StaticGraphWrapper(BaseGraphWrapper): ...@@ -387,9 +391,9 @@ class StaticGraphWrapper(BaseGraphWrapper):
for edge_feat_name, edge_feat_value in edge_feat.items(): for edge_feat_name, edge_feat_value in edge_feat.items():
edge_feat_shape = edge_feat_value.shape edge_feat_shape = edge_feat_value.shape
edge_feat_dtype = edge_feat_value.dtype edge_feat_dtype = edge_feat_value.dtype
self._edge_feat_tensor_dict[ self.edge_feat_tensor_dict[
edge_feat_name], init = paddle_helper.constant( edge_feat_name], init = paddle_helper.constant(
name=self.__data_name_prefix + '/edge_feat/' + name=self._data_name_prefix + '/edge_feat/' +
edge_feat_name, edge_feat_name,
dtype=edge_feat_dtype, dtype=edge_feat_dtype,
value=edge_feat_value) value=edge_feat_value)
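For context, a minimal usage sketch of StaticGraphWrapper after the attribute renames above; the toy graph and feature values are illustrative, and `initialize` is assumed to be the entry point that runs the collected constant initializers:

import numpy as np
import paddle.fluid as fluid
from pgl.graph import Graph
from pgl.graph_wrapper import StaticGraphWrapper

# a toy 3-node graph with one float feature (values made up)
graph = Graph(
    num_nodes=3,
    edges=np.array([(0, 1), (1, 2)], dtype="int64"),
    node_feat={"feature": np.random.rand(3, 4).astype("float32")})

place = fluid.CPUPlace()
# the whole graph, features included, is baked into the program as constants
gw = StaticGraphWrapper(name="graph", graph=graph, place=place)

exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
gw.initialize(place)  # assumed to run the collected self._initializers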
...@@ -477,8 +481,8 @@ class GraphWrapper(BaseGraphWrapper): ...@@ -477,8 +481,8 @@ class GraphWrapper(BaseGraphWrapper):
def __init__(self, name, place, node_feat=[], edge_feat=[]): def __init__(self, name, place, node_feat=[], edge_feat=[]):
super(GraphWrapper, self).__init__() super(GraphWrapper, self).__init__()
# collect holders for PyReader # collect holders for PyReader
self._data_name_prefix = name
self._holder_list = [] self._holder_list = []
self.__data_name_prefix = name
self._place = place self._place = place
self.__create_graph_attr_holders() self.__create_graph_attr_holders()
for node_feat_name, node_feat_shape, node_feat_dtype in node_feat: for node_feat_name, node_feat_shape, node_feat_dtype in node_feat:
...@@ -493,43 +497,43 @@ class GraphWrapper(BaseGraphWrapper): ...@@ -493,43 +497,43 @@ class GraphWrapper(BaseGraphWrapper):
"""Create data holders for graph attributes. """Create data holders for graph attributes.
""" """
self._edges_src = fluid.layers.data( self._edges_src = fluid.layers.data(
self.__data_name_prefix + '/edges_src', self._data_name_prefix + '/edges_src',
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int64", dtype="int64",
stop_gradient=True) stop_gradient=True)
self._edges_dst = fluid.layers.data( self._edges_dst = fluid.layers.data(
self.__data_name_prefix + '/edges_dst', self._data_name_prefix + '/edges_dst',
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int64", dtype="int64",
stop_gradient=True) stop_gradient=True)
self._num_nodes = fluid.layers.data( self._num_nodes = fluid.layers.data(
self.__data_name_prefix + '/num_nodes', self._data_name_prefix + '/num_nodes',
shape=[1], shape=[1],
append_batch_size=False, append_batch_size=False,
dtype='int64', dtype='int64',
stop_gradient=True) stop_gradient=True)
self._edge_uniq_dst = fluid.layers.data( self._edge_uniq_dst = fluid.layers.data(
self.__data_name_prefix + "/uniq_dst", self._data_name_prefix + "/uniq_dst",
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int64", dtype="int64",
stop_gradient=True) stop_gradient=True)
self._edge_uniq_dst_count = fluid.layers.data( self._edge_uniq_dst_count = fluid.layers.data(
self.__data_name_prefix + "/uniq_dst_count", self._data_name_prefix + "/uniq_dst_count",
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int32", dtype="int32",
stop_gradient=True) stop_gradient=True)
self._node_ids = fluid.layers.data( self._node_ids = fluid.layers.data(
self.__data_name_prefix + "/node_ids", self._data_name_prefix + "/node_ids",
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int64", dtype="int64",
stop_gradient=True) stop_gradient=True)
self._indegree = fluid.layers.data( self._indegree = fluid.layers.data(
self.__data_name_prefix + "/indegree", self._data_name_prefix + "/indegree",
shape=[None], shape=[None],
append_batch_size=False, append_batch_size=False,
dtype="int64", dtype="int64",
...@@ -545,12 +549,12 @@ class GraphWrapper(BaseGraphWrapper): ...@@ -545,12 +549,12 @@ class GraphWrapper(BaseGraphWrapper):
"""Create data holders for node features. """Create data holders for node features.
""" """
feat_holder = fluid.layers.data( feat_holder = fluid.layers.data(
self.__data_name_prefix + '/node_feat/' + node_feat_name, self._data_name_prefix + '/node_feat/' + node_feat_name,
shape=node_feat_shape, shape=node_feat_shape,
append_batch_size=False, append_batch_size=False,
dtype=node_feat_dtype, dtype=node_feat_dtype,
stop_gradient=True) stop_gradient=True)
self._node_feat_tensor_dict[node_feat_name] = feat_holder self.node_feat_tensor_dict[node_feat_name] = feat_holder
self._holder_list.append(feat_holder) self._holder_list.append(feat_holder)
def __create_graph_edge_feat_holders(self, edge_feat_name, edge_feat_shape, def __create_graph_edge_feat_holders(self, edge_feat_name, edge_feat_shape,
...@@ -558,12 +562,12 @@ class GraphWrapper(BaseGraphWrapper): ...@@ -558,12 +562,12 @@ class GraphWrapper(BaseGraphWrapper):
"""Create edge holders for edge features. """Create edge holders for edge features.
""" """
feat_holder = fluid.layers.data( feat_holder = fluid.layers.data(
self.__data_name_prefix + '/edge_feat/' + edge_feat_name, self._data_name_prefix + '/edge_feat/' + edge_feat_name,
shape=edge_feat_shape, shape=edge_feat_shape,
append_batch_size=False, append_batch_size=False,
dtype=edge_feat_dtype, dtype=edge_feat_dtype,
stop_gradient=True) stop_gradient=True)
self._edge_feat_tensor_dict[edge_feat_name] = feat_holder self.edge_feat_tensor_dict[edge_feat_name] = feat_holder
self._holder_list.append(feat_holder) self._holder_list.append(feat_holder)
def to_feed(self, graph): def to_feed(self, graph):
...@@ -594,20 +598,21 @@ class GraphWrapper(BaseGraphWrapper): ...@@ -594,20 +598,21 @@ class GraphWrapper(BaseGraphWrapper):
edge_feat[key] = value[eid] edge_feat[key] = value[eid]
node_feat = graph.node_feat node_feat = graph.node_feat
feed_dict[self.__data_name_prefix + '/edges_src'] = src feed_dict[self._data_name_prefix + '/edges_src'] = src
feed_dict[self.__data_name_prefix + '/edges_dst'] = dst feed_dict[self._data_name_prefix + '/edges_dst'] = dst
feed_dict[self.__data_name_prefix + '/num_nodes'] = np.array(graph.num_nodes) feed_dict[self._data_name_prefix + '/num_nodes'] = np.array(
feed_dict[self.__data_name_prefix + '/uniq_dst'] = uniq_dst graph.num_nodes)
feed_dict[self.__data_name_prefix + '/uniq_dst_count'] = uniq_dst_count feed_dict[self._data_name_prefix + '/uniq_dst'] = uniq_dst
feed_dict[self.__data_name_prefix + '/node_ids'] = graph.nodes feed_dict[self._data_name_prefix + '/uniq_dst_count'] = uniq_dst_count
feed_dict[self.__data_name_prefix + '/indegree'] = indegree feed_dict[self._data_name_prefix + '/node_ids'] = graph.nodes
feed_dict[self._data_name_prefix + '/indegree'] = indegree
for key in self._node_feat_tensor_dict:
feed_dict[self.__data_name_prefix + '/node_feat/' + for key in self.node_feat_tensor_dict:
feed_dict[self._data_name_prefix + '/node_feat/' +
key] = node_feat[key] key] = node_feat[key]
for key in self._edge_feat_tensor_dict: for key in self.edge_feat_tensor_dict:
feed_dict[self.__data_name_prefix + '/edge_feat/' + feed_dict[self._data_name_prefix + '/edge_feat/' +
key] = edge_feat[key] key] = edge_feat[key]
return feed_dict return feed_dict
......
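A rough sketch of the GraphWrapper flow under the new `_data_name_prefix` naming; the feature spec and toy graph below are made-up values for illustration:

import numpy as np
import paddle.fluid as fluid
from pgl.graph import Graph
from pgl.graph_wrapper import GraphWrapper

place = fluid.CPUPlace()
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # holders are declared from (name, shape, dtype) specs
    gw = GraphWrapper(
        name="graph",
        place=place,
        node_feat=[("feature", [None, 16], "float32")])

# a toy graph matching the declared feature spec (values made up)
graph = Graph(
    num_nodes=3,
    edges=np.array([(0, 1), (1, 2)], dtype="int64"),
    node_feat={"feature": np.random.rand(3, 16).astype("float32")})

# every key in the feed dict carries the "graph/..." prefix set above,
# e.g. "graph/edges_src" and "graph/node_feat/feature"
feed_dict = gw.to_feed(graph)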
...@@ -64,8 +64,8 @@ class HeterGraphWrapper(object): ...@@ -64,8 +64,8 @@ class HeterGraphWrapper(object):
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
from pgl.contrib import heter_graph from pgl import heter_graph
from pgl.contrib import heter_graph_wrapper from pgl import heter_graph_wrapper
num_nodes = 4 num_nodes = 4
node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')] node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')]
edges = { edges = {
......
...@@ -28,7 +28,7 @@ import pgl.graph as pgraph ...@@ -28,7 +28,7 @@ import pgl.graph as pgraph
import pickle as pkl import pickle as pkl
from pgl.utils.logger import log from pgl.utils.logger import log
import pgl.graph_kernel as graph_kernel import pgl.graph_kernel as graph_kernel
from pgl.contrib import heter_graph from pgl import heter_graph
import pgl.redis_graph as rg import pgl.redis_graph as rg
......
...@@ -24,10 +24,29 @@ from pgl import graph_kernel ...@@ -24,10 +24,29 @@ from pgl import graph_kernel
__all__ = [ __all__ = [
'graphsage_sample', 'node2vec_sample', 'deepwalk_sample', 'graphsage_sample', 'node2vec_sample', 'deepwalk_sample',
'metapath_randomwalk' 'metapath_randomwalk', 'pinsage_sample'
] ]
def traverse(item):
"""traverse the list or numpy"""
if isinstance(item, list) or isinstance(item, np.ndarray):
for i in iter(item):
for j in traverse(i):
yield j
else:
yield item
def flat_node_and_edge(nodes, eids, weights=None):
"""flatten the sub-lists to one list"""
nodes = list(set(traverse(nodes)))
eids = list(traverse(eids))
if weights is not None:
weights = list(traverse(weights))
return nodes, eids, weights
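For illustration, how these helpers behave on nested input (values made up; note that nodes pass through `set`, so their order is not guaranteed):

import numpy as np
from pgl.sample import flat_node_and_edge  # helper defined in this module

nodes, eids, weights = flat_node_and_edge(
    [[0, 1], np.array([1, 2])],  # nested node ids
    [[10], [11, 12]])            # nested edge ids
print(sorted(nodes))  # [0, 1, 2] -- deduplicated
print(eids)           # [10, 11, 12] -- flattened, duplicates kept
print(weights)        # None -- no weights were passed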
def edge_hash(src, dst): def edge_hash(src, dst):
"""edge_hash """edge_hash
""" """
...@@ -88,7 +107,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]): ...@@ -88,7 +107,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]):
start_nodes = list(nodes_set - last_nodes_set) start_nodes = list(nodes_set - last_nodes_set)
layer_nodes = [nodes] + layer_nodes layer_nodes = [nodes] + layer_nodes
layer_eids = [eids] + layer_eids layer_eids = [eids] + layer_eids
log.debug("flat time: %s" % (time.time() - start))
start = time.time() start = time.time()
# Find new nodes # Find new nodes
...@@ -317,3 +335,146 @@ def metapath_randomwalk(graph, ...@@ -317,3 +335,146 @@ def metapath_randomwalk(graph,
cur_nodes = np.array(nxt_cur_nodes) cur_nodes = np.array(nxt_cur_nodes)
return walk return walk
def random_walk_with_start_prob(graph, nodes, max_depth, proba=0.5):
"""Implement of random walk with the probability of returning the origin node.
This function get random walks path for given nodes and depth.
Args:
nodes: Walk starting from nodes
max_depth: Max walking depth
proba: the proba to return the origin node
Return:
A list of walks.
"""
walk = []
# init
for node in nodes:
walk.append([node])
walk_ids = np.arange(0, len(nodes))
cur_nodes = np.array(nodes)
nodes = np.array(nodes)
for l in range(max_depth):
# select the walks not end
if l >= 1:
return_proba = np.random.rand(cur_nodes.shape[0])
proba_mask = (return_proba < proba)
cur_nodes[proba_mask] = nodes[proba_mask]
outdegree = graph.outdegree(cur_nodes)
mask = (outdegree != 0)
if np.any(mask):
cur_walk_ids = walk_ids[mask]
outdegree = outdegree[mask]
else:
            # no current node has a successor; skip this step so the next
            # iteration can return the stalled walks to their origin nodes
continue
succ = graph.successor(cur_nodes[mask])
sample_index = np.floor(
np.random.rand(outdegree.shape[0]) * outdegree).astype("int64")
nxt_cur_nodes = cur_nodes
for s, ind, walk_id in zip(succ, sample_index, cur_walk_ids):
walk[walk_id].append(s[ind])
nxt_cur_nodes[walk_id] = s[ind]
cur_nodes = np.array(nxt_cur_nodes)
return walk
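A minimal sketch of driving the walker, with a toy graph in place of real data (edges, start nodes, and depth below are illustrative):

import numpy as np
from pgl.graph import Graph
from pgl.sample import random_walk_with_start_prob

# a toy 4-node cycle; edges are made up for illustration
graph = Graph(
    num_nodes=4,
    edges=np.array([(0, 1), (1, 2), (2, 3), (3, 0)], dtype="int64"))

# 3 steps from nodes 0 and 2; after the first step each walk jumps
# back to its origin node with probability 0.5
walks = random_walk_with_start_prob(graph, [0, 2], max_depth=3, proba=0.5)
print(walks)  # e.g. [[0, 1, 2, 0], [2, 3, 0, 1]] -- one path per start node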
def pinsage_sample(graph,
                   nodes,
                   samples,
                   top_k=10,
                   proba=0.5,
                   norm_bias=1.0,
                   ignore_edges=set()):
    """Implementation of PinSage sampling.

    Reference paper: "Graph Convolutional Neural Networks for Web-Scale
    Recommender Systems" (the PinSage paper).

    Args:
        graph: A pgl graph instance
        nodes: nodes to start sampling from
        samples: a list with the number of random walk steps for each layer
        top_k: keep the top_k most-visited neighbors to construct the edges
        proba: the probability of returning to the origin node
        norm_bias: the bias added to the visit counts before normalization
        ignore_edges: set of edges (src, dst) to be ignored

    Return:
        A list of subgraphs
    """
start = time.time()
num_layers = len(samples)
start_nodes = nodes
edges, weights = [], []
layer_nodes, layer_edges, layer_weights = [], [], []
ignore_edge_set = set([edge_hash(src, dst) for src, dst in ignore_edges])
for layer_idx in reversed(range(num_layers)):
if len(start_nodes) == 0:
layer_nodes = [nodes] + layer_nodes
layer_edges = [edges] + layer_edges
            layer_weights = [weights] + layer_weights
continue
walks = random_walk_with_start_prob(
graph, start_nodes, samples[layer_idx], proba=proba)
walks = [walk[1:] for walk in walks]
pred_edges = []
pred_weights = []
pred_nodes = []
for node, walk in zip(start_nodes, walks):
walk_nodes = []
walk_weights = []
count_sum = 0
for random_walk_node in walk:
if len(ignore_edge_set) > 0 and random_walk_node != node and \
edge_hash(random_walk_node, node) in ignore_edge_set:
continue
walk_nodes.append(random_walk_node)
unique, counts = np.unique(walk_nodes, return_counts=True)
frequencies = np.asarray((unique, counts)).T
frequencies = frequencies[np.argsort(frequencies[:, 1])]
frequencies = frequencies[-1 * top_k:, :]
for random_walk_node, random_count in zip(
frequencies[:, 0].tolist(), frequencies[:, 1].tolist()):
pred_nodes.append(random_walk_node)
pred_edges.append((random_walk_node, node))
walk_weights.append(random_count)
count_sum += random_count
            count_sum += len(walk_weights) * norm_bias
            walk_weights = (np.array(walk_weights) + norm_bias) / count_sum
pred_weights.extend(walk_weights.tolist())
last_node_set = set(nodes)
nodes, edges, weights = flat_node_and_edge([nodes, pred_nodes], \
[edges, pred_edges], [weights, pred_weights])
layer_edges = [edges] + layer_edges
layer_weights = [weights] + layer_weights
layer_nodes = [nodes] + layer_nodes
start_nodes = list(set(nodes) - last_node_set)
start = time.time()
feed_dict = {}
subgraphs = []
for i in range(num_layers):
edge_feat_dict = {
"weight": np.array(
layer_weights[i], dtype='float32')
}
subgraphs.append(
graph.subgraph(
nodes=layer_nodes[0],
edges=layer_edges[i],
edge_feats=edge_feat_dict))
subgraphs[i].node_feat["index"] = np.array(
layer_nodes[0], dtype="int64")
return subgraphs
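And a matching sketch for the sampler itself; the graph, start nodes, and layer sizes below are made-up values:

import numpy as np
from pgl.graph import Graph
from pgl.sample import pinsage_sample

graph = Graph(
    num_nodes=5,
    edges=np.array(
        [(0, 2), (1, 2), (2, 3), (3, 4), (4, 0)], dtype="int64"))

# two layers of 8 and 4 walk steps; keep the 3 most-visited
# neighbors of each start node as weighted edges
subgraphs = pinsage_sample(graph, nodes=[0, 1], samples=[8, 4], top_k=3)
for sg in subgraphs:
    # visit-count based weights land in the subgraph's edge features
    print(sg.num_nodes, sg.edge_feat["weight"])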
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test_hetergraph"""
import time
import unittest
import json
import os
import numpy as np
from pgl.sample import metapath_randomwalk
from pgl.graph import Graph
from pgl import heter_graph
class HeterGraphTest(unittest.TestCase):
"""HeterGraph test
"""
@classmethod
def setUpClass(cls):
np.random.seed(1)
edges = {}
# edges chosen so that some nodes have no successor (to test that case)
edges['c2p'] = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
(3, 7), (3, 4), (3, 8)]
edges['p2c'] = [(v, u) for u, v in edges['c2p']]
edges['p2a'] = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12),
(6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
edges['a2p'] = [(v, u) for u, v in edges['p2a']]
# for speed testing
# edges['c2p'] = [(0, 4), (0, 5), (1, 9), (1,8), (2,8), (2,5), (3,6), (3,7), (3,4), (3,8)]
# edges['p2c'] = [(v,u) for u, v in edges['c2p']]
# edges['p2a'] = [(4,10), (4,11), (4,12), (4,14), (5,13), (6,13), (6,11), (6,14), (7,12), (7,11), (8,14), (9,13)]
# edges['a2p'] = [(v,u) for u, v in edges['p2a']]
node_types = ['c' for _ in range(4)] + ['p' for _ in range(6)
] + ['a' for _ in range(5)]
node_types = [(i, t) for i, t in enumerate(node_types)]
cls.graph = heter_graph.HeterGraph(
num_nodes=len(node_types), edges=edges, node_types=node_types)
def test_num_nodes_by_type(self):
print()
n_types = {'c': 4, 'p': 6, 'a': 5}
for nt in n_types:
num_nodes = self.graph.num_nodes_by_type(nt)
self.assertEqual(num_nodes, n_types[nt])
def test_node_batch_iter(self):
print()
batch_size = 2
ground = [[4, 5], [6, 7], [8, 9]]
for idx, nodes in enumerate(
self.graph.node_batch_iter(
batch_size=batch_size, shuffle=False, n_type='p')):
self.assertEqual(len(nodes), batch_size)
self.assertListEqual(list(nodes), ground[idx])
def test_sample_nodes(self):
print()
p_ground = [4, 5, 6, 7, 8, 9]
sample_num = 10
nodes = self.graph.sample_nodes(sample_num=sample_num, n_type='p')
self.assertEqual(len(nodes), sample_num)
for n in nodes:
self.assertIn(n, p_ground)
# test n_type == None
ground = [i for i in range(15)]
nodes = self.graph.sample_nodes(sample_num=sample_num, n_type=None)
self.assertEqual(len(nodes), sample_num)
for n in nodes:
self.assertIn(n, ground)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test_metapath_randomwalk"""
import time
import unittest
import json
import os
import numpy as np
from pgl.sample import metapath_randomwalk
from pgl.graph import Graph
from pgl import heter_graph
np.random.seed(1)
class MetapathRandomwalkTest(unittest.TestCase):
"""metapath_randomwalk test
"""
def setUp(self):
edges = {}
# edges chosen so that some nodes have no successor (to test that case)
edges['c2p'] = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6),
(3, 7), (3, 4), (3, 8)]
edges['p2c'] = [(v, u) for u, v in edges['c2p']]
edges['p2a'] = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12),
(6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)]
edges['a2p'] = [(v, u) for u, v in edges['p2a']]
# for speed testing
# edges['c2p'] = [(0, 4), (0, 5), (1, 9), (1,8), (2,8), (2,5), (3,6), (3,7), (3,4), (3,8)]
# edges['p2c'] = [(v,u) for u, v in edges['c2p']]
# edges['p2a'] = [(4,10), (4,11), (4,12), (4,14), (5,13), (6,13), (6,11), (6,14), (7,12), (7,11), (8,14), (9,13)]
# edges['a2p'] = [(v,u) for u, v in edges['p2a']]
self.node_types = ['c' for _ in range(4)] + [
'p' for _ in range(6)
] + ['a' for _ in range(5)]
node_types = [(i, t) for i, t in enumerate(self.node_types)]
self.graph = heter_graph.HeterGraph(
num_nodes=len(node_types), edges=edges, node_types=node_types)
def test_metapath_randomwalk(self):
meta_path = 'c2p-p2a-a2p-p2c'
path = ['c', 'p', 'a', 'p', 'c']
start_nodes = [0, 1, 2, 3]
walk_len = 10
walks = metapath_randomwalk(
graph=self.graph,
start_nodes=start_nodes,
metapath=meta_path,
walk_length=walk_len)
self.assertEqual(len(walks), 4)
for walk in walks:
for i in range(len(walk)):
idx = i % (len(path) - 1)
self.assertEqual(self.node_types[walk[i]], path[idx])
if __name__ == "__main__":
unittest.main()
...@@ -25,6 +25,8 @@ except: ...@@ -25,6 +25,8 @@ except:
import numpy as np import numpy as np
import time import time
import paddle.fluid as fluid import paddle.fluid as fluid
from queue import Queue
import threading
def serialize_data(data): def serialize_data(data):
...@@ -129,22 +131,39 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000, pipe_size=10): ...@@ -129,22 +131,39 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000, pipe_size=10):
p.start() p.start()
reader_num = len(readers) reader_num = len(readers)
finish_num = 0
conn_to_remove = [] conn_to_remove = []
finish_flag = np.zeros(len(conns), dtype="int32") finish_flag = np.zeros(len(conns), dtype="int32")
start = time.time()
def queue_worker(sub_conn, que):
while True:
buff = sub_conn.recv()
sample = deserialize_data(buff)
if sample is None:
que.put(None)
sub_conn.close()
break
que.put(sample)
thread_pool = []
output_queue = Queue(maxsize=reader_num)
for i in range(reader_num):
t = threading.Thread(
target=queue_worker, args=(conns[i], output_queue))
t.daemon = True
t.start()
thread_pool.append(t)
finish_num = 0
while finish_num < reader_num: while finish_num < reader_num:
for conn_id, conn in enumerate(conns): sample = output_queue.get()
if finish_flag[conn_id] > 0: if sample is None:
continue finish_num += 1
if conn.poll(0.01): else:
buff = conn.recv() yield sample
sample = deserialize_data(buff)
if sample is None: for thread in thread_pool:
finish_num += 1 thread.join()
conn.close()
finish_flag[conn_id] = 1
else:
yield sample
if use_pipe: if use_pipe:
return pipe_reader return pipe_reader
......
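A rough sketch of how the reworked reader is consumed; the toy generators below stand in for real sample readers:

from pgl.utils.mp_reader import multiprocess_reader

def make_reader(offset):
    """build a toy reader; real readers would yield feed dicts"""
    def reader():
        for i in range(5):
            yield {"value": offset + i}
    return reader

# each child process pushes into one shared queue through its own
# thread, so the consumer no longer round-robin polls every pipe
reader = multiprocess_reader(
    [make_reader(0), make_reader(100)], use_pipe=True)
for sample in reader():
    print(sample)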