From bcbed7f48b4031f07cf7d6bf9484575ece8d2a37 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 14 Jul 2020 10:30:04 +0800 Subject: [PATCH] split code --- ogb_examples/nodeproppred/ogbn-mag/README.md | 29 ++ .../nodeproppred/ogbn-mag/dataloader.py | 61 +++ ogb_examples/nodeproppred/ogbn-mag/main.py | 376 ++++++------------ ogb_examples/nodeproppred/ogbn-mag/rgcn.py | 84 ++-- ogb_examples/nodeproppred/ogbn-mag/sample.py | 97 +++++ pgl/sample.py | 32 +- 6 files changed, 384 insertions(+), 295 deletions(-) create mode 100644 ogb_examples/nodeproppred/ogbn-mag/README.md create mode 100644 ogb_examples/nodeproppred/ogbn-mag/dataloader.py create mode 100644 ogb_examples/nodeproppred/ogbn-mag/sample.py diff --git a/ogb_examples/nodeproppred/ogbn-mag/README.md b/ogb_examples/nodeproppred/ogbn-mag/README.md new file mode 100644 index 0000000..fad88db --- /dev/null +++ b/ogb_examples/nodeproppred/ogbn-mag/README.md @@ -0,0 +1,29 @@ +# Relational Graph Convolutional Neural Network +RGCN shows that the GCN framework can be applied to modeling relational data in knowledge bases. To learn more about the study of RGCN, see [Modeling Relational Data with Graph Convolutional Networks](https://arxiv.org/pdf/1703.06103.pdf) for more details. + +### Datasets +In this repo, we use RGCN to deal with the ogbn-mag dataset. The ogbn-mag dataset is a heterogeneous network composed of a subset of the Microsoft Academic Graph. In addition, we adopt the GraphSAINT sampler in the training phase. + +### Dependencies +- paddlepaddle>=1.7 +- pgl>=1.1 +- ogb>=1.2.0 + +### How to run +> CUDA_VISIBLE_DEVICES=0 python main.py --use_cuda + +### Hyperparameters +- epoch: Number of epochs, default (40) +- use_cuda: Use gpu if use_cuda is assigned. +- sample_workers: The number of workers for multiprocessing subgraph sample. +- lr: Learning rate. +- batch_size: Batch size. +- hidden_size: The hidden size of the RGCN models. 
+- test_samples: sample nums of each layer +- test_batch_size: batch_size in the test phase + +### Performance +We evaluate 8 times on the ogbn-mag dataset. Here is the result. +Dataset| Accuracy| std| +--|--|--| +ogbn-mag | 0.4727 | 0.0031 | diff --git a/ogb_examples/nodeproppred/ogbn-mag/dataloader.py b/ogb_examples/nodeproppred/ogbn-mag/dataloader.py new file mode 100644 index 0000000..7c20e95 --- /dev/null +++ b/ogb_examples/nodeproppred/ogbn-mag/dataloader.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +from pgl.utils.mp_mapper import mp_reader_mapper +from sample import graph_saint_hetero, k_hop_sampler + + +def dataloader(source_node, label, batch_size=1024): + index = np.arange(len(source_node)) + np.random.shuffle(index) + + def loader(): + start = 0 + while start < len(source_node): + end = min(start + batch_size, len(source_node)) + yield source_node[index[start:end]], label[index[start:end]] + start = end + + return loader + + +def sample_loader(args, phase, homograph, hetergraph, gw, source_node, label): + if phase == 'train': + sample_func = graph_saint_hetero + batch_size = args.batch_size + # max_depth for deepwalk + other_args = len(args.test_samples) + else: + sample_func = k_hop_sampler + batch_size = args.test_batch_size + # sample num for k_hop + other_args = args.test_samples + + def map_fun(node_label): + node, label = node_label + subgraph, train_index, sample_nodes, train_label = sample_func( + homograph, hetergraph, node, other_args) + feed_dict = gw.to_feed(subgraph) + feed_dict['label'] = label if train_label is None else train_label + feed_dict['train_index'] = train_index + feed_dict['sub_node_index'] = sample_nodes + return feed_dict + + loader = dataloader(source_node, label, batch_size) + reader = mp_reader_mapper( + loader, func=map_fun, num_works=args.sample_workers) + + for feed_dict in reader(): + yield feed_dict diff --git a/ogb_examples/nodeproppred/ogbn-mag/main.py b/ogb_examples/nodeproppred/ogbn-mag/main.py index 291cc90..2125061 100644 --- a/ogb_examples/nodeproppred/ogbn-mag/main.py +++ b/ogb_examples/nodeproppred/ogbn-mag/main.py @@ -13,241 +13,77 @@ # limitations under the License. 
import os -os.environ['CUDA_VISIBLE_DEVICES'] = '3' -os.environ['CPU_NUM'] = str(20) -import numpy as np +import argparse import copy -import paddle -import paddle.fluid as fluid +import numpy as np import pgl +import paddle.fluid as fluid -#from pgl.sample import graph_saint_random_walk_sample -from pgl.sample import deepwalk_sample -from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset -from rgcn import RGCNModel, softmax_loss, paper_mask -from pgl.utils.mp_mapper import mp_reader_mapper +from paddle.fluid.contrib import summary +from pgl.utils.logger import log from pgl.utils.share_numpy import ToShareMemGraph +from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset + +from rgcn import RGCNModel, cross_entropy_loss +from dataloader import sample_loader def hetero2homo(heterograph): edge = [] for edge_type in heterograph.edge_types_info(): - print('edge_type: ', edge_type, ' shape of edges : ', - heterograph[edge_type].edges.shape) + log.info('edge_type: {}, shape of edges : {}'.format( + edge_type, heterograph[edge_type].edges.shape)) edge.append(heterograph[edge_type].edges) edges = np.vstack(edge) g = pgl.graph.Graph(num_nodes=heterograph.num_nodes, edges=edges) - print('homo graph nodes ', g.num_nodes) - print('homo graph edges ', g.num_edges) + log.info('homo graph nodes %d' % g.num_nodes) + log.info('homo graph edges %d' % g.num_edges) g.outdegree() ToShareMemGraph(g) return g -def extract_edges_from_nodes(hetergraph, sample_nodes): - eids = {} - for key in hetergraph.edge_types_info(): - graph = hetergraph[key] - eids[key] = pgl.graph_kernel.extract_edges_from_nodes( - graph.adj_src_index._indptr, graph.adj_src_index._sorted_v, - graph.adj_src_index._sorted_eid, sample_nodes) - return eids - - -def graph_saint_random_walk_sample(graph, - hetergraph, - nodes, - max_depth, - alias_name=None, - events_name=None): - """Implement of graph saint random walk sample. 
- - First, this function will get random walks path for given nodes and depth. - Then, it will create subgraph from all sampled nodes. - - Reference Paper: https://arxiv.org/abs/1907.04931 - - Args: - graph: A pgl graph instance - nodes: Walk starting from nodes - max_depth: Max walking depth - - Return: - a subgraph of sampled nodes. - """ - # the seed of multiprocess for numpy should be reset. - np.random.seed() - graph.outdegree() - # try sample from random nodes - # nodes=np.random.choice(np.arange(graph.num_nodes, dtype='int64'), size=len(nodes), replace=False) - nodes = np.random.choice( - np.arange( - graph.num_nodes, dtype='int64'), size=20000, replace=False) - walks = deepwalk_sample(graph, nodes, max_depth, alias_name, events_name) - sample_nodes = [] - for walk in walks: - sample_nodes.extend(walk) - #print("length of sample_nodes ", len(sample_nodes)) - sample_nodes = np.unique(sample_nodes) - #print("length of unique sample_nodes ", len(sample_nodes)) - - eids = extract_edges_from_nodes(hetergraph, sample_nodes) - subgraph = hetergraph.subgraph( - nodes=sample_nodes, eid=eids, with_node_feat=True, with_edge_feat=True) - #subgraph.node_feat["index"] = np.array(sample_nodes, dtype="int64") - all_label = graph._node_feat['train_label'][sample_nodes] - train_index = np.where(all_label > -1)[0] - train_label = all_label[train_index] - #print("sample", train_index.shape) - #print("sample", train_label.shape) - return subgraph, sample_nodes, train_index, train_label - - -def graph_saint_hetero(graph, hetergraph, batch_nodes, max_depth=2): - subgraph, sample_nodes, train_index, train_label = graph_saint_random_walk_sample( - graph, hetergraph, batch_nodes, max_depth) - # train_index = subgraph.reindex_from_parrent_nodes(batch_nodes) - return subgraph, train_index, sample_nodes, train_label - - -def traverse(item): - """traverse - """ - if isinstance(item, list) or isinstance(item, np.ndarray): - for i in iter(item): - for j in traverse(i): - yield j - else: - 
yield item - - -def flat_node_and_edge(nodes): - """flat_node_and_edge - """ - nodes = list(set(traverse(nodes))) - return nodes - - -def k_hop_sampler(graph, hetergraph, batch_nodes, samples=[30, 30]): - # for batch_train_samples, batch_train_labels in batch_info: - np.random.seed() - start_nodes = copy.deepcopy(batch_nodes) - nodes = start_nodes - edges = [] - for max_deg in samples: - pred_nodes = graph.sample_predecessor(start_nodes, max_degree=max_deg) - - for dst_node, src_nodes in zip(start_nodes, pred_nodes): - for src_node in src_nodes: - edges.append((src_node, dst_node)) - - last_nodes = nodes - nodes = [nodes, pred_nodes] - nodes = flat_node_and_edge(nodes) - # Find new nodes - start_nodes = list(set(nodes) - set(last_nodes)) - if len(start_nodes) == 0: - break - nodes = np.unique(np.array(nodes, dtype='int64')) - eids = extract_edges_from_nodes(hetergraph, nodes) - - subgraph = hetergraph.subgraph( - nodes=nodes, eid=eids, with_node_feat=True, with_edge_feat=True) - #sub_node_index = subgraph.reindex_from_parrent_nodes(batch_nodes) - train_index = subgraph.reindex_from_parrent_nodes(batch_nodes) - - return subgraph, train_index, np.array(nodes, dtype='int64'), None - - -def dataloader(source_node, label, batch_size=1024): - index = np.arange(len(source_node)) - np.random.shuffle(index) - - def loader(): - start = 0 - while start < len(source_node): - end = min(start + batch_size, len(source_node)) - yield source_node[index[start:end]], label[index[start:end]] - start = end - - return loader - - -def sample_loader(phase, homograph, hetergraph, gw, source_node, label): - #print(source_node) - #print(label) - if phase == 'train': - sample_func = graph_saint_hetero - batch_size = 20000 - else: - sample_func = k_hop_sampler - batch_size = 512 - - def map_fun(node_label): - node, label = node_label - subgraph, train_index, sample_nodes, train_label = sample_func( - homograph, hetergraph, node) - #print(train_index.shape) - #print(sample_nodes.shape) - 
#print(sum(subgraph['p2p'].edges[:,0] * subgraph['p2p'].edges[:, 1] == 0) /len(subgraph['p2p'].edges) ) - feed_dict = gw.to_feed(subgraph) - feed_dict['label'] = label if train_label is None else train_label - feed_dict['train_index'] = train_index - feed_dict['sub_node_index'] = sample_nodes - return feed_dict - - loader = dataloader(source_node, label, batch_size) - reader = mp_reader_mapper(loader, func=map_fun, num_works=6) - - for feed_dict in reader(): - yield feed_dict - - -def run_epoch(exe, loss, acc, homograph, hetergraph, gw, train_program, +def run_epoch(args, exe, fetch_list, homograph, hetergraph, gw, train_program, test_program, all_label, split_idx, split_real_idx): - best_acc = 1.0 - for epoch in range(1000): + best_acc = 0.0 + for epoch in range(args.epoch): for phase in ['train', 'valid', 'test']: - # if phase == 'train': - # continue running_loss = [] running_acc = [] for feed_dict in sample_loader( - phase, homograph, hetergraph, gw, + args, phase, homograph, hetergraph, gw, split_real_idx[phase]['paper'], all_label['paper'][split_idx[phase]['paper']]): - #print("train_shape\t", feed_dict['train_index'].shape) - #print("allnode_shape\t", feed_dict['sub_node_index'].shape) - res = exe.run( - train_program if phase == 'train' else test_program, - # test_program, - feed=feed_dict, - fetch_list=[loss.name, acc.name], - use_prune=True) + # print("train_shape\t", feed_dict['train_index'].shape) + # print("allnode_shape\t", feed_dict['sub_node_index'].shape) + res = exe.run(train_program + if phase == 'train' else test_program, + feed=feed_dict, + fetch_list=fetch_list, + use_prune=True) running_loss.append(res[0]) running_acc.append(res[1]) if phase == 'train': - print("training_acc %f" % res[1]) + log.info("training_acc %f" % res[1]) avg_loss = sum(running_loss) / len(running_loss) avg_acc = sum(running_acc) / len(running_acc) if phase == 'valid': if avg_acc > best_acc: fluid.io.save_persistables(exe, './output/checkpoint', - train_program) + 
test_program) best_acc = avg_acc - print('new best_acc %f' % best_acc) - print("%d, %s %f %f" % (epoch, phase, avg_loss, avg_acc)) + log.info('new best_acc %f' % best_acc) + log.info("%d, %s %f %f" % (epoch, phase, avg_loss, avg_acc)) -def main(): +def main(args): num_class = 349 - num_nodes = 1939743 - start_paper_index = 1203354 - hidden_size = 128 - - dataset = PglNodePropPredDataset('ogbn-mag') + embedding_size = 128 + dataset = PglNodePropPredDataset('ogbn-papers100M') g, all_label = dataset[0] + num_nodes = g.num_nodes + homograph = hetero2homo(g) for key in g.edge_types_info(): g[key].outdegree() @@ -255,43 +91,56 @@ def main(): split_idx = dataset.get_idx_split() split_real_idx = copy.deepcopy(split_idx) - - start_paper_index = g.num_node_dict['paper'][1] - # reindex the original idx of each type of node for t, idx in split_real_idx.items(): - for k, v in idx.items(): + for k in idx.keys(): split_real_idx[t][k] += g.num_node_dict[k][1] + # the num_node_dict record the node nums and the start index of each type of node. 
+ start_paper_index = g.num_node_dict['paper'][1] + end_paper_index = start_paper_index + g.num_node_dict['paper'][0] + # additional feat of paper node, and corresponding index + additional_paper_feature = g.node_feat_dict[ + 'paper'][:, :embedding_size].astype('float32') + extract_index = (np.arange(start_paper_index, + end_paper_index)).astype('int32') + + # extract the train label feat as node feat of homograph homograph._node_feat['train_label'] = -1 * np.ones( [homograph.num_nodes, 1], dtype='int64') + # copy train label to homograph node feat train_label = all_label['paper'][split_idx['train']['paper']] train_index = split_real_idx['train']['paper'] homograph._node_feat['train_label'][train_index] = train_label - place = fluid.CUDAPlace(0) - #place = fluid.CPUPlace() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() train_program = fluid.Program() startup_program = fluid.Program() test_program = fluid.Program() - additional_paper_feature = g.node_feat_dict[ - 'paper'][:, :hidden_size].astype('float32') - extact_index = (np.arange(start_paper_index, num_nodes)).astype('int32') - with fluid.program_guard(train_program, startup_program): + gw = pgl.heter_graph_wrapper.HeterGraphWrapper( + name="heter_graph", + edge_types=g.edge_types_info(), + node_feat=g.node_feat_info(), + edge_feat=g.edge_feat_info()) + + # set the paper node feature paper_feature = fluid.layers.create_parameter( shape=additional_paper_feature.shape, dtype='float32', default_initializer=fluid.initializer.NumpyArrayInitializer( additional_paper_feature), - name='paper_feature') + name="paper_feature") paper_index = fluid.layers.create_parameter( - shape=extact_index.shape, + shape=extract_index.shape, dtype='int32', default_initializer=fluid.initializer.NumpyArrayInitializer( - extact_index), - name='paper_index') + extract_index), + name="paper_index") paper_feature.stop_gradient = True paper_index.stop_gradient = True @@ -302,66 +151,101 @@ def main(): label 
= fluid.layers.data(shape=[-1], dtype="int64", name='label') label = fluid.layers.reshape(label, [-1, 1]) label.stop_gradient = True - gw = pgl.heter_graph_wrapper.HeterGraphWrapper( - name="heter_graph", - edge_types=g.edge_types_info(), - node_feat=g.node_feat_info(), - edge_feat=g.edge_feat_info()) + feat = fluid.layers.create_parameter( - shape=[num_nodes, hidden_size], dtype='float32') - # TODO: the paper feature replaced the total feat, not add + shape=[num_nodes, embedding_size], dtype='float32') + # NOTE: the paper feature replaced the total feat, not add feat = fluid.layers.scatter( feat, paper_index, paper_feature, overwrite=False) sub_node_feat = fluid.layers.gather(feat, sub_node_index) - model = RGCNModel(gw, 2, num_class, num_nodes, g.edge_types_info()) + + model = RGCNModel( + graph_wrapper=gw, + num_layers=args.num_layers, + hidden_size=args.hidden_size, + num_class=num_class, + edge_types=g.edge_types_info()) + feat = model.forward(sub_node_feat) - #feat = paper_mask(feat, gw, start_paper_index) feat = fluid.layers.gather(feat, train_index) - loss, logit, acc = softmax_loss(feat, label, num_class) - opt = fluid.optimizer.AdamOptimizer(learning_rate=0.005) + loss, acc = cross_entropy_loss(feat, label) + + opt = fluid.optimizer.Adam(learning_rate=args.lr) opt.minimize(loss) test_program = train_program.clone(for_test=True) - from paddle.fluid.contrib import summary - summary(train_program) + summary(train_program) exe = fluid.Executor(place) exe.run(startup_program) - # fluid.io.load_persistables(executor=exe, dirname='./output/checkpoint', - # main_program=train_program) - run_epoch(exe, loss, acc, homograph, g, gw, train_program, test_program, - all_label, split_idx, split_real_idx) + if args.load_pretrain: + fluid.io.load_persistables( + executor=exe, + dirname=os.path.join(args.output_path, 'checkpoint'), + main_program=test_program) + + fetch_list = [loss.name, acc.name] + run_epoch(args, exe, fetch_list, homograph, g, gw, train_program, + 
test_program, all_label, split_idx, split_real_idx) return None - feed_dict = gw.to_feed(g) - #rand_label = (np.random.rand(num_nodes - start_paper_index) > - # 0.5).astype('int64') - #feed_dict['label'] = rand_label + +def full_batch(g, gw, all_label, split_idx, split_real_idx, exe, train_program, + test_program, fetch_list): + """ The full batch verison of rgcn. No sufficient gpu memory for full batch! + """ + feed_dict = gw.to_feed(g) feed_dict['label'] = all_label['paper'][split_idx['train']['paper']] feed_dict['train_index'] = split_real_idx['train']['paper'] - #feed_dict['sub_node_index'] = np.arange(num_nodes).astype('int64') - #feed_dict['paper_index'] = extact_index - #feed_dict['paper_feature'] = additional_paper_feature + feed_dict['sub_node_index'] = np.arange(g.num_nodes).astype('int64') for epoch in range(10): feed_dict['label'] = all_label['paper'][split_idx['train']['paper']] feed_dict['train_index'] = split_real_idx['train']['paper'] - for step in range(10): - res = exe.run(train_program, - feed=feed_dict, - fetch_list=[loss.name, acc.name]) - print("%d,%d %f %f" % (epoch, step, res[0], res[1])) - #print(res[1]) + + res = exe.run(train_program, feed=feed_dict, fetch_list=fetch_list) + log.info("Train %d, %f %f" % (epoch, res[0], res[1])) feed_dict['label'] = all_label['paper'][split_idx['valid']['paper']] feed_dict['train_index'] = split_real_idx['valid']['paper'] - res = exe.run(test_program, - feed=feed_dict, - fetch_list=[loss.name, acc.name]) - print("Test %d, %f %f" % (epoch, res[0], res[1])) + res = exe.run(test_program, feed=feed_dict, fetch_list=fetch_list) + log.info("Valid %d, %f %f" % (epoch, res[0], res[1])) + + feed_dict['label'] = all_label['paper'][split_idx['test']['paper']] + feed_dict['train_index'] = split_real_idx['test']['paper'] + res = exe.run(test_program, feed=feed_dict, fetch_list=fetch_list) + log.info("Test %d, %f %f" % (epoch, res[0], res[1])) if __name__ == "__main__": - main() + parser = 
argparse.ArgumentParser(description='graphsaint with rgcn') + parser.add_argument( + "--output_path", + type=str, + default=None, + help="output path to save model") + parser.add_argument( + "--load_pretrain", action='store_true', help="load pretrained mode") + parser.add_argument("--use_cuda", action='store_true', help="use_cuda") + parser.add_argument("--sample_workers", type=int, default=6) + parser.add_argument("--epoch", type=int, default=40) + parser.add_argument("--num_layers", type=int, default=2) + parser.add_argument("--hidden_size", type=int, default=64) + parser.add_argument("--batch_size", type=int, default=20000) + parser.add_argument("--lr", type=float, default=0.005) + parser.add_argument( + "--test_batch_size", + type=int, + default=512, + help="sample nums of k-hop of test phase.") + parser.add_argument( + "--test_samples", + type=int, + nargs='+', + default=[30, 30], + help="sample nums of k-hop.") + args = parser.parse_args() + log.info(args) + main(args) diff --git a/ogb_examples/nodeproppred/ogbn-mag/rgcn.py b/ogb_examples/nodeproppred/ogbn-mag/rgcn.py index 2dbe9e3..eb89fbd 100644 --- a/ogb_examples/nodeproppred/ogbn-mag/rgcn.py +++ b/ogb_examples/nodeproppred/ogbn-mag/rgcn.py @@ -14,9 +14,9 @@ import paddle import pgl -import paddle.fluid as fluid import numpy as np -from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset +import paddle.fluid as fluid +from paddle.fluid.contrib import summary def rgcn_conv(graph_wrapper, @@ -34,14 +34,11 @@ def rgcn_conv(graph_wrapper, """ return fluid.layers.sequence_pool(feat, pool_type='average') - gw = graph_wrapper if not isinstance(edge_types, list): edge_types = [edge_types] - #output = fluid.layers.zeros((feature.shape[0], hidden_size), dtype='float32') output = None for i in range(len(edge_types)): - assert feature is not None tmp_feat = fluid.layers.fc( feature, size=hidden_size, @@ -50,8 +47,9 @@ def rgcn_conv(graph_wrapper, act=None) if output is None: output = 
fluid.layers.zeros_like(tmp_feat) - msg = gw[edge_types[i]].send(__message, nfeat_list=[('h', feature)]) - neigh_feat = gw[edge_types[i]].recv(msg, __reduce) + msg = graph_wrapper[edge_types[i]].send( + __message, nfeat_list=[('h', tmp_feat)]) + neigh_feat = graph_wrapper[edge_types[i]].recv(msg, __reduce) # The weight of FC should be the same for the same type of node # The edge type str should be `A2B`(from type A to type B) neigh_feat = fluid.layers.fc( @@ -60,24 +58,25 @@ def rgcn_conv(graph_wrapper, param_attr=fluid.ParamAttr(name='%s_edge_fc_%s' % (name, edge_types[i])), act=None) + # TODO: the tmp_feat and neigh_feat should be add togather. output = output + neigh_feat * tmp_feat - #output = fluid.layers.relu(out) + return output class RGCNModel: - def __init__(self, gw, layers, num_class, num_nodes, edge_types): - self.hidden_size = 64 - self.layers = layers - self.num_nodes = num_nodes - self.edge_types = edge_types - self.gw = gw + def __init__(self, graph_wrapper, num_layers, hidden_size, num_class, + edge_types): + self.graph_wrapper = graph_wrapper + self.num_layers = num_layers + self.hidden_size = hidden_size self.num_class = num_class + self.edge_types = edge_types def forward(self, feat): - for i in range(self.layers - 1): + for i in range(self.num_layers - 1): feat = rgcn_conv( - self.gw, + self.graph_wrapper, feat, self.hidden_size, self.edge_types, @@ -85,49 +84,44 @@ class RGCNModel: feat = fluid.layers.relu(feat) feat = fluid.layers.dropout(feat, dropout_prob=0.5) feat = rgcn_conv( - self.gw, + self.graph_wrapper, feat, self.num_class, self.edge_types, - name="rgcn_%d" % (self.layers - 1)) + name="rgcn_%d" % (self.num_layers - 1)) return feat -def softmax_loss(feat, label, class_num): - #logit = fluid.layers.fc(feat, class_num) - logit = feat +def cross_entropy_loss(logit, label): loss = fluid.layers.softmax_with_cross_entropy(logit, label) loss = fluid.layers.mean(loss) acc = fluid.layers.accuracy(fluid.layers.softmax(logit), label) - return 
loss, logit, acc - - -def paper_mask(feat, gw, start_index): - mask = fluid.layers.cast(gw[0].node_feat['index'] > start_index) - feat = fluid.layers.mask_select(feat, mask) - return feat + return loss, acc if __name__ == "__main__": - #PglNodePropPredDataset('ogbn-mag') num_nodes = 4 num_class = 2 + feat_dim = 16 + hidden_size = 16 + num_layers = 2 + node_types = [(0, 'user'), (1, 'user'), (2, 'item'), (3, 'item')] edges = { 'U2U': [(0, 1), (1, 0)], 'U2I': [(1, 2), (0, 3), (1, 3)], 'I2I': [(2, 3), (3, 2)], } - node_feat = {'feature': np.random.randn(4, 16)} + node_feat = {'feature': np.random.randn(4, feat_dim)} edges_feat = { 'U2U': { - 'h': np.random.randn(2, 16) + 'h': np.random.randn(2, feat_dim) }, 'U2I': { - 'h': np.random.randn(3, 16) + 'h': np.random.randn(3, feat_dim) }, 'I2I': { - 'h': np.random.randn(2, 16) + 'h': np.random.randn(2, feat_dim) }, } g = pgl.heter_graph.HeterGraph( @@ -136,14 +130,13 @@ if __name__ == "__main__": node_types=node_types, node_feat=node_feat, edge_feat=edges_feat) - place = fluid.CPUPlace() + train_program = fluid.Program() startup_program = fluid.Program() test_program = fluid.Program() with fluid.program_guard(train_program, startup_program): label = fluid.layers.data(shape=[-1], dtype="int64", name='label') - #label = fluid.layers.create_global_var(shape=[4], value=1, dtype="int64") label = fluid.layers.reshape(label, [-1, 1]) label.stop_gradient = True gw = pgl.heter_graph_wrapper.HeterGraphWrapper( @@ -153,23 +146,24 @@ if __name__ == "__main__": edge_feat=g.edge_feat_info()) feat = fluid.layers.create_parameter( - shape=[num_nodes, 16], dtype='float32') + shape=[num_nodes, feat_dim], dtype='float32') - model = RGCNModel(gw, 3, num_class, num_nodes, g.edge_types_info()) - feat = model.forward(feat) - loss, logit, acc = softmax_loss(feat, label, 2) - opt = fluid.optimizer.AdamOptimizer(learning_rate=0.001) + model = RGCNModel(gw, num_layers, num_class, hidden_size, + g.edge_types_info()) + logit = model.forward(feat) + 
loss, acc = cross_entropy_loss(logit, label) + opt = fluid.optimizer.SGD(learning_rate=0.1) opt.minimize(loss) - from paddle.fluid.contrib import summary - summary(train_program) + summary(train_program) + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_program) + feed_dict = gw.to_feed(g) feed_dict['label'] = np.array([1, 0, 1, 1]).astype('int64') for i in range(100): res = exe.run(train_program, feed=feed_dict, - fetch_list=[loss.name, logit.name, acc.name]) - print("%d %f %f" % (i, res[0], res[2])) - #print(res[1]) + fetch_list=[loss.name, acc.name]) + print("STEP: %d LOSS: %f ACC: %f" % (i, res[0], res[1])) diff --git a/ogb_examples/nodeproppred/ogbn-mag/sample.py b/ogb_examples/nodeproppred/ogbn-mag/sample.py new file mode 100644 index 0000000..844cbf8 --- /dev/null +++ b/ogb_examples/nodeproppred/ogbn-mag/sample.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pgl +import copy +import numpy as np + +from pgl.sample import deepwalk_sample, traverse +from pgl.sample import extract_edges_from_nodes + + +def graph_saint_random_walk_sample(graph, + hetergraph, + nodes, + max_depth, + alias_name=None, + events_name=None): + """Implement of graph saint random walk sample for hetergraph. 
+ + Args: + graph: A pgl graph instance, homograph for hetergraph + hetergraph: A pgl hetergraph instance + nodes: Walk starting from nodes + max_depth: Max walking depth + + Return: + a subgraph of sampled nodes. + """ + # the seed for numpy should be reset in multiprocessing. + np.random.seed() + graph.outdegree() + # sample random nodes + nodes = np.random.choice( + np.arange( + graph.num_nodes, dtype='int64'), size=20000, replace=False) + walks = deepwalk_sample(graph, nodes, max_depth, alias_name, events_name) + sample_nodes = [] + for walk in walks: + sample_nodes.extend(walk) + + sample_nodes = np.unique(sample_nodes) + eids = extract_edges_from_nodes(hetergraph, sample_nodes) + subgraph = hetergraph.subgraph( + nodes=sample_nodes, eid=eids, with_node_feat=True, with_edge_feat=True) + + return subgraph, sample_nodes + + +def graph_saint_hetero(graph, hetergraph, batch_nodes, max_depth=2): + subgraph, sample_nodes = graph_saint_random_walk_sample( + graph, hetergraph, batch_nodes, max_depth) + # the new index of sample_nodes is range(0, len(sample_nodes)) + all_label = graph._node_feat['train_label'][sample_nodes] + train_index = np.where(all_label > -1)[0] + train_label = all_label[train_index] + return subgraph, train_index, sample_nodes, train_label + + +def k_hop_sampler(graph, hetergraph, batch_nodes, samples=[30, 30]): + # the seed for numpy should be reset in multiprocessing. + np.random.seed() + start_nodes = copy.deepcopy(batch_nodes) + nodes = start_nodes + edges = [] + for max_deg in samples: + pred_nodes = graph.sample_predecessor(start_nodes, max_degree=max_deg) + for dst_node, src_nodes in zip(start_nodes, pred_nodes): + for src_node in src_nodes: + edges.append((src_node, dst_node)) + last_nodes = nodes + nodes = [nodes, pred_nodes] + nodes = list(set(traverse(nodes))) + # Find new nodes + start_nodes = list(set(nodes) - set(last_nodes)) + if len(start_nodes) == 0: + break + + # TODO: Only use certrain sampled edges to construct subgraph. 
+ nodes = np.unique(np.array(nodes, dtype='int64')) + eids = extract_edges_from_nodes(hetergraph, nodes) + + subgraph = hetergraph.subgraph( + nodes=nodes, eid=eids, with_node_feat=True, with_edge_feat=True) + train_index = subgraph.reindex_from_parrent_nodes(batch_nodes) + + return subgraph, train_index, nodes, None diff --git a/pgl/sample.py b/pgl/sample.py index 81241d5..689726e 100644 --- a/pgl/sample.py +++ b/pgl/sample.py @@ -479,10 +479,34 @@ def pinsage_sample(graph, def extract_edges_from_nodes(graph, sample_nodes): - eids = graph_kernel.extract_edges_from_nodes( - graph.adj_src_index._indptr, graph.adj_src_index._sorted_v, - graph.adj_src_index._sorted_eid, sample_nodes) - return eids + """ Tool for extracting all edges of certain nodes. + + Note: Before calling this method, please build adj_src_index first. + + Args: + graph: A pgl graph instance or hetergraph instance + sample_nodes: the given nodes + + Return: + eids array for graph instance. + or a dict of eids array for hetergraph instance. + """ + sample_nodes = np.array(sample_nodes, dtype='int64') + if isinstance(graph, pgl.graph.Graph): + eids = graph_kernel.extract_edges_from_nodes( + graph.adj_src_index._indptr, graph.adj_src_index._sorted_v, + graph.adj_src_index._sorted_eid, sample_nodes) + return eids + elif isinstance(graph, pgl.heter_graph.HeterGraph): + eids = {} + for key in graph.edge_types_info(): + eids[key] = pgl.graph_kernel.extract_edges_from_nodes( + graph[key].adj_src_index._indptr, + graph[key].adj_src_index._sorted_v, + graph[key].adj_src_index._sorted_eid, sample_nodes) + return eids + else: + raise ValueError("Please pass a graph instance to this function!") def graph_saint_random_walk_sample(graph, -- GitLab