diff --git a/ogb_examples/nodeproppred/ogbn-mag/README.md b/ogb_examples/nodeproppred/ogbn-mag/README.md index fad88db7763c9ae4786f38832ca9441f9f137bc4..ade999f91d93934b9f7d7c0f2395abba8a656d24 100644 --- a/ogb_examples/nodeproppred/ogbn-mag/README.md +++ b/ogb_examples/nodeproppred/ogbn-mag/README.md @@ -23,7 +23,7 @@ In this repo, we use RGCN to deal with the ogbn-mag dataset. ogbn-mag dataset is - test_batch_size: batch_size in the test phase ### Proformance -We evaulate 8 times on the ogbn-mag dataset. Here is the result. +We evaulate 10 times on the ogbn-mag dataset. Here is the result. Dataset| Accuracy| std| --|--|--| -ogbn-mag | 0.4727 | 0.0031 | +ogbn-mag | 0.4734 | 0.0030 | diff --git a/ogb_examples/nodeproppred/ogbn-mag/main.py b/ogb_examples/nodeproppred/ogbn-mag/main.py index 2125061aa08f5b8317ae7e06d307f06b64fcd0ff..423374daadab117246280ba98ec4b32f8ac0e95a 100644 --- a/ogb_examples/nodeproppred/ogbn-mag/main.py +++ b/ogb_examples/nodeproppred/ogbn-mag/main.py @@ -15,14 +15,16 @@ import os import argparse import copy -import numpy as np import pgl -import paddle.fluid as fluid +import numpy as np +import paddle.fluid as fluid +from collections import OrderedDict from paddle.fluid.contrib import summary from pgl.utils.logger import log from pgl.utils.share_numpy import ToShareMemGraph -from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset +#from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset +from ogb.nodeproppred import NodePropPredDataset, Evaluator from rgcn import RGCNModel, cross_entropy_loss from dataloader import sample_loader @@ -49,40 +51,87 @@ def run_epoch(args, exe, fetch_list, homograph, hetergraph, gw, train_program, for epoch in range(args.epoch): for phase in ['train', 'valid', 'test']: running_loss = [] - running_acc = [] + predict = [] + label = [] for feed_dict in sample_loader( args, phase, homograph, hetergraph, gw, split_real_idx[phase]['paper'], 
def ogb2pgl_hetergraph(graph):
    """Convert an OGB-style heterogeneous graph dict into a PGL HeterGraph.

    Each node type gets a contiguous block of global node ids, assigned in
    the iteration order of ``graph["num_nodes_dict"]``. The per-type edge
    indices are shifted into that global id space and a reversed copy of
    every relation is added.

    Args:
        graph: dict providing
            ``"num_nodes_dict"``: node type name -> number of nodes;
            ``"edge_index_dict"``: relation key -> integer array indexed as
            ``[0, :]`` (source ids) and ``[1, :]`` (target ids), where
            ``key[0]`` is the source node type and ``key[2]`` the target
            node type;
            ``"edge_feat_dict"`` and ``"node_feat_dict"``: attached to the
            result unchanged.

    Returns:
        A ``pgl.heter_graph.HeterGraph`` that additionally carries
        ``edge_feat_dict``, ``node_feat_dict`` and ``num_node_dict`` (an
        ``OrderedDict`` mapping node type -> ``(count, global id offset)``).

    Raises:
        KeyError: if a required key is missing from ``graph`` or a relation
            refers to a node type absent from ``"num_nodes_dict"``.
    """
    node_index = OrderedDict()
    type_columns = []
    num_nodes = 0
    for type_id, (node_type, count) in enumerate(
            graph["num_nodes_dict"].items()):
        # One column of `type_id` per node of this type.
        type_columns.append(np.full((count, 1), type_id, dtype='int64'))
        node_index[node_type] = (count, num_nodes)
        num_nodes += count
    node_types = np.vstack(type_columns)

    edges_by_types = {}
    for relation, edge_index in graph["edge_index_dict"].items():
        src_type, dst_type = relation[0], relation[2]

        # Shift a copy: modifying the caller's array in place would corrupt
        # the dataset and double-shift the ids if the same dict is converted
        # a second time.
        forward = np.array(edge_index)
        forward[0, :] += node_index[src_type][1]
        forward[1, :] += node_index[dst_type][1]
        # Row swap via integer-array indexing yields an independent copy.
        backward = forward[[1, 0]]

        # NOTE(review): edge type names use only the first character of each
        # node type name, so node types sharing an initial (or several
        # relations between the same pair of types) overwrite each other --
        # confirm this is acceptable for the datasets in use.
        forward_name = "{}2{}".format(src_type[0], dst_type[0])
        if src_type != dst_type:
            backward_name = "{}2{}".format(dst_type[0], src_type[0])
            edges_by_types[forward_name] = forward.T
            edges_by_types[backward_name] = backward.T
        else:
            # Same type on both ends: store both directions under one name.
            edges_by_types[forward_name] = np.hstack((forward, backward)).T

    node_features = {
        'index': np.arange(num_nodes, dtype=np.int64).reshape(-1, 1)
    }
    g = pgl.heter_graph.HeterGraph(
        num_nodes=num_nodes,
        edges=edges_by_types,
        node_types=node_types,
        node_feat=node_features)
    g.edge_feat_dict = graph['edge_feat_dict']
    g.node_feat_dict = graph['node_feat_dict']
    g.num_node_dict = node_index
    return g
self.num_classes = int(self.meta_info[self.name]["num classes"]) self.is_hetero = self.meta_info[self.name]["is hetero"] super(PglNodePropPredDataset, self).__init__() diff --git a/pgl/graph.py b/pgl/graph.py index 493f8d5358c62bca771f1878a48cf33c48894c4d..f3a1b803e2832ab1ba98758d49eab61bf6320feb 100644 --- a/pgl/graph.py +++ b/pgl/graph.py @@ -177,6 +177,8 @@ class Graph(object): os.makedirs(path) np.save(os.path.join(path, 'num_nodes.npy'), self._num_nodes) np.save(os.path.join(path, 'edges.npy'), self._edges) + np.save(os.path.join(path, 'num_graph.npy'), self._num_graph) + np.save(os.path.join(path, 'graph_lod.npy'), self._graph_lod) if self._adj_src_index: self._adj_src_index.dump(os.path.join(path, 'adj_src')) @@ -201,11 +203,14 @@ class Graph(object): """ load graph from dumped files. """ if not os.path.exists(path): - raise ValueError("Not find path {}, can't load graph".format(path)) + raise ValueError("Can't find path {}, stop loading graph!".format( + path)) self._num_nodes = np.load(os.path.join(path, 'num_nodes.npy')) self._edges = np.load( os.path.join(path, 'edges.npy'), mmap_mode=mmap_mode) + self._num_graph = np.load(os.path.join(path, 'num_graph.npy')) + self._graph_lod = np.load(os.path.join(path, 'graph_lod.npy')) if os.path.isdir(os.path.join(path, 'adj_src')): edge_index = EdgeIndex() edge_index.load(os.path.join(path, 'adj_src'), mmap_mode=mmap_mode)