From 4a3aff8848e9f9b9a1d686d80dc0d9030a5a3ca5 Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Fri, 17 Jul 2020 13:59:41 +0800
Subject: [PATCH] refine

---
 ogb_examples/nodeproppred/ogbn-mag/README.md |  4 +-
 ogb_examples/nodeproppred/ogbn-mag/main.py   | 80 ++++++++++++++++----
 pgl/contrib/ogb/nodeproppred/dataset_pgl.py  |  2 +-
 pgl/graph.py                                 |  7 +-
 4 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/ogb_examples/nodeproppred/ogbn-mag/README.md b/ogb_examples/nodeproppred/ogbn-mag/README.md
index fad88db..ade999f 100644
--- a/ogb_examples/nodeproppred/ogbn-mag/README.md
+++ b/ogb_examples/nodeproppred/ogbn-mag/README.md
@@ -23,7 +23,7 @@ In this repo, we use RGCN to deal with the ogbn-mag dataset. ogbn-mag dataset is
 - test_batch_size: batch_size in the test phase
 
 ### Proformance
-We evaulate 8 times on the ogbn-mag dataset. Here is the result.
+We evaluate 10 times on the ogbn-mag dataset. Here is the result.
 Dataset| Accuracy| std|
 --|--|--|
-ogbn-mag | 0.4727 | 0.0031 |
+ogbn-mag | 0.4734 | 0.0030 |
diff --git a/ogb_examples/nodeproppred/ogbn-mag/main.py b/ogb_examples/nodeproppred/ogbn-mag/main.py
index 2125061..423374d 100644
--- a/ogb_examples/nodeproppred/ogbn-mag/main.py
+++ b/ogb_examples/nodeproppred/ogbn-mag/main.py
@@ -15,14 +15,16 @@
 import os
 import argparse
 import copy
-import numpy as np
 import pgl
-import paddle.fluid as fluid
+import numpy as np
+import paddle.fluid as fluid
+from collections import OrderedDict
 from paddle.fluid.contrib import summary
 from pgl.utils.logger import log
 from pgl.utils.share_numpy import ToShareMemGraph
-from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
+#from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
+from ogb.nodeproppred import NodePropPredDataset, Evaluator
 
 from rgcn import RGCNModel, cross_entropy_loss
 from dataloader import sample_loader
 
@@ -49,40 +51,87 @@ def run_epoch(args, exe, fetch_list, homograph, hetergraph, gw, train_program,
     for epoch in range(args.epoch):
         for phase in ['train', 'valid', 'test']:
             running_loss = []
-            running_acc = []
+            predict = []
+            label = []
             for feed_dict in sample_loader(
                     args, phase, homograph, hetergraph, gw,
                     split_real_idx[phase]['paper'],
                     all_label['paper'][split_idx[phase]['paper']]):
-                # print("train_shape\t", feed_dict['train_index'].shape)
-                # print("allnode_shape\t", feed_dict['sub_node_index'].shape)
                 res = exe.run(train_program
                               if phase == 'train' else test_program,
                               feed=feed_dict,
                               fetch_list=fetch_list,
                               use_prune=True)
                 running_loss.append(res[0])
-                running_acc.append(res[1])
                 if phase == 'train':
                     log.info("training_acc %f" % res[1])
+                predict.append(res[2].reshape(-1, 1))
+                label.append(feed_dict["label"])
             avg_loss = sum(running_loss) / len(running_loss)
-            avg_acc = sum(running_acc) / len(running_acc)
+            predict = np.vstack(predict)
+            label = np.vstack(label)
+            evaluator = Evaluator(name="ogbn-mag")
+            input_dict = {"y_true": label, "y_pred": predict}
+            result_dict = evaluator.eval(input_dict)
 
             if phase == 'valid':
-                if avg_acc > best_acc:
+                if result_dict['acc'] > best_acc:
                     fluid.io.save_persistables(exe, './output/checkpoint',
                                                test_program)
-                    best_acc = avg_acc
+                    best_acc = result_dict['acc']
                     log.info('new best_acc %f' % best_acc)
-            log.info("%d, %s %f %f" % (epoch, phase, avg_loss, avg_acc))
+            log.info("%d, %s %f %f" %
+                     (epoch, phase, avg_loss, result_dict['acc']))
+
+
+def ogb2pgl_hetergraph(graph):
+    node_index = OrderedDict()
+    node_types = []
+    num_nodes = 0
+    for k, v in graph["num_nodes_dict"].items():
+        node_types.append(
+            np.ones(
+                shape=[v, 1], dtype='int64') * len(node_index))
+        node_index[k] = (v, num_nodes)
+        num_nodes += v
+    # logger.info(node_index)
+    node_types = np.vstack(node_types)
+    edges_by_types = {}
+    for k, v in graph["edge_index_dict"].items():
+        v[0, :] += node_index[k[0]][1]
+        v[1, :] += node_index[k[2]][1]
+        inverse_v = np.array(v)
+        inverse_v[0, :] = v[1, :]
+        inverse_v[1, :] = v[0, :]
+        if k[0] != k[2]:
+            edges_by_types["{}2{}".format(k[0][0], k[2][0])] = v.T
+            edges_by_types["{}2{}".format(k[2][0], k[0][0])] = inverse_v.T
+        else:
+            edges = np.hstack((v, inverse_v))
+            edges_by_types["{}2{}".format(k[0][0], k[2][0])] = edges.T
+
+    node_features = {
+        'index':
+        np.array([i for i in range(num_nodes)]).reshape(-1, 1).astype(np.int64)
+    }
+    g = pgl.heter_graph.HeterGraph(
+        num_nodes=num_nodes,
+        edges=edges_by_types,
+        node_types=node_types,
+        node_feat=node_features)
+    g.edge_feat_dict = graph['edge_feat_dict']
+    g.node_feat_dict = graph['node_feat_dict']
+    g.num_node_dict = node_index
+    return g
 
 
 def main(args):
-    num_class = 349
     embedding_size = 128
-    dataset = PglNodePropPredDataset('ogbn-papers100M')
+    dataset = NodePropPredDataset('ogbn-mag')
     g, all_label = dataset[0]
+    g = ogb2pgl_hetergraph(g)
     num_nodes = g.num_nodes
+    num_class = dataset.num_classes
 
     homograph = hetero2homo(g)
     for key in g.edge_types_info():
@@ -169,6 +218,7 @@ def main(args):
         feat = model.forward(sub_node_feat)
         feat = fluid.layers.gather(feat, train_index)
         loss, acc = cross_entropy_loss(feat, label)
+        predict = fluid.layers.argmax(feat, -1)
 
         opt = fluid.optimizer.Adam(learning_rate=args.lr)
         opt.minimize(loss)
@@ -185,12 +235,10 @@ def main(args):
         dirname=os.path.join(args.output_path, 'checkpoint'),
         main_program=test_program)
 
-    fetch_list = [loss.name, acc.name]
+    fetch_list = [loss.name, acc.name, predict.name]
     run_epoch(args, exe, fetch_list, homograph, g, gw, train_program,
               test_program, all_label, split_idx, split_real_idx)
 
-    return None
-
 
 def full_batch(g, gw, all_label, split_idx, split_real_idx, exe,
                train_program, test_program, fetch_list):
diff --git a/pgl/contrib/ogb/nodeproppred/dataset_pgl.py b/pgl/contrib/ogb/nodeproppred/dataset_pgl.py
index e6405df..be984ef 100644
--- a/pgl/contrib/ogb/nodeproppred/dataset_pgl.py
+++ b/pgl/contrib/ogb/nodeproppred/dataset_pgl.py
@@ -56,7 +56,7 @@ class PglNodePropPredDataset(object):
         self.num_tasks = int(self.meta_info[self.name]["num tasks"])
         self.task_type = self.meta_info[self.name]["task type"]
         self.eval_metric = self.meta_info[self.name]["eval metric"]
-        self.__num_classes__ = int(self.meta_info[self.name]["num classes"])
+        self.num_classes = int(self.meta_info[self.name]["num classes"])
         self.is_hetero = self.meta_info[self.name]["is hetero"]
 
         super(PglNodePropPredDataset, self).__init__()
diff --git a/pgl/graph.py b/pgl/graph.py
index 493f8d5..f3a1b80 100644
--- a/pgl/graph.py
+++ b/pgl/graph.py
@@ -177,6 +177,8 @@ class Graph(object):
             os.makedirs(path)
         np.save(os.path.join(path, 'num_nodes.npy'), self._num_nodes)
         np.save(os.path.join(path, 'edges.npy'), self._edges)
+        np.save(os.path.join(path, 'num_graph.npy'), self._num_graph)
+        np.save(os.path.join(path, 'graph_lod.npy'), self._graph_lod)
 
         if self._adj_src_index:
             self._adj_src_index.dump(os.path.join(path, 'adj_src'))
@@ -201,11 +203,14 @@
         """
         load graph from dumped files.
""" if not os.path.exists(path): - raise ValueError("Not find path {}, can't load graph".format(path)) + raise ValueError("Can't find path {}, stop loading graph!".format( + path)) self._num_nodes = np.load(os.path.join(path, 'num_nodes.npy')) self._edges = np.load( os.path.join(path, 'edges.npy'), mmap_mode=mmap_mode) + self._num_graph = np.load(os.path.join(path, 'num_graph.npy')) + self._graph_lod = np.load(os.path.join(path, 'graph_lod.npy')) if os.path.isdir(os.path.join(path, 'adj_src')): edge_index = EdgeIndex() edge_index.load(os.path.join(path, 'adj_src'), mmap_mode=mmap_mode) -- GitLab