add ogb PropPredDataset for pgl

752b6169 · liweibin · b46b2b1a · 752b6169 · 752b6169 · 752b6169
11 changed files
--- a/ogb_examples/linkproppred/main_pgl.py
+++ b/ogb_examples/linkproppred/main_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""test ogb
+"""
+import argparse
+
+import pgl
+import numpy as np
+import paddle.fluid as fluid
+from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
+from pgl.utils import paddle_helper
+from ogb.linkproppred import Evaluator
+
+
+def send_func(src_feat, dst_feat, edge_feat):
+    """Build the per-edge message: the source node's "h" feature.
+
+    dst_feat and edge_feat are part of the callback signature but are not
+    read by this function.
+    """
+    message = src_feat["h"]
+    return message
+
+
+def recv_func(feat):
+    """Reduce the received messages with a "sum" sequence pooling."""
+    pooled = fluid.layers.sequence_pool(feat, pool_type="sum")
+    return pooled
+
+
+class GNNModel(object):
+    """Message-passing network that scores candidate edges.
+
+    Node states start from a trainable [num_nodes, emb_dim] parameter, go
+    through num_layers propagation layers, and the element-wise product of
+    the two endpoint states is mapped to a single logit per edge.
+    """
+
+    def __init__(self, name, num_nodes, emb_dim, num_layers):
+        self.name = name
+        self.num_nodes = num_nodes
+        self.emb_dim = emb_dim
+        self.num_layers = num_layers
+
+        def _input(slot, dtype):
+            # One column per edge; the number of rows is left unspecified.
+            return fluid.layers.data(name=slot, shape=[None, 1], dtype=dtype)
+
+        # Feed slots: endpoints of the edges to score and their labels.
+        self.src_nodes = _input('src_nodes', 'int64')
+        self.dst_nodes = _input('dst_nodes', 'int64')
+        self.edge_label = _input('edge_label', 'float32')
+
+    def _propagate(self, graph, h, layer):
+        """Apply one layer: sum neighbour states, project, normalise, ReLU."""
+        messages = graph.send(send_func, nfeat_list=[("h", h)])
+        aggregated = graph.recv(messages, recv_func)
+        projected = fluid.layers.fc(
+            aggregated,
+            size=self.emb_dim,
+            bias_attr=False,
+            param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
+        scaled = projected * graph.node_feat["norm"]
+        # The bias is added after the normalisation, hence bias_attr=False above.
+        layer_bias = fluid.layers.create_parameter(
+            shape=[self.emb_dim],
+            dtype='float32',
+            is_bias=True,
+            name=self.name + '_bias_%s' % layer)
+        return fluid.layers.elementwise_add(scaled, layer_bias, act="relu")
+
+    def forward(self, graph):
+        """Build the network on graph and return (pred, prob, loss).
+
+        pred is the raw logit per fed edge, prob its sigmoid, and loss the
+        mean sigmoid cross-entropy against self.edge_label.
+        """
+        node_state = fluid.layers.create_parameter(
+            shape=[self.num_nodes, self.emb_dim],
+            dtype="float32",
+            name=self.name + "_embedding")
+
+        for layer_id in range(self.num_layers):
+            node_state = self._propagate(graph, node_state, layer_id)
+
+        src_state = fluid.layers.gather(node_state, self.src_nodes)
+        dst_state = fluid.layers.gather(node_state, self.dst_nodes)
+        logits = fluid.layers.fc(input=src_state * dst_state,
+                                 size=1,
+                                 name=self.name + "_pred_output")
+
+        prob = fluid.layers.sigmoid(logits)
+
+        per_edge_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            logits, self.edge_label)
+        loss = fluid.layers.reduce_mean(per_edge_loss)
+
+        return logits, prob, loss
+
+
+def main():
+    """Train and evaluate the GNN link predictor on an OGB link dataset.
+
+    Command-line flags:
+        --epochs: number of training epochs (default: 100).
+        --dataset: OGB link dataset name (default: "ogbl-ppa").
+
+    Every epoch runs one full-batch optimisation step on the training
+    edges, then scores the validation and test edges and prints the
+    evaluator results for both splits.
+    """
+    # Training settings
+    parser = argparse.ArgumentParser(description='Graph Dataset')
+    parser.add_argument(
+        '--epochs',
+        type=int,
+        default=100,
+        help='number of epochs to train (default: 100)')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default="ogbl-ppa",
+        help='dataset name (default: protein protein associations)')
+    args = parser.parse_args()
+
+    #place = fluid.CUDAPlace(0)
+    place = fluid.CPUPlace()  # Dataset too big to use GPU
+
+    ### automatic dataloading and splitting
+    print("loadding dataset")
+    dataset = PglLinkPropPredDataset(name=args.dataset)
+    splitted_edge = dataset.get_edge_split()
+    print(splitted_edge['train_edge'].shape)
+    print(splitted_edge['train_edge_label'].shape)
+
+    print("building evaluator")
+    ### automatic evaluator. takes dataset name as input
+    evaluator = Evaluator(args.dataset)
+
+    graph_data = dataset[0]
+    print("num_nodes: %d" % graph_data.num_nodes)
+
+    train_program = fluid.Program()
+    startup_program = fluid.Program()
+
+    # degree normalize: indegree ** -0.5, left at 0 for isolated nodes
+    indegree = graph_data.indegree()
+    norm = np.zeros_like(indegree, dtype="float32")
+    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
+    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
+
+    with fluid.program_guard(train_program, startup_program):
+        model = GNNModel(
+            name="gnn",
+            num_nodes=graph_data.num_nodes,
+            emb_dim=64,
+            num_layers=2)
+        gw = pgl.graph_wrapper.GraphWrapper(
+            "graph",
+            place,
+            node_feat=graph_data.node_feat_info(),
+            edge_feat=graph_data.edge_feat_info())
+        pred, prob, loss = model.forward(gw)
+
+    # Clone before the optimizer is attached so evaluation never updates
+    # the parameters.
+    val_program = train_program.clone(for_test=True)
+
+    with fluid.program_guard(train_program, startup_program):
+        adam = fluid.optimizer.Adam(
+            learning_rate=1e-2,
+            regularization=fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=0.0005))
+        adam.minimize(loss)
+
+    exe = fluid.Executor(place)
+    exe.run(startup_program)
+
+    # The edge inputs of each split do not change between epochs, so build
+    # them once instead of re-slicing the arrays in every iteration.
+    split_inputs = {}
+    for split in ("train", "valid", "test"):
+        edges = splitted_edge[split + "_edge"]
+        labels = splitted_edge[split + "_edge_label"]
+        split_inputs[split] = {
+            'src_nodes': edges[:, 0].reshape(-1, 1),
+            'dst_nodes': edges[:, 1].reshape(-1, 1),
+            'edge_label': labels.astype("float32").reshape(-1, 1),
+        }
+
+    # The graph part of the feed is shared; only the edge slots are swapped.
+    feed = gw.to_feed(graph_data)
+    for epoch in range(1, args.epochs + 1):
+        feed.update(split_inputs["train"])
+        res_loss = exe.run(train_program, feed=feed, fetch_list=[loss])[0]
+        print("Loss %s" % res_loss[0])
+
+        print("Evaluating...")
+        result = {}
+        for split in ("valid", "test"):
+            feed.update(split_inputs[split])
+            y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
+            result[split] = evaluator.eval({
+                "y_true": splitted_edge[split + "_edge_label"],
+                "y_pred": y_pred.reshape(-1, ),
+            })
+        print(result)
+
+
+# Run the link-prediction example only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/ogb_examples/nodeproppred/main_pgl.py
+++ b/ogb_examples/nodeproppred/main_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""test ogb
+"""
+import argparse
+
+import pgl
+import numpy as np
+import paddle.fluid as fluid
+from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
+from pgl.utils import paddle_helper
+from ogb.nodeproppred import Evaluator
+
+
+def train():
+    """Placeholder with no behaviour; always returns None."""
+    return None
+
+
+def send_func(src_feat, dst_feat, edge_feat):
+    """Build the per-edge message.
+
+    The source node's "h" feature and the edge's "h" feature are added and
+    the sum is scaled by the source node's "norm". dst_feat is not read.
+    """
+    combined = src_feat["h"] + edge_feat["h"]
+    return combined * src_feat["norm"]
+
+
+class GNNModel(object):
+    """Message-passing network producing num_task logits per node.
+
+    Node states come from an embedding lookup on the "x" node feature,
+    edge features are projected to emb_dim, and num_layers propagation
+    layers are applied before the final prediction layer.
+    """
+
+    def __init__(self, name, emb_dim, num_task, num_layers):
+        self.name = name
+        self.emb_dim = emb_dim
+        self.num_task = num_task
+        self.num_layers = num_layers
+
+    def _propagate(self, graph, h, edge_attr, layer):
+        """Apply one layer: sum messages, project, normalise, add bias, ReLU."""
+        norm = graph.node_feat["norm"]
+        messages = graph.send(
+            send_func,
+            nfeat_list=[("h", h), ("norm", norm)],
+            efeat_list=[("h", edge_attr)])
+        aggregated = graph.recv(messages, "sum")
+        projected = fluid.layers.fc(
+            aggregated,
+            size=self.emb_dim,
+            bias_attr=False,
+            param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
+        scaled = projected * norm
+        # The bias is added after the normalisation, hence bias_attr=False above.
+        layer_bias = fluid.layers.create_parameter(
+            shape=[self.emb_dim],
+            dtype='float32',
+            is_bias=True,
+            name=self.name + '_bias_%s' % layer)
+        return fluid.layers.elementwise_add(scaled, layer_bias, act="relu")
+
+    def forward(self, graph):
+        """Build the network on graph and return the per-node logits."""
+        node_state = fluid.layers.embedding(
+            graph.node_feat["x"], size=(2, self.emb_dim))
+        edge_state = fluid.layers.fc(graph.edge_feat["feat"],
+                                     size=self.emb_dim)
+
+        for layer_id in range(self.num_layers):
+            node_state = self._propagate(graph, node_state, edge_state,
+                                         layer_id)
+
+        return fluid.layers.fc(node_state,
+                               self.num_task,
+                               act=None,
+                               name=self.name + "_pred_output")
+
+
+def main():
+    """Train and evaluate the GNN node classifier on an OGB node dataset.
+
+    Command-line flags:
+        --epochs: number of training epochs (default: 100).
+        --dataset: OGB node dataset name (default: "ogbn-proteins").
+
+    Every epoch runs one full-graph optimisation step on the training
+    nodes, then predicts for all nodes and prints the evaluator results on
+    the train, valid and test splits.
+    """
+    # Training settings
+    parser = argparse.ArgumentParser(description='Graph Dataset')
+    parser.add_argument(
+        '--epochs',
+        type=int,
+        default=100,
+        help='number of epochs to train (default: 100)')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default="ogbn-proteins",
+        help='dataset name (default: proteinfunc)')
+    args = parser.parse_args()
+
+    #place = fluid.CUDAPlace(0)
+    place = fluid.CPUPlace()  # Dataset too big to use GPU
+
+    ### automatic dataloading and splitting
+    dataset = PglNodePropPredDataset(name=args.dataset)
+    splitted_idx = dataset.get_idx_split()
+
+    ### automatic evaluator. takes dataset name as input
+    evaluator = Evaluator(args.dataset)
+
+    graph_data, label = dataset[0]
+
+    train_program = fluid.Program()
+    startup_program = fluid.Program()
+
+    # degree normalize: indegree ** -0.5, left at 0 for isolated nodes
+    indegree = graph_data.indegree()
+    norm = np.zeros_like(indegree, dtype="float32")
+    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
+    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
+    # Every node gets the same id 0 as input to the embedding lookup.
+    graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
+    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
+        "float32")
+    model = GNNModel(
+        name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
+
+    with fluid.program_guard(train_program, startup_program):
+        gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
+        pred = model.forward(gw)
+        sigmoid_pred = fluid.layers.sigmoid(pred)
+
+    # Clone before the loss and optimizer are attached so evaluation never
+    # updates the parameters.
+    val_program = train_program.clone(for_test=True)
+
+    initializer = []
+    with fluid.program_guard(train_program, startup_program):
+        train_node_index, init = paddle_helper.constant(
+            "train_node_index", dtype="int64", value=splitted_idx["train"])
+        initializer.append(init)
+
+        train_node_label, init = paddle_helper.constant(
+            "train_node_label",
+            dtype="float32",
+            value=label[splitted_idx["train"]].astype("float32"))
+        initializer.append(init)
+        train_pred_t = fluid.layers.gather(pred, train_node_index)
+        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=train_pred_t, label=train_node_label)
+        train_loss_t = fluid.layers.reduce_sum(train_loss_t)
+        train_pred_t = fluid.layers.sigmoid(train_pred_t)
+
+        adam = fluid.optimizer.Adam(
+            learning_rate=1e-2,
+            regularization=fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=0.0005))
+        adam.minimize(train_loss_t)
+
+    exe = fluid.Executor(place)
+    exe.run(startup_program)
+    gw.initialize(place)
+    for init in initializer:
+        init(place)
+
+    # Graph, indices and labels live inside the program, so nothing is fed.
+    for epoch in range(1, args.epochs + 1):
+        loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
+        print("Loss %s" % loss[0])
+        print("Evaluating...")
+        y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]
+        result = {}
+        for split in ("train", "valid", "test"):
+            split_idx = splitted_idx[split]
+            result[split] = evaluator.eval({
+                "y_true": label[split_idx],
+                "y_pred": y_pred[split_idx],
+            })
+        print(result)
+
+
+# Run the node-prediction example only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/pgl/contrib/ogb/__init__.py
+++ b/pgl/contrib/ogb/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/pgl/contrib/ogb/graphproppred/__init__.py
+++ b/pgl/contrib/ogb/graphproppred/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""__init__.py"""
--- a/pgl/contrib/ogb/graphproppred/dataset_pgl.py
+++ b/pgl/contrib/ogb/graphproppred/dataset_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PglGraphPropPredDataset
+"""
+import pandas as pd
+import shutil, os
+import os.path as osp
+import numpy as np
+from ogb.utils.url import decide_download, download_url, extract_zip
+from ogb.graphproppred import make_master_file
+from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
+
+
+def to_bool(value):
+    """Convert value to a numpy boolean scalar via a one-element bool array."""
+    as_bool_array = np.array([value], dtype="bool")
+    return as_bool_array[0]
+
+
+class PglGraphPropPredDataset(object):
+    """OGB graph property prediction dataset loaded as PGL graphs.
+
+    Construction validates the dataset name against the OGB master table
+    and then runs pre_process(), which downloads the raw files when they
+    are missing and builds self.graphs and self.labels.
+
+    Args:
+        name: OGB dataset name, e.g. "ogbg-mol-tox21".
+        root: Directory under which the dataset folder is stored.
+
+    Raises:
+        ValueError: If name is not listed in the OGB master table.
+        SystemExit: If the download is needed but declined.
+    """
+
+    def __init__(self, name, root="dataset"):
+        self.name = name  ## original name, e.g., ogbg-mol-tox21
+        ## replace hyphen with underline, e.g., ogbg_mol_tox21_pgl
+        self.dir_name = "_".join(name.split("-")) + "_pgl"
+
+        self.original_root = root
+        self.root = osp.join(root, self.dir_name)
+
+        # Master table provided by ogb; indexed by dataset name.
+        self.meta_info = make_master_file.df
+        if self.name not in self.meta_info:
+            print(self.name)
+            error_mssg = "Invalid dataset name {}.\n".format(self.name)
+            error_mssg += "Available datasets are as follows:\n"
+            error_mssg += "\n".join(self.meta_info.keys())
+            raise ValueError(error_mssg)
+
+        self.download_name = self.meta_info[self.name][
+            "download_name"]  ## name of downloaded file, e.g., tox21
+
+        self.num_tasks = int(self.meta_info[self.name]["num tasks"])
+        self.task_type = self.meta_info[self.name]["task type"]
+
+        super(PglGraphPropPredDataset, self).__init__()
+
+        self.pre_process()
+
+    def _download(self):
+        """Fetch and unpack the raw archive into self.root."""
+        url = self.meta_info[self.name]["url"]
+        if not decide_download(url):
+            print("Stop download.")
+            # Same exit status as before, without relying on the
+            # site-provided exit() helper.
+            raise SystemExit(-1)
+
+        path = download_url(url, self.original_root)
+        extract_zip(path, self.original_root)
+        os.unlink(path)
+        # Best-effort removal of a stale dataset folder; a missing folder
+        # is not an error.
+        shutil.rmtree(self.root, ignore_errors=True)
+        shutil.move(
+            osp.join(self.original_root, self.download_name), self.root)
+
+    def pre_process(self):
+        """Download the raw data if needed and build graphs and labels.
+
+        NOTE: loading from the processed cache is not implemented yet; when
+        the cache file exists this method returns without setting
+        self.graphs or self.labels.
+        """
+        processed_dir = osp.join(self.root, 'processed')
+        raw_dir = osp.join(self.root, 'raw')
+        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
+
+        if os.path.exists(pre_processed_file_path):
+            # TODO: Load Preprocessed
+            return
+
+        ### download only when the raw files are not already on disk
+        if not osp.exists(osp.join(raw_dir, "edge.csv.gz")):
+            self._download()
+
+        ### preprocess
+        add_inverse_edge = to_bool(self.meta_info[self.name][
+            "add_inverse_edge"])
+        self.graphs = read_csv_graph_pgl(
+            raw_dir, add_inverse_edge=add_inverse_edge)
+        self.graphs = np.array(self.graphs)
+        self.labels = np.array(
+            pd.read_csv(
+                osp.join(raw_dir, "graph-label.csv.gz"),
+                compression="gzip",
+                header=None).values)
+
+        # TODO: Save the preprocessed graphs
+
+    def get_idx_split(self):
+        """Return the train/valid/test graph indices.
+
+        Returns:
+            dict with keys "train", "valid" and "test", each an int64
+            array read from the first column of the split CSV file.
+        """
+        split_type = self.meta_info[self.name]["split"]
+        path = osp.join(self.root, "split", split_type)
+
+        split_idx = {}
+        for split in ("train", "valid", "test"):
+            idx = pd.read_csv(
+                osp.join(path, split + ".csv.gz"),
+                compression="gzip",
+                header=None).values.T[0]
+            split_idx[split] = np.array(idx, dtype="int64")
+        return split_idx
+
+    def __getitem__(self, idx):
+        """Return (graphs, labels) selected by idx from the stored arrays."""
+        return self.graphs[idx], self.labels[idx]
+
+    def __len__(self):
+        """Return the number of graphs in the dataset."""
+        return len(self.graphs)
+
+    def __repr__(self):  # pragma: no cover
+        return '{}({})'.format(self.__class__.__name__, len(self))
+
+
+# Manual smoke test: constructing the dataset runs pre_process(), which
+# downloads the data when it is not available locally.
+if __name__ == "__main__":
+    pgl_dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
+    splitted_index = pgl_dataset.get_idx_split()
+    print(pgl_dataset)
+    # Slicing returns the graphs and labels for positions 3..19.
+    print(pgl_dataset[3:20])
+    #print(pgl_dataset[splitted_index["train"]])
+    #print(pgl_dataset[splitted_index["valid"]])
+    #print(pgl_dataset[splitted_index["test"]])
--- a/pgl/contrib/ogb/io/__init__.py
+++ b/pgl/contrib/ogb/io/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""__init__.py
+"""
--- a/pgl/contrib/ogb/io/read_graph_pgl.py
+++ b/pgl/contrib/ogb/io/read_graph_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""pgl read_csv_graph for ogb
+"""
+
+import pandas as pd
+import os.path as osp
+import numpy as np
+import pgl
+from ogb.io.read_graph_raw import read_csv_graph_raw
+
+
+def read_csv_graph_pgl(raw_dir, add_inverse_edge=False):
+    """Read the raw CSV graphs in raw_dir and convert each to a PGL graph.
+
+    Args:
+        raw_dir: Directory handed to read_csv_graph_raw.
+        add_inverse_edge: Forwarded unchanged to read_csv_graph_raw.
+
+    Returns:
+        A list of pgl.graph.Graph objects, one per raw graph. Edge and node
+        features, when present, are stored under the key "feat".
+    """
+
+    def _to_pgl(raw):
+        # edge_index holds the sources in row 0 and the targets in row 1.
+        sources, targets = raw["edge_index"][0], raw["edge_index"][1]
+        converted = pgl.graph.Graph(
+            num_nodes=raw["num_nodes"], edges=list(zip(sources, targets)))
+
+        if raw["edge_feat"] is not None:
+            converted.edge_feat["feat"] = raw["edge_feat"]
+
+        if raw["node_feat"] is not None:
+            converted.node_feat["feat"] = raw["node_feat"]
+
+        return converted
+
+    raw_graphs = read_csv_graph_raw(raw_dir, add_inverse_edge)
+    return [_to_pgl(raw) for raw in raw_graphs]
+
+
+# Manual smoke test: expects the raw ogbn-proteins files to already be
+# present under dataset/ogbn_proteins_pgl/raw.
+if __name__ == "__main__":
+    # Earlier variant kept for reference (DGL reader, old dataset folder):
+    # graph_list = read_csv_graph_dgl('dataset/proteinfunc_v2/raw', add_inverse_edge = True)
+    graph_list = read_csv_graph_pgl(
+        'dataset/ogbn_proteins_pgl/raw', add_inverse_edge=True)
+    print(graph_list)
--- a/pgl/contrib/ogb/linkproppred/__init__.py
+++ b/pgl/contrib/ogb/linkproppred/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""__init__.py
+"""
--- a/pgl/contrib/ogb/linkproppred/dataset_pgl.py
+++ b/pgl/contrib/ogb/linkproppred/dataset_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LinkPropPredDataset for pgl
+"""
+import pandas as pd
+import shutil, os
+import os.path as osp
+import numpy as np
+from ogb.utils.url import decide_download, download_url, extract_zip
+from ogb.linkproppred import make_master_file
+from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
+
+
+def to_bool(value):
+    """Convert value to a numpy boolean scalar via a one-element bool array."""
+    as_bool_array = np.array([value], dtype="bool")
+    return as_bool_array[0]
+
+
+class PglLinkPropPredDataset(object):
+    """OGB link property prediction dataset loaded as a PGL graph.
+
+    Construction validates the dataset name against the OGB master table
+    and then runs pre_process(), which downloads the raw files when they
+    are missing and builds self.graph.
+
+    Args:
+        name: OGB dataset name, e.g. "ogbl-ppa".
+        root: Directory under which the dataset folder is stored.
+
+    Raises:
+        ValueError: If name is not listed in the OGB master table.
+        SystemExit: If the download is needed but declined.
+    """
+
+    def __init__(self, name, root="dataset"):
+        self.name = name  ## original name, e.g., ogbl-ppa
+        ## replace hyphen with underline, e.g., ogbl_ppa_pgl
+        self.dir_name = "_".join(name.split("-")) + "_pgl"
+
+        self.original_root = root
+        self.root = osp.join(root, self.dir_name)
+
+        # Master table provided by ogb; indexed by dataset name.
+        self.meta_info = make_master_file.df
+        if self.name not in self.meta_info:
+            print(self.name)
+            error_mssg = "Invalid dataset name {}.\n".format(self.name)
+            error_mssg += "Available datasets are as follows:\n"
+            error_mssg += "\n".join(self.meta_info.keys())
+            raise ValueError(error_mssg)
+
+        self.download_name = self.meta_info[self.name][
+            "download_name"]  ## name of downloaded file, e.g., ppassoc
+
+        self.task_type = self.meta_info[self.name]["task type"]
+
+        super(PglLinkPropPredDataset, self).__init__()
+
+        self.pre_process()
+
+    def _download(self):
+        """Fetch and unpack the raw archive into self.root."""
+        url = self.meta_info[self.name]["url"]
+        if not decide_download(url):
+            print("Stop download.")
+            # Same exit status as before, without relying on the
+            # site-provided exit() helper.
+            raise SystemExit(-1)
+
+        path = download_url(url, self.original_root)
+        extract_zip(path, self.original_root)
+        os.unlink(path)
+        # Best-effort removal of a stale dataset folder; a missing folder
+        # is not an error.
+        shutil.rmtree(self.root, ignore_errors=True)
+        shutil.move(
+            osp.join(self.original_root, self.download_name), self.root)
+
+    def pre_process(self):
+        """Download the raw data if needed and build the PGL graph.
+
+        NOTE: loading from the processed cache is not implemented yet; when
+        the cache file exists this method returns without setting
+        self.graph.
+        """
+        processed_dir = osp.join(self.root, 'processed')
+        # Cache file name aligned with the other PGL dataset wrappers.
+        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
+
+        if osp.exists(pre_processed_file_path):
+            #TODO: Reload Preprocess files
+            return
+
+        raw_dir = osp.join(self.root, "raw")
+
+        ### check download
+        if not osp.exists(osp.join(raw_dir, "edge.csv.gz")):
+            self._download()
+
+        ### pre-process and save
+        add_inverse_edge = to_bool(self.meta_info[self.name][
+            "add_inverse_edge"])
+        self.graph = read_csv_graph_pgl(
+            raw_dir, add_inverse_edge=add_inverse_edge)
+
+        #TODO: SAVE preprocess graph
+
+    def get_edge_split(self):
+        """Return the train/valid/test edges and their labels.
+
+        Each split CSV is read as rows whose first two columns are the edge
+        endpoints and whose third column is the label.
+
+        Returns:
+            dict with keys "<split>_edge" (int64, two columns) and
+            "<split>_edge_label" for the splits train, valid and test.
+            Labels are int64 when task_type is "link prediction" and
+            float32 otherwise.
+        """
+        split_type = self.meta_info[self.name]["split"]
+        path = osp.join(self.root, "split", split_type)
+
+        if self.task_type == "link prediction":
+            target_type = np.int64
+        else:
+            target_type = np.float32
+
+        edge_split = {}
+        for split in ("train", "valid", "test"):
+            rows = pd.read_csv(
+                osp.join(path, split + ".csv.gz"),
+                compression="gzip",
+                header=None).values
+            edge_split[split + "_edge"] = np.array(
+                rows[:, :2], dtype="int64")
+            edge_split[split + "_edge_label"] = np.array(
+                rows[:, 2], dtype=target_type)
+        return edge_split
+
+    def __getitem__(self, idx):
+        """Return the single graph of the dataset; idx must be 0."""
+        assert idx == 0, "This dataset has only one graph"
+        return self.graph[0]
+
+    def __len__(self):
+        """Return 1: the dataset holds exactly one graph."""
+        return 1
+
+    def __repr__(self):  # pragma: no cover
+        return '{}({})'.format(self.__class__.__name__, len(self))
+
+
+# Manual smoke test: constructing the dataset runs pre_process(), which
+# downloads the data when it is not available locally.
+if __name__ == "__main__":
+    pgl_dataset = PglLinkPropPredDataset(name="ogbl-ppa")
+    splitted_edge = pgl_dataset.get_edge_split()
+    print(pgl_dataset[0])
+    print(splitted_edge)
--- a/pgl/contrib/ogb/nodeproppred/__init__.py
+++ b/pgl/contrib/ogb/nodeproppred/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""__init__.py
+"""
--- a/pgl/contrib/ogb/nodeproppred/dataset_pgl.py
+++ b/pgl/contrib/ogb/nodeproppred/dataset_pgl.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NodePropPredDataset for pgl
+"""
+import pandas as pd
+import shutil, os
+import os.path as osp
+import numpy as np
+from ogb.utils.url import decide_download, download_url, extract_zip
+from ogb.nodeproppred import make_master_file  # create master.csv
+from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
+
+
+def to_bool(value):
+    """Convert value to a numpy boolean scalar via a one-element bool array."""
+    as_bool_array = np.array([value], dtype="bool")
+    return as_bool_array[0]
+
+
+class PglNodePropPredDataset(object):
+    """OGB node property prediction dataset loaded through PGL's reader.
+
+    On construction the dataset metadata is looked up in the OGB master
+    table, the raw files are downloaded (after user confirmation) when
+    they are not present under ``root``, and the graph and node labels
+    are loaded into memory.
+
+    Args:
+        name: Original OGB dataset name, e.g. ``ogbn-proteins``.
+        root: Directory under which the dataset folder is stored.
+
+    Attributes:
+        graph: Value returned by ``read_csv_graph_pgl`` for the raw
+            folder; ``__getitem__`` indexes it with ``[0]``.
+        labels: Node labels as a numpy array, ``int64`` when the task
+            type contains "classification" and ``float32`` otherwise.
+        num_tasks: Number of tasks taken from the master table.
+        task_type: Task type string taken from the master table.
+
+    Raises:
+        ValueError: If ``name`` is not present in the master table.
+    """
+
+    def __init__(self, name, root="dataset"):
+        self.name = name  # original name, e.g., ogbn-proteins
+        # Replace hyphens with underscores, e.g., ogbn_proteins_pgl.
+        self.dir_name = "_".join(name.split("-")) + "_pgl"
+
+        self.original_root = root
+        self.root = osp.join(root, self.dir_name)
+
+        # OGB master table (``make_master_file.df``), looked up by
+        # dataset name.
+        self.meta_info = make_master_file.df
+        if self.name not in self.meta_info:
+            error_mssg = "Invalid dataset name {}.\n".format(self.name)
+            error_mssg += "Available datasets are as follows:\n"
+            error_mssg += "\n".join(self.meta_info.keys())
+            raise ValueError(error_mssg)
+
+        dataset_meta = self.meta_info[self.name]
+        # Folder name expected under ``original_root`` after extraction.
+        self.download_name = dataset_meta["download_name"]
+        self.num_tasks = int(dataset_meta["num tasks"])
+        self.task_type = dataset_meta["task type"]
+
+        super(PglNodePropPredDataset, self).__init__()
+
+        self.pre_process()
+
+    def pre_process(self):
+        """Download the raw data if needed, then load graph and labels.
+
+        Sets ``self.graph`` and ``self.labels``.
+        """
+        # TODO: save the processed graph to, and reload it from,
+        # osp.join(self.root, 'processed', 'pgl_data_processed').
+        # Nothing writes that file yet, so the data is always rebuilt
+        # from the raw files; this keeps ``graph`` and ``labels``
+        # defined even when a file with that name already exists.
+        self._download_raw_if_missing()
+
+        raw_dir = osp.join(self.root, "raw")
+
+        add_inverse_edge = to_bool(self.meta_info[self.name][
+            "add_inverse_edge"])
+        self.graph = read_csv_graph_pgl(
+            raw_dir, add_inverse_edge=add_inverse_edge)
+
+        # Prediction targets.
+        node_label = pd.read_csv(
+            osp.join(raw_dir, 'node-label.csv.gz'),
+            compression="gzip",
+            header=None).values
+        if "classification" in self.task_type:
+            label_dtype = np.int64
+        else:
+            label_dtype = np.float32
+        self.labels = np.array(node_label, dtype=label_dtype)
+
+    def _download_raw_if_missing(self):
+        """Fetch and unpack the dataset archive unless raw edges exist.
+
+        Raises:
+            SystemExit: With code -1 when the download is declined.
+        """
+        if osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
+            return
+
+        url = self.meta_info[self.name]["url"]
+        if not decide_download(url):
+            print("Stop download.")
+            # Same exception as exit(-1), without depending on the
+            # builtin that the ``site`` module injects.
+            raise SystemExit(-1)
+
+        path = download_url(url, self.original_root)
+        extract_zip(path, self.original_root)
+        os.unlink(path)
+        # Best-effort removal of a stale dataset folder before the
+        # freshly extracted one is moved into place.
+        shutil.rmtree(self.root, ignore_errors=True)
+        shutil.move(
+            osp.join(self.original_root, self.download_name), self.root)
+
+    def get_idx_split(self):
+        """Return the train/validation/test index split.
+
+        Returns:
+            A dict with keys ``"train"``, ``"valid"`` and ``"test"``.
+            Each value is a 1-D ``int64`` numpy array holding the first
+            column of ``<root>/split/<split type>/<key>.csv.gz``.
+        """
+        split_type = self.meta_info[self.name]["split"]
+        path = osp.join(self.root, "split", split_type)
+
+        def _read_idx(part):
+            # The files have no header row; only the first column is used.
+            column = pd.read_csv(
+                osp.join(path, part + ".csv.gz"),
+                compression="gzip",
+                header=None).values.T[0]
+            return np.array(column, dtype="int64")
+
+        return {part: _read_idx(part) for part in ("train", "valid", "test")}
+
+    def __getitem__(self, idx):
+        """Return ``(graph, labels)``; only ``idx == 0`` is valid."""
+        assert idx == 0, "This dataset has only one graph"
+        return self.graph[idx], self.labels
+
+    def __len__(self):
+        """Return the number of graphs, which is always 1."""
+        return 1
+
+    def __repr__(self):  # pragma: no cover
+        return '{}({})'.format(self.__class__.__name__, len(self))
+
+
+if __name__ == "__main__":
+    # Manual smoke test: build the ogbn-proteins dataset, then print its
+    # single (graph, labels) entry followed by the index split.
+    demo_dataset = PglNodePropPredDataset(name="ogbn-proteins")
+    idx_split = demo_dataset.get_idx_split()
+    for item in (demo_dataset[0], idx_split):
+        print(item)