# ogbl_ppa_dataloader.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import

from dataloader.base_dataloader import BaseDataGenerator
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from ogb.linkproppred import LinkPropPredDataset
from ogb.linkproppred import Evaluator
import tqdm
from collections import namedtuple
import pgl
import numpy as np


class PPADataGenerator(BaseDataGenerator):
    """Batch generator over the edge splits of the ``ogbl-ppa`` dataset.

    On construction the requested split is loaded through
    ``LinkPropPredDataset`` and flattened into ``self.line_examples``, a list
    of ``Example(src, dst, label)`` tuples. ``batch_fn`` turns a batch of
    those examples into a feed dict of int64 arrays.
    """

    def __init__(self,
                 graph_wrapper=None,
                 buf_size=1000,
                 batch_size=128,
                 num_workers=1,
                 shuffle=True,
                 phase="train"):
        """Load the ``phase`` split of ``ogbl-ppa`` into memory.

        Args:
            graph_wrapper: Stored on ``self.graph_wrapper``; not used by this
                class itself.
            buf_size: Forwarded to ``BaseDataGenerator.__init__``.
            batch_size: Forwarded to ``BaseDataGenerator.__init__``.
            num_workers: Forwarded to ``BaseDataGenerator.__init__``.
            shuffle: Forwarded to ``BaseDataGenerator.__init__``.
            phase: One of ``"train"``, ``"valid"`` or ``"test"``.

        Raises:
            ValueError: If ``phase`` is not one of the supported splits.
        """
        # Fail fast, before the (expensive) dataset load. Previously an
        # unknown phase surfaced much later as a NameError on ``labels``.
        if phase not in ("train", "valid", "test"):
            raise ValueError(
                "phase must be one of 'train', 'valid' or 'test', got %r" %
                (phase, ))

        super(PPADataGenerator, self).__init__(
            buf_size=buf_size,
            num_workers=num_workers,
            batch_size=batch_size,
            shuffle=shuffle)

        self.d_name = "ogbl-ppa"
        self.graph_wrapper = graph_wrapper
        self.phase = phase

        dataset = LinkPropPredDataset(name=self.d_name)
        splitted_edge = dataset.get_edge_split()
        graph = dataset[0]
        self.num_nodes = graph["num_nodes"]

        split = splitted_edge[phase]
        if phase == "train":
            # The train split only provides positive edges; negatives are
            # drawn at random per batch in ``batch_fn``.
            edges = split["edge"]
            labels = np.ones(len(edges))
        else:
            # valid / test: positives followed by the split's own negatives.
            pos_edges = split["edge"]
            neg_edges = split["edge_neg"]
            edges = np.vstack([pos_edges, neg_edges])
            labels = (np.ones(len(pos_edges)).tolist() +
                      np.zeros(len(neg_edges)).tolist())

        Example = namedtuple('Example', ['src', "dst", "label"])
        self.line_examples = [
            Example(
                src=edge[0], dst=edge[1], label=label)
            for edge, label in zip(edges, labels)
        ]
        print("Phase", self.phase)
        print("Len Examples", len(self.line_examples))

    def batch_fn(self, batch_ex):
        """Convert a batch of examples into a feed dict of int64 arrays.

        In the ``"train"`` phase, ``len(batch_ex)`` extra pairs with label 0
        are appended. Their endpoints are drawn independently and uniformly
        from ``[0, self.num_nodes)``; they are not checked against the real
        edges.

        Args:
            batch_ex: Sequence of objects exposing ``src``, ``dst`` and
                ``label`` attributes.

        Returns:
            dict with keys ``"batch_src"``, ``"batch_dst"`` and ``"labels"``,
            each a 1-D ``numpy`` array of dtype int64.
        """
        batch_src = []
        batch_dst = []
        batch_labels = []
        for ex in batch_ex:
            batch_src.append(ex.src)
            batch_dst.append(ex.dst)
            batch_labels.append(ex.label)

        if self.phase == "train":
            num_neg = len(batch_ex)
            # Sources are sampled before destinations so that seeded runs
            # consume the random stream in a fixed order.
            rand_src = np.random.randint(
                low=0, high=self.num_nodes, size=num_neg)
            rand_dst = np.random.randint(
                low=0, high=self.num_nodes, size=num_neg)
            batch_src.extend(rand_src.tolist())
            batch_dst.extend(rand_dst.tolist())
            batch_labels.extend([0] * num_neg)

        return {
            "batch_src": np.array(batch_src, dtype="int64"),
            "batch_dst": np.array(batch_dst, dtype="int64"),
            "labels": np.array(batch_labels, dtype="int64"),
        }