# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NodePropPredDataset for pgl
"""
import os
import os.path as osp
import shutil
import sys

import numpy as np
import pandas as pd
import pgl
from ogb.io.read_graph_raw import read_node_label_hetero, read_nodesplitidx_split_hetero
from ogb.nodeproppred import make_master_file  # create master.csv
from ogb.utils.url import decide_download, download_url, extract_zip
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl, read_csv_heterograph_pgl


def to_bool(value):
    """Convert a flag value to a numpy boolean.

    Args:
        value: The flag to convert. The strings ``"True"`` and ``"False"``
            (any letter case, surrounding whitespace ignored) are parsed
            textually; every other value is cast through a one-element
            numpy ``bool`` array.

    Returns:
        numpy.bool_: The boolean interpretation of ``value``.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ("true", "false"):
            # Parse explicitly: a plain numpy cast of the text "False" is not
            # guaranteed to yield False.
            return np.bool_(text == "true")
    return np.array([value], dtype="bool")[0]


class PglNodePropPredDataset(object):
    """Node property prediction dataset exposed as PGL graphs.

    On construction the dataset is either reloaded from the processed cache
    under ``<root>/<dir_name>/processed/pgl_data_processed`` or downloaded,
    converted from the raw CSV files and then written to that cache.

    Attributes set by ``__init__``: ``name``, ``dir_name``, ``original_root``,
    ``root``, ``meta_info``, ``download_name``, ``num_tasks``, ``task_type``,
    ``eval_metric``, ``num_classes``, ``is_hetero``; ``pre_process`` adds
    ``graph`` and ``labels``.
    """

    def __init__(self, name, root="dataset"):
        """Look up the dataset metadata and load (or build) the data.

        Args:
            name: Original dataset name, e.g. ``ogbn-proteins``.
            root: Directory under which the dataset folder is stored.

        Raises:
            ValueError: If ``name`` is not present in the metadata table.
        """
        self.name = name  # original name, e.g., ogbn-proteins
        # Replace hyphens with underscores, e.g., ogbn_proteins_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"

        self.original_root = root
        self.root = osp.join(root, self.dir_name)

        # The table is taken from the in-memory master table rather than read
        # from a "master.csv" file next to this module.
        self.meta_info = make_master_file.df
        if self.name not in self.meta_info:
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)

        meta = self.meta_info[self.name]
        # Name of the downloaded file, e.g., tox21.
        self.download_name = meta["download_name"]

        self.num_tasks = int(meta["num tasks"])
        self.task_type = meta["task type"]
        self.eval_metric = meta["eval metric"]
        self.num_classes = int(meta["num classes"])
        # NOTE(review): stored exactly as found in the table; a textual
        # "False" would be truthy in the `if self.is_hetero` checks below.
        self.is_hetero = meta["is hetero"]

        super(PglNodePropPredDataset, self).__init__()

        self.pre_process()

    def pre_process(self):
        """Load the processed cache, or download, convert and cache the data.

        Sets ``self.graph`` and ``self.labels``. When the cache is built from
        the raw files, ``self.raw_dir`` is set as well.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir,
                                           'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            self._load_processed(pre_processed_file_path)
            return

        self._download_raw_if_missing()

        raw_dir = osp.join(self.root, "raw")
        self.raw_dir = raw_dir

        meta = self.meta_info[self.name]
        # Convert exactly once. The flag used to be recomputed as
        # `value == "True"`, which is always False for a real boolean and
        # therefore silently disabled inverse edges.
        add_inverse_edge = bool(to_bool(meta["add_inverse_edge"]))

        additional_node_files = self._parse_file_list(
            meta["additional node files"])
        additional_edge_files = self._parse_file_list(
            meta["additional edge files"])

        is_classification = "classification" in self.task_type

        if self.is_hetero:
            self.graph = read_csv_heterograph_pgl(
                self.raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)

            node_label_dict = read_node_label_hetero(self.raw_dir)
            y_dict = {}
            for nodetype, node_label in node_label_dict.items():
                # Classification labels are integers unless a NaN is present;
                # everything else is stored as float32.
                if is_classification and not np.isnan(node_label).any():
                    label_dtype = 'int64'
                else:
                    label_dtype = 'float32'
                y_dict[nodetype] = np.array(node_label, dtype=label_dtype)
            self.labels = y_dict
            # NOTE(review): `node_label` below is the raw label array of the
            # LAST node type only, so the cache written at the end does not
            # hold the full label dictionary (heterogeneous caching is still
            # a TODO).
        else:
            self.graph = read_csv_graph_pgl(
                raw_dir, add_inverse_edge=add_inverse_edge)

            # Adding prediction target.
            node_label = pd.read_csv(
                osp.join(raw_dir, 'node-label.csv.gz'),
                compression="gzip",
                header=None).values
            if is_classification:
                node_label = np.array(node_label, dtype=np.int64)
            else:
                node_label = np.array(node_label, dtype=np.float32)
            self.labels = node_label

        # Save the pre-processed graphs and labels. Make sure the cache
        # directory exists before anything is written into it.
        os.makedirs(pre_processed_file_path, exist_ok=True)
        for i, graph in enumerate(self.graph):
            graph.dump(
                os.path.join(pre_processed_file_path, "graph_{}".format(i)))
        np.save(
            os.path.join(pre_processed_file_path, "node_label.npy"),
            node_label)

    @staticmethod
    def _parse_file_list(value):
        """Split a comma-separated metadata entry; ``'None'`` means no files."""
        if value == 'None':
            return []
        return value.split(',')

    def _load_processed(self, pre_processed_file_path):
        """Reload graphs and labels from the processed cache directory."""
        # TODO: add support for heterogenous graph.
        self.graph = []
        if os.path.isdir(pre_processed_file_path):
            # Graphs are stored as graph_0, graph_1, ...; the number of
            # directory entries is an upper bound on the graph count.
            for i in range(len(os.listdir(pre_processed_file_path))):
                graph_path = os.path.join(pre_processed_file_path,
                                          "graph_{}".format(i))
                if os.path.exists(graph_path):
                    self.graph.append(pgl.graph.Graph().load(graph_path))
            self.labels = np.load(
                os.path.join(pre_processed_file_path, "node_label.npy"))

    def _download_raw_if_missing(self):
        """Download and unpack the dataset unless ``<root>/raw`` exists.

        Exits the process with status -1 when the download is declined.
        """
        if osp.exists(osp.join(self.root, "raw")):
            return

        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            sys.exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Delete the target folder if it exists; failures are ignored on
        # purpose (best effort) without swallowing KeyboardInterrupt.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(
            osp.join(self.original_root, self.download_name), self.root)

    def get_idx_split(self):
        """Train/Validation/Test split

        Returns:
            dict: Keys ``"train"``, ``"valid"`` and ``"test"``. For a
            heterogeneous dataset each value is a dict mapping node type to
            an int64 index array; otherwise each value is an int64 index
            array.
        """
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        if self.is_hetero:
            train_idx_dict, valid_idx_dict, test_idx_dict = \
                read_nodesplitidx_split_hetero(path)
            for nodetype in train_idx_dict:
                train_idx_dict[nodetype] = np.array(
                    train_idx_dict[nodetype], dtype='int64')
                valid_idx_dict[nodetype] = np.array(
                    valid_idx_dict[nodetype], dtype='int64')
                test_idx_dict[nodetype] = np.array(
                    test_idx_dict[nodetype], dtype='int64')
            # code refers dataset_pyg
            # TODO: check the code
            return {
                "train": train_idx_dict,
                "valid": valid_idx_dict,
                "test": test_idx_dict
            }

        def read_split(file_name):
            # The first column of the gzipped CSV holds the node indices.
            column = pd.read_csv(
                osp.join(path, file_name), compression="gzip",
                header=None).values.T[0]
            return np.array(column, dtype="int64")

        return {
            "train": read_split("train.csv.gz"),
            "valid": read_split("valid.csv.gz"),
            "test": read_split("test.csv.gz")
        }

    def __getitem__(self, idx):
        """Return ``(graph, labels)`` for the single graph at index 0."""
        assert idx == 0, "This dataset has only one graph"
        return self.graph[idx], self.labels

    def __len__(self):
        """The dataset always holds exactly one graph."""
        return 1

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))


if __name__ == "__main__":
    pgl_dataset = PglNodePropPredDataset(name="ogbn-mag")
    splitted_index = pgl_dataset.get_idx_split()
    print(pgl_dataset[0])
    print(splitted_index)