提交 752b6169 编写于 作者: L liweibin

add ogb PropPredDataset for pgl

上级 b46b2b1a
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
from pgl.utils import paddle_helper
from ogb.linkproppred import Evaluator
def send_func(src_feat, dst_feat, edge_feat):
    """Message function: pass the source node's "h" feature along the edge.

    Args:
        src_feat: Mapping of source-node features; only "h" is read.
        dst_feat: Destination-node features (unused).
        edge_feat: Edge features (unused).

    Returns:
        The value stored under ``src_feat["h"]``, unchanged.
    """
    message = src_feat["h"]
    return message
def recv_func(feat):
    """Reduce function: sum-pool ``feat`` with ``fluid.layers.sequence_pool``.

    Passed to ``graph.recv`` by the model in this file.
    """
    pooled = fluid.layers.sequence_pool(feat, pool_type="sum")
    return pooled
class GNNModel(object):
    """Link-prediction GNN built on a learnable per-node embedding table.

    The constructor declares three feed placeholders of shape ``[None, 1]``:
    ``src_nodes`` and ``dst_nodes`` (int64) and ``edge_label`` (float32).
    ``forward`` builds the rest of the network on a graph wrapper.
    """

    def __init__(self, name, num_nodes, emb_dim, num_layers):
        """Store the hyper-parameters and declare the edge inputs.

        Args:
            name: Prefix for every parameter name created in ``forward``.
            num_nodes: Number of rows in the embedding table.
            emb_dim: Width of the embedding and of each hidden layer.
            num_layers: Number of message-passing layers.
        """
        self.name = name
        self.num_nodes = num_nodes
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        # Placeholders are created in the same order as before:
        # source ids, destination ids, then labels.
        self.src_nodes = fluid.layers.data(
            name='src_nodes', shape=[None, 1], dtype='int64')
        self.dst_nodes = fluid.layers.data(
            name='dst_nodes', shape=[None, 1], dtype='int64')
        self.edge_label = fluid.layers.data(
            name='edge_label', shape=[None, 1], dtype='float32')

    def forward(self, graph):
        """Build the network on ``graph`` and return ``(pred, prob, loss)``.

        Each layer sends the current node representation along the edges,
        reduces the messages with ``recv_func``, applies a bias-free ``fc``,
        multiplies by ``graph.node_feat["norm"]`` and adds a bias with ReLU.
        The rows selected by ``src_nodes`` and ``dst_nodes`` are multiplied
        element-wise and projected to a single logit.

        Returns:
            pred: Output of the final size-1 ``fc`` layer.
            prob: ``sigmoid(pred)``.
            loss: Mean of ``sigmoid_cross_entropy_with_logits(pred,
                edge_label)``.
        """
        node_repr = fluid.layers.create_parameter(
            shape=[self.num_nodes, self.emb_dim],
            dtype="float32",
            name=self.name + "_embedding")

        for layer in range(self.num_layers):
            messages = graph.send(send_func, nfeat_list=[("h", node_repr)])
            aggregated = graph.recv(messages, recv_func)
            projected = fluid.layers.fc(
                aggregated,
                size=self.emb_dim,
                bias_attr=False,
                param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
            scaled = projected * graph.node_feat["norm"]
            layer_bias = fluid.layers.create_parameter(
                shape=[self.emb_dim],
                dtype='float32',
                is_bias=True,
                name=self.name + '_bias_%s' % layer)
            node_repr = fluid.layers.elementwise_add(
                scaled, layer_bias, act="relu")

        src_repr = fluid.layers.gather(node_repr, self.src_nodes)
        dst_repr = fluid.layers.gather(node_repr, self.dst_nodes)
        pred = fluid.layers.fc(
            input=src_repr * dst_repr,
            size=1,
            name=self.name + "_pred_output")
        prob = fluid.layers.sigmoid(pred)
        per_edge_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
            pred, self.edge_label)
        loss = fluid.layers.reduce_mean(per_edge_loss)
        return pred, prob, loss
def _set_edge_feed(feed, splitted_edge, split):
    """Point the edge inputs in ``feed`` at one split of ``splitted_edge``.

    ``split`` is "train", "valid" or "test"; the arrays are read from the
    ``"<split>_edge"`` and ``"<split>_edge_label"`` entries.
    """
    edges = splitted_edge["%s_edge" % split]
    feed['src_nodes'] = edges[:, 0].reshape(-1, 1)
    feed['dst_nodes'] = edges[:, 1].reshape(-1, 1)
    feed['edge_label'] = splitted_edge["%s_edge_label" % split].astype(
        "float32").reshape(-1, 1)


def main():
    """Train the link-prediction GNN and report valid/test metrics.

    Command-line flags: ``--epochs`` (default 100) and ``--dataset``
    (default "ogbl-ppa"). Each epoch runs one full-batch training step and
    then evaluates the validation and test edges with the OGB evaluator.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument(
        '--epochs',
        type=int,
        default=100,
        help='number of epochs to train (default: 100)')
    parser.add_argument(
        '--dataset',
        type=str,
        default="ogbl-ppa",
        help='dataset name (default: protein protein associations)')
    args = parser.parse_args()

    # Use fluid.CUDAPlace(0) to run on GPU instead.
    place = fluid.CPUPlace()  # Dataset too big to use GPU

    # Automatic data loading and splitting.
    print("loadding dataset")
    dataset = PglLinkPropPredDataset(name=args.dataset)
    splitted_edge = dataset.get_edge_split()
    print(splitted_edge['train_edge'].shape)
    print(splitted_edge['train_edge_label'].shape)

    # Automatic evaluator; takes the dataset name as input.
    print("building evaluator")
    evaluator = Evaluator(args.dataset)

    graph_data = dataset[0]
    print("num_nodes: %d" % graph_data.num_nodes)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # Degree normalisation: indegree ** -0.5, left at 0 for isolated nodes.
    indegree = graph_data.indegree()
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")

    with fluid.program_guard(train_program, startup_program):
        model = GNNModel(
            name="gnn",
            num_nodes=graph_data.num_nodes,
            emb_dim=64,
            num_layers=2)
        gw = pgl.graph_wrapper.GraphWrapper(
            "graph",
            place,
            node_feat=graph_data.node_feat_info(),
            edge_feat=graph_data.edge_feat_info())
        pred, prob, loss = model.forward(gw)

    # Clone before the optimizer ops are appended so that the evaluation
    # program contains the forward pass only.
    val_program = train_program.clone(for_test=True)

    with fluid.program_guard(train_program, startup_program):
        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(loss)

    exe = fluid.Executor(place)
    exe.run(startup_program)
    feed = gw.to_feed(graph_data)

    # NOTE(review): evaluation is assumed to run once per epoch; the train
    # edges are re-fed at the top of each iteration because evaluation
    # overwrites the edge inputs in ``feed``.
    for epoch in range(1, args.epochs + 1):
        _set_edge_feed(feed, splitted_edge, "train")
        # Only the loss is fetched; training predictions were never used.
        res_loss = exe.run(train_program, feed=feed, fetch_list=[loss])[0]
        print("Loss %s" % res_loss[0])

        print("Evaluating...")
        result = {}
        for split in ("valid", "test"):
            _set_edge_feed(feed, splitted_edge, split)
            y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
            result[split] = evaluator.eval({
                "y_true": splitted_edge["%s_edge_label" % split],
                "y_pred": y_pred.reshape(-1, ),
            })
        print(result)


if __name__ == "__main__":
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
from pgl.utils import paddle_helper
from ogb.nodeproppred import Evaluator
def train():
    """Placeholder with no behaviour; returns ``None``."""
    return None
def send_func(src_feat, dst_feat, edge_feat):
    """Message function: ``(src "h" + edge "h") * src "norm"``.

    Args:
        src_feat: Mapping of source-node features; "h" and "norm" are read.
        dst_feat: Destination-node features (unused).
        edge_feat: Mapping of edge features; "h" is read.

    Returns:
        The sum of the source and edge "h" features scaled by the source
        node's "norm" feature.
    """
    combined = src_feat["h"] + edge_feat["h"]
    return combined * src_feat["norm"]
class GNNModel(object):
    """Node-property GNN combining node embeddings with edge features."""

    def __init__(self, name, emb_dim, num_task, num_layers):
        """Store the hyper-parameters; no layers are created here.

        Args:
            name: Prefix for the parameter names created in ``forward``.
            emb_dim: Width of the embedding and of each hidden layer.
            num_task: Output size of the final prediction layer.
            num_layers: Number of message-passing layers.
        """
        self.name = name
        self.emb_dim = emb_dim
        self.num_task = num_task
        self.num_layers = num_layers

    def forward(self, graph):
        """Build the network on ``graph`` and return the raw predictions.

        Node inputs come from an embedding lookup on ``graph.node_feat["x"]``
        and edge inputs from an ``fc`` over ``graph.edge_feat["feat"]``. Each
        layer sends messages with ``send_func``, sums them, applies a
        bias-free ``fc``, multiplies by the "norm" node feature and adds a
        bias with ReLU. A final ``fc`` without activation yields ``num_task``
        outputs per node.
        """
        node_repr = fluid.layers.embedding(
            graph.node_feat["x"],
            size=(2, self.emb_dim))
        edge_repr = fluid.layers.fc(
            graph.edge_feat["feat"], size=self.emb_dim)
        norm = graph.node_feat["norm"]

        for layer in range(self.num_layers):
            messages = graph.send(
                send_func,
                nfeat_list=[("h", node_repr), ("norm", norm)],
                efeat_list=[("h", edge_repr)])
            aggregated = graph.recv(messages, "sum")
            projected = fluid.layers.fc(
                aggregated,
                size=self.emb_dim,
                bias_attr=False,
                param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
            scaled = projected * norm
            layer_bias = fluid.layers.create_parameter(
                shape=[self.emb_dim],
                dtype='float32',
                is_bias=True,
                name=self.name + '_bias_%s' % layer)
            node_repr = fluid.layers.elementwise_add(
                scaled, layer_bias, act="relu")

        return fluid.layers.fc(
            node_repr,
            self.num_task,
            act=None,
            name=self.name + "_pred_output")
def main():
    """Train the node-property GNN and report train/valid/test metrics.

    Command-line flags: ``--epochs`` (default 100) and ``--dataset``
    (default "ogbn-proteins"). Graph data and training labels are stored
    inside the program, so every ``exe.run`` call uses an empty feed. Each
    epoch runs one training step and then evaluates all three splits with
    the OGB evaluator.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument(
        '--epochs',
        type=int,
        default=100,
        help='number of epochs to train (default: 100)')
    parser.add_argument(
        '--dataset',
        type=str,
        default="ogbn-proteins",
        help='dataset name (default: proteinfunc)')
    args = parser.parse_args()

    # Use fluid.CUDAPlace(0) to run on GPU instead.
    place = fluid.CPUPlace()  # Dataset too big to use GPU

    # Automatic data loading and splitting.
    dataset = PglNodePropPredDataset(name=args.dataset)
    splitted_idx = dataset.get_idx_split()

    # Automatic evaluator; takes the dataset name as input.
    evaluator = Evaluator(args.dataset)

    graph_data, label = dataset[0]

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # Degree normalisation: indegree ** -0.5, left at 0 for isolated nodes.
    indegree = graph_data.indegree()
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
    # Every node gets the same id 0 as input to the embedding lookup.
    graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
        "float32")

    model = GNNModel(
        name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)

    with fluid.program_guard(train_program, startup_program):
        gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
        pred = model.forward(gw)
        sigmoid_pred = fluid.layers.sigmoid(pred)

    # Clone before the loss/optimizer ops are appended so that the
    # evaluation program contains the forward pass only.
    val_program = train_program.clone(for_test=True)

    initializer = []
    with fluid.program_guard(train_program, startup_program):
        train_node_index, init = paddle_helper.constant(
            "train_node_index", dtype="int64", value=splitted_idx["train"])
        initializer.append(init)

        train_node_label, init = paddle_helper.constant(
            "train_node_label",
            dtype="float32",
            value=label[splitted_idx["train"]].astype("float32"))
        initializer.append(init)

        train_pred_t = fluid.layers.gather(pred, train_node_index)
        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=train_pred_t, label=train_node_label)
        train_loss_t = fluid.layers.reduce_sum(train_loss_t)

        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(train_loss_t)

    exe = fluid.Executor(place)
    exe.run(startup_program)
    gw.initialize(place)
    for init in initializer:
        init(place)

    # NOTE(review): evaluation is assumed to run once per epoch.
    for epoch in range(1, args.epochs + 1):
        loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
        print("Loss %s" % loss[0])

        print("Evaluating...")
        y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]
        result = {}
        for split in ("train", "valid", "test"):
            node_ids = splitted_idx[split]
            result[split] = evaluator.eval({
                "y_true": label[node_ids],
                "y_pred": y_pred[node_ids]
            })
        print(result)


if __name__ == "__main__":
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PglGraphPropPredDataset
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.graphproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
    """Coerce ``value`` to a NumPy boolean scalar.

    The value is wrapped in a one-element array of dtype "bool" and that
    single element is returned, so the conversion follows NumPy's rules.
    """
    as_array = np.array([value], dtype="bool")
    return as_array[0]
class PglGraphPropPredDataset(object):
    """OGB graph-property-prediction dataset served as PGL graphs.

    Construction validates ``name`` against the OGB master table, downloads
    the raw archive when it is not on disk yet, and loads every graph and
    its label into memory (``self.graphs`` / ``self.labels``).
    """

    def __init__(self, name, root="dataset"):
        """Resolve the dataset metadata and load the data.

        Args:
            name: Original OGB dataset name, e.g. "ogbg-mol-tox21".
            root: Directory under which the dataset folder is stored.

        Raises:
            ValueError: If ``name`` is not a key of the master table.
        """
        self.name = name  # original name, e.g., ogbg-mol-tox21
        # Replace hyphens with underscores, e.g., ogbg_mol_tox21_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"

        self.original_root = root
        self.root = osp.join(root, self.dir_name)

        self.meta_info = make_master_file.df
        if self.name not in self.meta_info:
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)

        # Name of the downloaded folder, e.g., tox21.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.num_tasks = int(self.meta_info[self.name]["num tasks"])
        self.task_type = self.meta_info[self.name]["task type"]

        super(PglGraphPropPredDataset, self).__init__()

        self.pre_process()

    def _download(self):
        """Download the raw archive and move its contents to ``self.root``."""
        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Best-effort removal of a stale copy; a missing folder is fine.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(
            osp.join(self.original_root, self.download_name), self.root)

    def pre_process(self):
        """Download the raw files if needed and load graphs and labels."""
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if os.path.exists(pre_processed_file_path):
            # TODO: Load Preprocessed. Nothing in this class writes that
            # file yet, so this branch leaves graphs/labels unset.
            return

        # Skip the download when the raw files are already on disk, as the
        # link and node dataset loaders do.
        if not osp.exists(osp.join(raw_dir, "edge.csv.gz")):
            self._download()

        add_inverse_edge = to_bool(self.meta_info[self.name][
            "add_inverse_edge"])
        self.graphs = read_csv_graph_pgl(
            raw_dir, add_inverse_edge=add_inverse_edge)
        self.graphs = np.array(self.graphs)
        self.labels = np.array(
            pd.read_csv(
                osp.join(raw_dir, "graph-label.csv.gz"),
                compression="gzip",
                header=None).values)
        # TODO: save the processed graphs so that later runs can reload them.

    def get_idx_split(self):
        """Return the train/valid/test graph indices.

        Returns:
            Dict with keys "train", "valid" and "test"; each value is a 1-D
            int64 array read from the first column of ``<split>.csv.gz``
            in the dataset's split folder.
        """
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        def _read_indices(split):
            frame = pd.read_csv(
                osp.join(path, split + ".csv.gz"),
                compression="gzip",
                header=None)
            return np.array(frame.values.T[0], dtype="int64")

        return {
            split: _read_indices(split)
            for split in ("train", "valid", "test")
        }

    def __getitem__(self, idx):
        """Return ``(graphs[idx], labels[idx])``."""
        return self.graphs[idx], self.labels[idx]

    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
    # Manual smoke test: load one dataset, then print it and a slice of it.
    dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
    split_idx = dataset.get_idx_split()
    print(dataset)
    print(dataset[3:20])
    # Index with split_idx["train"], ["valid"] or ["test"] to inspect a
    # split.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""pgl read_csv_graph for ogb
"""
import pandas as pd
import os.path as osp
import numpy as np
import pgl
from ogb.io.read_graph_raw import read_csv_graph_raw
def read_csv_graph_pgl(raw_dir, add_inverse_edge=False):
    """Read the raw OGB CSV files and convert each graph to a PGL graph.

    Args:
        raw_dir: Directory handed to ``read_csv_graph_raw``.
        add_inverse_edge: Forwarded unchanged to ``read_csv_graph_raw``.

    Returns:
        A list of ``pgl.graph.Graph`` objects, one per raw graph. Edge and
        node features are stored under the key "feat" when they are present.
    """

    def _to_pgl(raw):
        # Pair up the two rows of edge_index into (first, second) tuples.
        edge_pairs = list(zip(raw["edge_index"][0], raw["edge_index"][1]))
        converted = pgl.graph.Graph(
            num_nodes=raw["num_nodes"], edges=edge_pairs)
        if raw["edge_feat"] is not None:
            converted.edge_feat["feat"] = raw["edge_feat"]
        if raw["node_feat"] is not None:
            converted.node_feat["feat"] = raw["node_feat"]
        return converted

    raw_graphs = read_csv_graph_raw(raw_dir, add_inverse_edge)
    return [_to_pgl(raw) for raw in raw_graphs]
if __name__ == "__main__":
    # Manual smoke test against an already-downloaded dataset folder.
    demo_raw_dir = 'dataset/ogbn_proteins_pgl/raw'
    graph_list = read_csv_graph_pgl(demo_raw_dir, add_inverse_edge=True)
    print(graph_list)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LinkPropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.linkproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
    """Return ``value`` converted to a NumPy boolean scalar.

    Conversion goes through a one-element array of dtype "bool".
    """
    boxed = np.array([value], dtype="bool")
    return boxed[0]
class PglLinkPropPredDataset(object):
    """OGB link-property-prediction dataset served as a single PGL graph.

    Construction validates ``name`` against the OGB master table, downloads
    the raw archive when ``raw/edge.csv.gz`` is missing, and loads the graph
    into ``self.graph``.
    """

    def __init__(self, name, root="dataset"):
        """Resolve the dataset metadata and load the graph.

        Args:
            name: Original OGB dataset name, e.g. "ogbl-ppa".
            root: Directory under which the dataset folder is stored.

        Raises:
            ValueError: If ``name`` is not a key of the master table.
        """
        self.name = name  # original name, e.g., ogbl-ppa
        # Replace hyphens with underscores, e.g., ogbl_ppa_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"

        self.original_root = root
        self.root = osp.join(root, self.dir_name)

        self.meta_info = make_master_file.df
        if self.name not in self.meta_info:
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)

        # Name of the downloaded folder, e.g., ppassoc.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.task_type = self.meta_info[self.name]["task type"]

        super(PglLinkPropPredDataset, self).__init__()

        self.pre_process()

    def _download(self):
        """Download the raw archive and move its contents to ``self.root``."""
        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Best-effort removal of a stale copy; a missing folder is fine.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(
            osp.join(self.original_root, self.download_name), self.root)

    def pre_process(self):
        """Download the raw files if needed and load the graph."""
        processed_dir = osp.join(self.root, 'processed')
        # Marker name aligned with the other PGL dataset loaders.
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            # TODO: Reload preprocessed files. Nothing in this class writes
            # that file yet, so this branch leaves ``self.graph`` unset.
            return

        # Download only when the raw edge list is not on disk yet.
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            self._download()

        raw_dir = osp.join(self.root, "raw")
        add_inverse_edge = to_bool(self.meta_info[self.name][
            "add_inverse_edge"])
        self.graph = read_csv_graph_pgl(
            raw_dir, add_inverse_edge=add_inverse_edge)
        # TODO: save the processed graph so that later runs can reload it.

    def get_edge_split(self):
        """Return the train/valid/test edges and their labels.

        Each ``<split>.csv.gz`` file is read without a header; the first two
        columns become the int64 ``"<split>_edge"`` array and the third
        column the ``"<split>_edge_label"`` array. Labels are int64 when
        ``task_type`` is "link prediction" and float32 otherwise.
        """
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        if self.task_type == "link prediction":
            target_type = np.int64
        else:
            target_type = np.float32

        splits = ("train", "valid", "test")
        # Read all files first so a missing file fails before any conversion.
        tables = {
            split: pd.read_csv(
                osp.join(path, split + ".csv.gz"),
                compression="gzip",
                header=None).values
            for split in splits
        }

        result = {}
        for split in splits:
            table = tables[split]
            result[split + "_edge"] = np.array(table[:, :2], dtype="int64")
            result[split + "_edge_label"] = np.array(
                table[:, 2], dtype=target_type)
        return result

    def __getitem__(self, idx):
        """Return the only graph; ``idx`` must be 0."""
        assert idx == 0, "This dataset has only one graph"
        return self.graph[0]

    def __len__(self):
        """Return 1: the dataset holds a single graph."""
        return 1

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
    # Manual smoke test: load the default dataset, then print the graph and
    # the edge split.
    dataset = PglLinkPropPredDataset(name="ogbl-ppa")
    edge_split = dataset.get_edge_split()
    print(dataset[0])
    print(edge_split)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NodePropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.nodeproppred import make_master_file # create master.csv
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
    """Convert ``value`` to a NumPy boolean scalar.

    A one-element "bool" array is built from ``value`` and its only element
    is returned.
    """
    single = np.array([value], dtype="bool")
    return single[0]
class PglNodePropPredDataset(object):
    """OGB node-property-prediction dataset served as a single PGL graph.

    Construction validates ``name`` against the OGB master table, downloads
    the raw archive when ``raw/edge.csv.gz`` is missing, and loads the graph
    (``self.graph``) and the node labels (``self.labels``).
    """

    def __init__(self, name, root="dataset"):
        """Resolve the dataset metadata and load the data.

        Args:
            name: Original OGB dataset name, e.g. "ogbn-proteins".
            root: Directory under which the dataset folder is stored.

        Raises:
            ValueError: If ``name`` is not a key of the master table.
        """
        self.name = name  # original name, e.g., ogbn-proteins
        # Replace hyphens with underscores, e.g., ogbn_proteins_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"

        self.original_root = root
        self.root = osp.join(root, self.dir_name)

        self.meta_info = make_master_file.df
        if self.name not in self.meta_info:
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)

        # Name of the folder contained in the downloaded archive.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.num_tasks = int(self.meta_info[self.name]["num tasks"])
        self.task_type = self.meta_info[self.name]["task type"]

        super(PglNodePropPredDataset, self).__init__()

        self.pre_process()

    def _download(self):
        """Download the raw archive and move its contents to ``self.root``."""
        url = self.meta_info[self.name]["url"]
        if not decide_download(url):
            print("Stop download.")
            exit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Best-effort removal of a stale copy; a missing folder is fine.
        shutil.rmtree(self.root, ignore_errors=True)
        shutil.move(
            osp.join(self.original_root, self.download_name), self.root)

    def pre_process(self):
        """Download the raw files if needed and load graph and labels."""
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            # TODO: Reload preprocessed files. Nothing in this class writes
            # that file yet, so this branch leaves graph/labels unset.
            return

        # Download only when the raw edge list is not on disk yet.
        if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
            self._download()

        raw_dir = osp.join(self.root, "raw")
        add_inverse_edge = to_bool(self.meta_info[self.name][
            "add_inverse_edge"])
        self.graph = read_csv_graph_pgl(
            raw_dir, add_inverse_edge=add_inverse_edge)

        # Prediction targets: int64 for classification tasks, else float32.
        node_label = pd.read_csv(
            osp.join(raw_dir, 'node-label.csv.gz'),
            compression="gzip",
            header=None).values
        if "classification" in self.task_type:
            label_dtype = np.int64
        else:
            label_dtype = np.float32
        # TODO: save the processed graph so that later runs can reload it.
        self.labels = np.array(node_label, dtype=label_dtype)

    def get_idx_split(self):
        """Return the train/valid/test node indices.

        Returns:
            Dict with keys "train", "valid" and "test"; each value is a 1-D
            int64 array read from the first column of ``<split>.csv.gz``
            in the dataset's split folder.
        """
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        def _read_indices(split):
            frame = pd.read_csv(
                osp.join(path, split + ".csv.gz"),
                compression="gzip",
                header=None)
            return np.array(frame.values.T[0], dtype="int64")

        return {
            split: _read_indices(split)
            for split in ("train", "valid", "test")
        }

    def __getitem__(self, idx):
        """Return ``(graph, labels)``; ``idx`` must be 0."""
        assert idx == 0, "This dataset has only one graph"
        return self.graph[idx], self.labels

    def __len__(self):
        """Return 1: the dataset holds a single graph."""
        return 1

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
    # Manual smoke test: load the default dataset, then print its single
    # (graph, labels) item and the index split.
    dataset = PglNodePropPredDataset(name="ogbn-proteins")
    split_idx = dataset.get_idx_split()
    print(dataset[0])
    print(split_idx)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册