From b0a20434a00ff962338041dcde4088c5fcee0140 Mon Sep 17 00:00:00 2001 From: Webbley Date: Wed, 22 Apr 2020 22:23:28 +0800 Subject: [PATCH] add ogbg ppi --- ogb_examples/graphproppred/README.md | 22 +++ ogb_examples/graphproppred/main_pgl.py | 12 +- ogb_examples/graphproppred/ogbg_ppi.py | 216 +++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 2 deletions(-) create mode 100644 ogb_examples/graphproppred/README.md create mode 100644 ogb_examples/graphproppred/ogbg_ppi.py diff --git a/ogb_examples/graphproppred/README.md b/ogb_examples/graphproppred/README.md new file mode 100644 index 0000000..714c99f --- /dev/null +++ b/ogb_examples/graphproppred/README.md @@ -0,0 +1,22 @@ +# Graph Property Prediction for Open Graph Benchmark (OGB) + +[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we complete the Graph Property Prediction task based on PGL. + +### Requirements + +- paddlepaddle 1.7.1 +- pgl 1.0.1 +- ogb + +NOTE: To install the ogb version that is compatible with this project, run the commands below (run `cd ogb` after cloning so that the checkout and the install happen inside the repository): +``` +git clone https://github.com/snap-stanford/ogb.git +git checkout 482c40bc9f31fe25f9df5aa11c8fb657bd2b1621 +python setup.py install +``` + +### How to run +For example, use a GPU to train the model on the ogbg-molhiv dataset: 
+``` +python main_pgl.py --use_cuda --dataset ogbg-molhiv +``` diff --git a/ogb_examples/graphproppred/main_pgl.py b/ogb_examples/graphproppred/main_pgl.py index ef7c112..5b3c588 100644 --- a/ogb_examples/graphproppred/main_pgl.py +++ b/ogb_examples/graphproppred/main_pgl.py @@ -14,7 +14,11 @@ """test ogb """ import argparse +import ssl +ssl._create_default_https_context = ssl._create_unverified_context +# SSL +import torch import pgl import numpy as np import paddle.fluid as fluid @@ -46,6 +50,9 @@ def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset, loss, pred = exe.run(train_program, feed=feed_dict, fetch_list=[fetch_loss, fetch_pred]) + + if batch_no % 100 == 0: + print("loss: %s" % loss[0]) pred_output[batch_index] = pred batch_no += 1 print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output})) @@ -110,6 +117,7 @@ def main(): """ # Training settings parser = argparse.ArgumentParser(description='Graph Dataset') + parser.add_argument('--use_cuda', action='store_true') parser.add_argument( '--epochs', type=int, @@ -118,11 +126,11 @@ def main(): parser.add_argument( '--dataset', type=str, - default="ogbg-mol-tox21", + default="ogbg-molhiv", help='dataset name (default: proteinfunc)') args = parser.parse_args() - place = fluid.CPUPlace() # Dataset too big to use GPU + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() ### automatic dataloading and splitting dataset = PglGraphPropPredDataset(name=args.dataset) diff --git a/ogb_examples/graphproppred/ogbg_ppi.py b/ogb_examples/graphproppred/ogbg_ppi.py new file mode 100644 index 0000000..e9ff287 --- /dev/null +++ b/ogb_examples/graphproppred/ogbg_ppi.py @@ -0,0 +1,216 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Train a small message-passing GNN with PGL on an OGB graph property
prediction dataset and report train/valid/test metrics every epoch.
"""
import argparse
import ssl

# SECURITY NOTE: this turns off TLS certificate verification for every HTTPS
# request made through the default context in this process. It is kept
# because the script has always run this way, but it exposes downloads to
# man-in-the-middle tampering; prefer fixing the local CA bundle and
# removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# NOTE(review): torch is not referenced anywhere below; presumably it is
# imported up front because ogb depends on it -- confirm before removing.
import torch
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
from pgl.utils import paddle_helper
from ogb.graphproppred import Evaluator
from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder


def _batch_indices(order, batch_size):
    """Yield consecutive slices of ``order`` holding at most ``batch_size`` items.

    Args:
        order: 1-D index array to split.
        batch_size: Maximum number of indices per slice; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive (the previous
            hand-written ``while`` loop never terminated in that case).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %s" % batch_size)
    for start in range(0, len(order), batch_size):
        yield order[start:start + batch_size]


def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
          evaluator, fetch_loss, fetch_pred):
    """Run one shuffled pass over the training split.

    The loss is printed every 100 batches and the evaluator result for the
    whole split is printed once at the end.

    Args:
        exe: Executor whose ``run(program, feed=..., fetch_list=...)`` is
            called once per batch.
        batch_size: Number of graphs per batch.
        graph_wrapper: Object whose ``to_feed(graph)`` builds the feed dict.
        train_program: Program passed to ``exe.run``.
        splitted_idx: Mapping that holds the training indices under "train".
        dataset: Indexable by an index collection; returns ``(graphs, labels)``.
        evaluator: Object whose ``eval`` receives ``y_true`` / ``y_pred``.
        fetch_loss: Loss variable to fetch.
        fetch_pred: Prediction variable to fetch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    graphs, labels = dataset[splitted_idx["train"]]
    order = np.random.permutation(len(graphs))
    pred_output = np.zeros_like(labels, dtype="float32")
    for batch_no, batch_index in enumerate(_batch_indices(order, batch_size)):
        batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
        batch_label = labels[batch_index]
        # NaN never equals itself, so this is 1.0 where a label is present
        # and 0.0 where it is missing; it is fed as the per-entry loss weight.
        batch_valid = (batch_label == batch_label).astype("float32")
        batch_label = np.nan_to_num(batch_label).astype("float32")

        feed_dict = graph_wrapper.to_feed(batch_graph)
        feed_dict["label"] = batch_label
        feed_dict["weight"] = batch_valid
        loss, pred = exe.run(train_program,
                             feed=feed_dict,
                             fetch_list=[fetch_loss, fetch_pred])

        if batch_no % 100 == 0:
            print("loss: %s" % loss[0])
        pred_output[batch_index] = pred
    print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))


def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
             dataset, mode, evaluator, fetch_pred):
    """Predict on one split in its stored order and print the evaluator result.

    Args:
        exe: Executor whose ``run(program, feed=..., fetch_list=...)`` is
            called once per batch.
        batch_size: Number of graphs per batch.
        graph_wrapper: Object whose ``to_feed(graph)`` builds the feed dict.
        val_program: Program passed to ``exe.run``.
        splitted_idx: Mapping from split name to indices.
        dataset: Indexable by an index collection; returns ``(graphs, labels)``.
        mode: Split name used both as the key into ``splitted_idx`` and as the
            prefix of the printed line.
        evaluator: Object whose ``eval`` receives ``y_true`` / ``y_pred``.
        fetch_pred: Prediction variable to fetch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    graphs, labels = dataset[splitted_idx[mode]]
    order = np.arange(len(graphs))
    pred_output = np.zeros_like(labels, dtype="float32")
    for batch_index in _batch_indices(order, batch_size):
        batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
        feed_dict = graph_wrapper.to_feed(batch_graph)
        pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
        pred_output[batch_index] = pred[0]
    print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))


def send_func(src_feat, dst_feat, edge_feat):
    """Build the message for an edge: source-node "h" plus edge "h".

    ``dst_feat`` is accepted because ``graph.send`` is given this function as
    its callback, but it is not used.
    """
    return src_feat["h"] + edge_feat["h"]


class GNNModel(object):
    """Sum-aggregation GNN with average graph pooling and a linear head."""

    def __init__(self, name, emb_dim, num_task, num_layers):
        """Store the hyper-parameters.

        Args:
            name: Prefix for the names of the per-layer ``fc`` layers.
            emb_dim: Width of node/edge hidden representations.
            num_task: Output size of the final layer.
            num_layers: Number of message-passing layers.
        """
        self.num_task = num_task
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.name = name
        # These encoders are constructed but ``forward`` does not call them;
        # they are left in place as attributes of the model.
        self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
        self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)

    def edges_encoder(self, inputs, name):
        """Project edge features to ``emb_dim`` with an ``fc`` layer whose
        parameter is named ``name``."""
        outputs = fluid.layers.fc(inputs,
                                  size=self.emb_dim,
                                  param_attr=fluid.ParamAttr(name=name))
        return outputs

    def forward(self, graph):
        """Build the forward pass on ``graph`` and return the un-activated
        prediction of width ``num_task``.

        Args:
            graph: Graph wrapper exposing ``node_feat['feat']``,
                ``edge_feat['feat']``, ``send`` and ``recv``.
        """
        # The embedding table has a single row (size=[1, emb_dim]), so every
        # node id must be 0 and all nodes start from one shared vector.
        h_node = fluid.layers.embedding(
            graph.node_feat['feat'], size=[1, self.emb_dim])
        h_edge = self.edges_encoder(
            graph.edge_feat['feat'], name="edges_encoder")

        for layer in range(self.num_layers):
            msg = graph.send(
                send_func,
                nfeat_list=[("h", h_node)],
                efeat_list=[("h", h_edge)])
            # Summed incoming messages plus the node's own representation.
            h_node = graph.recv(msg, 'sum') + h_node
            h_node = fluid.layers.fc(h_node,
                                     size=self.emb_dim,
                                     name=self.name + '_%s' % layer,
                                     act="relu")

        graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
        graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
        return graph_pred


def main():
    """Parse the command line, build the programs and run the training loop."""
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument(
        '--use_cuda',
        action='store_true',
        help='run on CUDA device 0 instead of the CPU')
    parser.add_argument(
        '--epochs',
        type=int,
        default=100,
        help='number of epochs to train (default: 100)')
    parser.add_argument(
        '--dataset',
        type=str,
        default="ogbg-molhiv",
        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument(
        '--batch_size',
        type=int,
        default=128,
        help='number of graphs per batch (default: 128)')
    args = parser.parse_args()

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

    ### automatic dataloading and splitting
    dataset = PglGraphPropPredDataset(name=args.dataset)

    # Graphs without node features get a constant id 0 per node so that the
    # embedding lookup in GNNModel.forward has an input.
    # NOTE(review): this relies on dataset[i] handing back the stored graph
    # object rather than a copy -- confirm against PglGraphPropPredDataset.
    for i in range(len(dataset)):
        g, _ = dataset[i]
        if not g.node_feat:
            g.node_feat['feat'] = np.zeros((g.num_nodes, 1), dtype=int)

    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    # A two-graph sample batch; only its feature info is used, to declare the
    # inputs of the GraphWrapper below.
    sample_graphs, _ = dataset[:2]
    graph_data = pgl.graph.MultiGraph(sample_graphs)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # Cast the sample features to the dtypes the model consumes: float32 edge
    # features for the fc encoder, int64 node ids for the embedding lookup.
    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
        "float32")
    graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")

    model = GNNModel(
        name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)

    with fluid.program_guard(train_program, startup_program):
        gw = pgl.graph_wrapper.GraphWrapper(
            "graph",
            place=place,
            node_feat=graph_data.node_feat_info(),
            edge_feat=graph_data.edge_feat_info())
        pred = model.forward(gw)
        sigmoid_pred = fluid.layers.sigmoid(pred)

    # Cloned before the loss and optimizer are appended to train_program, so
    # the evaluation program holds the forward pass only.
    val_program = train_program.clone(for_test=True)

    with fluid.program_guard(train_program, startup_program):
        train_label = fluid.layers.data(
            name="label", dtype="float32", shape=[None, dataset.num_tasks])
        train_weight = fluid.layers.data(
            name="weight", dtype="float32", shape=[None, dataset.num_tasks])
        # The weight zeroes out the loss of entries whose label was missing.
        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=pred, label=train_label) * train_weight
        train_loss_t = fluid.layers.reduce_sum(train_loss_t)

        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(train_loss_t)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    for epoch in range(1, args.epochs + 1):
        print("Epoch", epoch)
        train(exe, args.batch_size, gw, train_program, splitted_idx, dataset,
              evaluator, train_loss_t, sigmoid_pred)
        evaluate(exe, args.batch_size, gw, val_program, splitted_idx, dataset,
                 "valid", evaluator, sigmoid_pred)
        evaluate(exe, args.batch_size, gw, val_program, splitted_idx, dataset,
                 "test", evaluator, sigmoid_pred)


if __name__ == "__main__":
    main()