diff --git a/ogb_examples/graphproppred/main_pgl.py b/ogb_examples/graphproppred/main_pgl.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef7c112e5a364db8d26d297d5b4f297e2b6ef7ad
--- /dev/null
+++ b/ogb_examples/graphproppred/main_pgl.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""test ogb
+"""
+import argparse
+
+import pgl
+import numpy as np
+import paddle.fluid as fluid
+from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
+from pgl.utils import paddle_helper
+from ogb.graphproppred import Evaluator
+from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
+
+
+def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
+          evaluator, fetch_loss, fetch_pred):
+    """Train"""
+    graphs, labels = dataset[splitted_idx["train"]]
+    perm = np.arange(0, len(graphs))
+    np.random.shuffle(perm)
+    start_batch = 0
+    batch_no = 0
+    pred_output = np.zeros_like(labels, dtype="float32")
+    while start_batch < len(perm):
+        batch_index = perm[start_batch:start_batch + batch_size]
+        start_batch += batch_size
+        batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
+        batch_label = labels[batch_index]
+        batch_valid = (batch_label == batch_label).astype("float32")  # NaN mask
+        batch_label = np.nan_to_num(batch_label).astype("float32")
+        feed_dict = graph_wrapper.to_feed(batch_graph)
+        feed_dict["label"] = batch_label
+        feed_dict["weight"] = batch_valid
+        loss, pred = exe.run(train_program,
+                             feed=feed_dict,
+                             fetch_list=[fetch_loss, fetch_pred])
+        pred_output[batch_index] = pred
+        batch_no += 1
+    print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))
+
+
+def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
+             dataset, mode, evaluator, fetch_pred):
+    """Eval"""
+    graphs, labels = dataset[splitted_idx[mode]]
+    perm = np.arange(0, len(graphs))
+    start_batch = 0
+    batch_no = 0
+    pred_output = np.zeros_like(labels, dtype="float32")
+    while start_batch < len(perm):
+        batch_index = perm[start_batch:start_batch + batch_size]
+        start_batch += batch_size
+        batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
+        feed_dict = graph_wrapper.to_feed(batch_graph)
+        pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
+        pred_output[batch_index] = pred[0]
+        batch_no += 1
+    print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))
+
+
+def send_func(src_feat, dst_feat, edge_feat):
+    """Send"""
+    return src_feat["h"] + edge_feat["h"]
+
+
+class GNNModel(object):
+    """GNNModel"""
+
+    def __init__(self, name, emb_dim, num_task, num_layers):
+        self.num_task = num_task
+        self.emb_dim = emb_dim
+        self.num_layers = num_layers
+        self.name = name
+        self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
+        self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)
+
+    def forward(self, graph):
+        """forward"""
+        h_node = self.atom_encoder(graph.node_feat['feat'])
+        h_edge = self.bond_encoder(graph.edge_feat['feat'])
+        for layer in range(self.num_layers):
+            msg = graph.send(
+                send_func,
+                nfeat_list=[("h", h_node)],
+                efeat_list=[("h", h_edge)])
+            h_node = graph.recv(msg, 'sum') + h_node
+            h_node = fluid.layers.fc(h_node,
+                                     size=self.emb_dim,
+                                     name=self.name + '_%s' % layer,
+                                     act="relu")
+        graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
+        graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
+        return graph_pred
+
+
+def main():
+    """main
+    """
+    # Training settings
+    parser = argparse.ArgumentParser(description='Graph Dataset')
+    parser.add_argument(
+        '--epochs',
+        type=int,
+        default=100,
+        help='number of epochs to train (default: 100)')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default="ogbg-mol-tox21",
+        help='dataset name (default: ogbg-mol-tox21)')
+    args = parser.parse_args()
+
+    place = fluid.CPUPlace()  # Dataset too big to use GPU
+
+    ### automatic dataloading and splitting
+    dataset = PglGraphPropPredDataset(name=args.dataset)
+    splitted_idx = dataset.get_idx_split()
+
+    ### automatic evaluator. takes dataset name as input
+    evaluator = Evaluator(args.dataset)
+
+    # use two sample graphs only to infer the node/edge feature specs
+    graph_data, label = dataset[:2]
+    graph_data = pgl.graph.MultiGraph(graph_data)
+
+    train_program = fluid.Program()
+    startup_program = fluid.Program()
+    test_program = fluid.Program()
+    # cast categorical features to int64 for the embedding-based encoders
+    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64")
+    graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")
+
+    model = GNNModel(
+        name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
+
+    with fluid.program_guard(train_program, startup_program):
+        gw = pgl.graph_wrapper.GraphWrapper(
+            "graph",
+            place=place,
+            node_feat=graph_data.node_feat_info(),
+            edge_feat=graph_data.edge_feat_info())
+        pred = model.forward(gw)
+        sigmoid_pred = fluid.layers.sigmoid(pred)
+
+    val_program = train_program.clone(for_test=True)
+
+    initializer = []
+    with fluid.program_guard(train_program, startup_program):
+        train_label = fluid.layers.data(
+            name="label", dtype="float32", shape=[None, dataset.num_tasks])
+        train_weight = fluid.layers.data(
+            name="weight", dtype="float32", shape=[None, dataset.num_tasks])
+        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=pred, label=train_label) * train_weight
+        train_loss_t = fluid.layers.reduce_sum(train_loss_t)
+
+        adam = fluid.optimizer.Adam(
+            learning_rate=1e-2,
+            regularization=fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=0.0005))
+        adam.minimize(train_loss_t)
+
+    exe = fluid.Executor(place)
+    exe.run(startup_program)
+
+    for epoch in range(1, args.epochs + 1):
+        print("Epoch", epoch)
+        train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator,
+              train_loss_t, sigmoid_pred)
+        evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid",
+                 evaluator, sigmoid_pred)
+        evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test",
+                 evaluator, sigmoid_pred)
+
+
+if __name__ == "__main__":
+    main()
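Note on the masking in train() above: OGB's molecular property datasets mark missing task labels with NaN, and the loop exploits the fact that NaN is the only value not equal to itself to build a per-entry validity mask. A minimal, framework-free numpy sketch of that trick (toy labels, not real dataset output):

import numpy as np

labels = np.array([[1.0, np.nan],
                   [0.0, 1.0]])

valid = (labels == labels).astype("float32")     # 1.0 exactly where a label exists
clean = np.nan_to_num(labels).astype("float32")  # NaN -> 0.0, safe to feed

print(valid)  # [[1. 0.] [1. 1.]]
print(clean)  # [[1. 0.] [0. 1.]]

Multiplying the per-task loss by this mask (feed_dict["weight"] above) zeroes out the unlabeled entries before the reduce_sum.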
diff --git a/ogb_examples/linkproppred/main_pgl.py b/ogb_examples/linkproppred/main_pgl.py
index 98930ee609e870d9b9045c5305b06392d8b4feef..0a0634b7a8ac6a86cc6e74b3bf7ce22fb3a0488a 100644
--- a/ogb_examples/linkproppred/main_pgl.py
+++ b/ogb_examples/linkproppred/main_pgl.py
@@ -14,10 +14,13 @@
 """test ogb
 """
 import argparse
-
-import pgl
+import time
+import logging
 import numpy as np
+
 import paddle.fluid as fluid
+
+import pgl
 from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
 from pgl.utils import paddle_helper
 from ogb.linkproppred import Evaluator
@@ -44,12 +47,12 @@ class GNNModel(object):
         self.src_nodes = fluid.layers.data(
             name='src_nodes',
-            shape=[None, 1],
+            shape=[None],
             dtype='int64', )
         self.dst_nodes = fluid.layers.data(
             name='dst_nodes',
-            shape=[None, 1],
+            shape=[None],
             dtype='int64', )
         self.edge_label = fluid.layers.data(
@@ -63,7 +66,6 @@
             shape=[self.num_nodes, self.emb_dim],
             dtype="float32",
             name=self.name + "_embedding")
-        # edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
 
         for layer in range(self.num_layers):
             msg = graph.send(
@@ -83,8 +85,8 @@
                 name=self.name + '_bias_%s' % layer)
             h = fluid.layers.elementwise_add(h, bias, act="relu")
 
-        src = fluid.layers.gather(h, self.src_nodes)
-        dst = fluid.layers.gather(h, self.dst_nodes)
+        src = fluid.layers.gather(h, self.src_nodes, overwrite=False)
+        dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False)
         edge_embed = src * dst
         pred = fluid.layers.fc(input=edge_embed,
                                size=1,
@@ -107,17 +109,22 @@ def main():
     parser.add_argument(
         '--epochs',
         type=int,
-        default=100,
-        help='number of epochs to train (default: 100)')
+        default=4,
+        help='number of epochs to train (default: 4)')
     parser.add_argument(
         '--dataset',
         type=str,
         default="ogbl-ppa",
         help='dataset name (default: protein protein associations)')
+    parser.add_argument('--use_cuda', action='store_true')
+    parser.add_argument('--batch_size', type=int, default=5120)
+    parser.add_argument('--embed_dim', type=int, default=64)
+    parser.add_argument('--num_layers', type=int, default=2)
+    parser.add_argument('--lr', type=float, default=0.001)
     args = parser.parse_args()
+    print(args)
 
-    #place = fluid.CUDAPlace(0)
-    place = fluid.CPUPlace()  # Dataset too big to use GPU
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
 
     ### automatic dataloading and splitting
     print("loadding dataset")
@@ -135,19 +142,19 @@ def main():
     train_program = fluid.Program()
     startup_program = fluid.Program()
-    test_program = fluid.Program()
+
     # degree normalize
     indegree = graph_data.indegree()
     norm = np.zeros_like(indegree, dtype="float32")
     norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
     graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
 
     with fluid.program_guard(train_program, startup_program):
         model = GNNModel(
             name="gnn",
             num_nodes=graph_data.num_nodes,
-            emb_dim=64,
-            num_layers=2)
+            emb_dim=args.embed_dim,
+            num_layers=args.num_layers)
         gw = pgl.graph_wrapper.GraphWrapper(
             "graph",
             place,
@@ -158,50 +166,108 @@ def main():
 
     val_program = train_program.clone(for_test=True)
 
     with fluid.program_guard(train_program, startup_program):
+        global_steps = int(splitted_edge['train_edge'].shape[0] /
+                           args.batch_size * 2)
+        learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps,
+                                                      0.00005)
+
         adam = fluid.optimizer.Adam(
-            learning_rate=1e-2,
+            learning_rate=learning_rate,
             regularization=fluid.regularizer.L2DecayRegularizer(
                 regularization_coeff=0.0005))
         adam.minimize(loss)
 
     exe = fluid.Executor(place)
     exe.run(startup_program)
     feed = gw.to_feed(graph_data)
+
+    print("evaluate result before training: ")
+    result = test(exe, val_program, prob, evaluator, feed, splitted_edge)
+    print(result)
+
+    print("training")
+    cc = 0
     for epoch in range(1, args.epochs + 1):
-        feed['src_nodes'] = splitted_edge["train_edge"][:, 0].reshape(-1, 1)
-        feed['dst_nodes'] = splitted_edge["train_edge"][:, 1].reshape(-1, 1)
-        feed['edge_label'] = splitted_edge["train_edge_label"].astype(
-            "float32").reshape(-1, 1)
-        res_loss, y_pred = exe.run(train_program,
-                                   feed=feed,
-                                   fetch_list=[loss, prob])
-        print("Loss %s" % res_loss[0])
-
-        result = {}
-        print("Evaluating...")
-        feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
-        feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
-        feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
-            "float32").reshape(-1, 1)
-        y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
-        input_dict = {
-            "y_true": splitted_edge["valid_edge_label"],
-            "y_pred": y_pred.reshape(-1, ),
-        }
-        result["valid"] = evaluator.eval(input_dict)
-
-        feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
-        feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
-        feed['edge_label'] = splitted_edge["test_edge_label"].astype(
-            "float32").reshape(-1, 1)
-        y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
-        input_dict = {
-            "y_true": splitted_edge["test_edge_label"],
-            "y_pred": y_pred.reshape(-1, ),
-        }
-        result["test"] = evaluator.eval(input_dict)
-        print(result)
+        for batch_data, batch_label in data_generator(
+                graph_data,
+                splitted_edge["train_edge"],
+                splitted_edge["train_edge_label"],
+                batch_size=args.batch_size):
+            feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1)
+            feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1)
+            feed['edge_label'] = batch_label.astype("float32")
+
+            res_loss, y_pred, b_lr = exe.run(
+                train_program,
+                feed=feed,
+                fetch_list=[loss, prob, learning_rate])
+            if cc % 1 == 0:
+                print("epoch %d | step %d | lr %s | Loss %s" %
+                      (epoch, cc, b_lr[0], res_loss[0]))
+            cc += 1
+
+            if cc % 20 == 0:
+                print("Evaluating...")
+                result = test(exe, val_program, prob, evaluator, feed,
+                              splitted_edge)
+                print("epoch %d | step %d" % (epoch, cc))
+                print(result)
+
+
+def test(exe, val_program, prob, evaluator, feed, splitted_edge):
+    """Evaluation"""
+    result = {}
+    feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
+    feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
+    feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
+        "float32").reshape(-1, 1)
+    y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
+    input_dict = {
+        "y_true": splitted_edge["valid_edge_label"],
+        "y_pred": y_pred.reshape(-1, ),
+    }
+    result["valid"] = evaluator.eval(input_dict)
+
+    feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
+    feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
+    feed['edge_label'] = splitted_edge["test_edge_label"].astype(
+        "float32").reshape(-1, 1)
+    y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
+    input_dict = {
+        "y_true": splitted_edge["test_edge_label"],
+        "y_pred": y_pred.reshape(-1, ),
+    }
+    result["test"] = evaluator.eval(input_dict)
+    return result
+
+
+def data_generator(graph, data, label_data, batch_size, shuffle=True):
+    """Data Generator: yields positive edges plus sampled negative edges"""
+    perm = np.arange(0, len(data))
+    if shuffle:
+        np.random.shuffle(perm)
+
+    offset = 0
+    while offset < len(perm):
+        batch_index = perm[offset:(offset + batch_size)]
+        offset += batch_size
+        pos_data = data[batch_index]
+        pos_label = label_data[batch_index]
+
+        neg_src_node = pos_data[:, 0]
+        neg_dst_node = np.random.choice(
+            pos_data.reshape(-1, ), size=len(neg_src_node))
+        neg_data = np.hstack(
+            [neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)])
+        # drop corrupted pairs that are actually edges in the graph
+        exists = graph.has_edges_between(neg_src_node, neg_dst_node)
+        neg_data = neg_data[np.invert(exists)]
+        neg_label = np.zeros(shape=len(neg_data), dtype=np.int64)
+
+        batch_data = np.vstack([pos_data, neg_data])
+        label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)])
+        yield batch_data, label
 
 
 if __name__ == "__main__":
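The rewritten training loop above replaces full-batch epochs with data_generator, which pairs each positive batch with negatives sampled by rewiring edge destinations to random nodes drawn from the same batch, then filtering out corruptions that are real edges. A standalone numpy sketch of that sampling step, with the graph's has_edges_between stubbed by a toy edge set (all names illustrative):

import numpy as np

def sample_negatives(pos_edges, has_edges_between):
    """Corrupt each positive edge's destination, then drop real edges."""
    neg_src = pos_edges[:, 0]
    neg_dst = np.random.choice(pos_edges.reshape(-1), size=len(neg_src))
    candidates = np.stack([neg_src, neg_dst], axis=1)
    exists = np.asarray(has_edges_between(neg_src, neg_dst), dtype=bool)
    return candidates[~exists]

edge_set = {(0, 1), (2, 3)}  # toy graph: just the positives themselves
toy_exists = lambda src, dst: [(s, d) in edge_set for s, d in zip(src, dst)]
print(sample_negatives(np.array([[0, 1], [2, 3]]), toy_exists))

Because collisions with real edges are dropped rather than resampled, a batch can carry slightly fewer negatives than positives; the vstack in data_generator tolerates that imbalance.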
splitted_edge["train_edge"][:, 1].reshape(-1, 1) - feed['edge_label'] = splitted_edge["train_edge_label"].astype( - "float32").reshape(-1, 1) - res_loss, y_pred = exe.run(train_program, - feed=feed, - fetch_list=[loss, prob]) - print("Loss %s" % res_loss[0]) - - result = {} - print("Evaluating...") - feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1) - feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1) - feed['edge_label'] = splitted_edge["valid_edge_label"].astype( - "float32").reshape(-1, 1) - y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] - input_dict = { - "y_true": splitted_edge["valid_edge_label"], - "y_pred": y_pred.reshape(-1, ), - } - result["valid"] = evaluator.eval(input_dict) - - feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1) - feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1) - feed['edge_label'] = splitted_edge["test_edge_label"].astype( - "float32").reshape(-1, 1) - y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] - input_dict = { - "y_true": splitted_edge["test_edge_label"], - "y_pred": y_pred.reshape(-1, ), - } - result["test"] = evaluator.eval(input_dict) - print(result) + for batch_data, batch_label in data_generator( + graph_data, + splitted_edge["train_edge"], + splitted_edge["train_edge_label"], + batch_size=args.batch_size): + feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1) + feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1) + feed['edge_label'] = batch_label.astype("float32") + + res_loss, y_pred, b_lr = exe.run( + train_program, + feed=feed, + fetch_list=[loss, prob, learning_rate]) + if cc % 1 == 0: + print("epoch %d | step %d | lr %s | Loss %s" % + (epoch, cc, b_lr[0], res_loss[0])) + cc += 1 + + if cc % 20 == 0: + print("Evaluating...") + result = test(exe, val_program, prob, evaluator, feed, + splitted_edge) + print("epoch %d | step %d" % (epoch, cc)) + print(result) + + +def test(exe, val_program, prob, evaluator, feed, splitted_edge): + """Evaluation""" + result = {} + feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1) + feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1) + feed['edge_label'] = splitted_edge["valid_edge_label"].astype( + "float32").reshape(-1, 1) + y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] + input_dict = { + "y_true": splitted_edge["valid_edge_label"], + "y_pred": y_pred.reshape(-1, ), + } + result["valid"] = evaluator.eval(input_dict) + + feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1) + feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1) + feed['edge_label'] = splitted_edge["test_edge_label"].astype( + "float32").reshape(-1, 1) + y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] + input_dict = { + "y_true": splitted_edge["test_edge_label"], + "y_pred": y_pred.reshape(-1, ), + } + result["test"] = evaluator.eval(input_dict) + return result + + +def data_generator(graph, data, label_data, batch_size, shuffle=True): + """Data Generator""" + perm = np.arange(0, len(data)) + if shuffle: + np.random.shuffle(perm) + + offset = 0 + while offset < len(perm): + batch_index = perm[offset:(offset + batch_size)] + offset += batch_size + pos_data = data[batch_index] + pos_label = label_data[batch_index] + + neg_src_node = pos_data[:, 0] + neg_dst_node = np.random.choice( + pos_data.reshape(-1, ), size=len(neg_src_node)) + neg_data = np.hstack( + [neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)]) + exists = 
diff --git a/pgl/contrib/ogb/linkproppred/dataset_pgl.py b/pgl/contrib/ogb/linkproppred/dataset_pgl.py
index 38c416272d3c4961e898d51e396c2c3bc5a9fb24..928f4a4f95d6df045089a5bee7b30e28eac95713 100644
--- a/pgl/contrib/ogb/linkproppred/dataset_pgl.py
+++ b/pgl/contrib/ogb/linkproppred/dataset_pgl.py
@@ -60,7 +60,7 @@ class PglLinkPropPredDataset(object):
         """pre_process downlaoding data
         """
         processed_dir = osp.join(self.root, 'processed')
-        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')
+        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
 
         if osp.exists(pre_processed_file_path):
             #TODO: Reload Preprocess files
diff --git a/pgl/layers/__init__.py b/pgl/layers/__init__.py
index a88c9b69b33335e091d5713400395f7c003361b4..efc27aa5bda6316348c7c65d6d714de70584b1dc 100644
--- a/pgl/layers/__init__.py
+++ b/pgl/layers/__init__.py
@@ -18,7 +18,10 @@ from pgl.layers import conv
 from pgl.layers.conv import *
 from pgl.layers import set2set
 from pgl.layers.set2set import *
+from pgl.layers import graph_pool
+from pgl.layers.graph_pool import *
 
 __all__ = []
 __all__ += conv.__all__
 __all__ += set2set.__all__
+__all__ += graph_pool.__all__
diff --git a/pgl/layers/graph_pool.py b/pgl/layers/graph_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..a88468f7b6ef12131c4554db9339c35472a66f11
--- /dev/null
+++ b/pgl/layers/graph_pool.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This package implements common layers to help building
+graph neural networks.
+"""
+import paddle.fluid as fluid
+from pgl import graph_wrapper
+from pgl.utils import paddle_helper
+from pgl.utils import op
+
+__all__ = ['graph_pooling']
+
+
+def graph_pooling(gw, node_feat, pool_type):
+    """Implementation of graph pooling
+
+    Pools node features into one vector per graph, using the graph LoD.
+
+    Args:
+        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)
+
+        node_feat: A tensor with shape (num_nodes, feature_size).
+
+        pool_type: The type of pooling ("sum", "average", "max")
+
+    Return:
+        A tensor with shape (num_graph, hidden_size)
+    """
+    graph_feat = op.nested_lod_reset(node_feat, gw.graph_lod)
+    graph_feat = fluid.layers.sequence_pool(graph_feat, pool_type)
+    return graph_feat
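For reference, a minimal sketch of how graph_pooling is meant to be wired up, mirroring its call in ogb_examples/graphproppred/main_pgl.py; the feature name and sizes here are illustrative, not fixed by the API:

import paddle.fluid as fluid
import pgl

# The wrapper's graph_lod records where each graph in the batch starts,
# which is what graph_pooling uses to segment the node features.
gw = pgl.graph_wrapper.GraphWrapper(
    "graph",
    place=fluid.CPUPlace(),
    node_feat=[("feat", [None, 16], "float32")])

h = fluid.layers.fc(gw.node_feat["feat"], size=32, act="relu")
graph_repr = pgl.layers.graph_pooling(gw, h, "average")  # (num_graphs, 32)

Pooling with "sum" works the same way; "average" matches what the graph-property example above uses.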