Commit 59098f85 authored by Webbley

update ogb

Parent a10bb833
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
from pgl.utils import paddle_helper
from ogb.graphproppred import Evaluator
from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder


def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
evaluator, fetch_loss, fetch_pred):
"""Train"""
graphs, labels = dataset[splitted_idx["train"]]
perm = np.arange(0, len(graphs))
np.random.shuffle(perm)
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
batch_label = labels[batch_index]
batch_valid = (batch_label == batch_label).astype("float32")
batch_label = np.nan_to_num(batch_label).astype("float32")
feed_dict = graph_wrapper.to_feed(batch_graph)
feed_dict["label"] = batch_label
feed_dict["weight"] = batch_valid
loss, pred = exe.run(train_program,
feed=feed_dict,
fetch_list=[fetch_loss, fetch_pred])
pred_output[batch_index] = pred
batch_no += 1
print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))


def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
dataset, mode, evaluator, fetch_pred):
"""Eval"""
graphs, labels = dataset[splitted_idx[mode]]
perm = np.arange(0, len(graphs))
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
feed_dict = graph_wrapper.to_feed(batch_graph)
pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
pred_output[batch_index] = pred[0]
batch_no += 1
print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))


def send_func(src_feat, dst_feat, edge_feat):
"""Send"""
return src_feat["h"] + edge_feat["h"]


class GNNModel(object):
"""GNNModel"""
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)
def forward(self, graph):
"""foward"""
h_node = self.atom_encoder(graph.node_feat['feat'])
h_edge = self.bond_encoder(graph.edge_feat['feat'])
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h_node)],
efeat_list=[("h", h_edge)])
h_node = graph.recv(msg, 'sum') + h_node
h_node = fluid.layers.fc(h_node,
size=self.emb_dim,
name=self.name + '_%s' % layer,
act="relu")
graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
return graph_pred


def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbg-mol-tox21",
        help='dataset name (default: ogbg-mol-tox21)')
args = parser.parse_args()
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglGraphPropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[:2]
batch_graph = pgl.graph.MultiGraph(graph_data)
graph_data = batch_graph
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
    # cast node/edge features to int64 for the embedding lookups below
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64")
graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place=place,
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_label = fluid.layers.data(
name="label", dtype="float32", shape=[None, dataset.num_tasks])
train_weight = fluid.layers.data(
name="weight", dtype="float32", shape=[None, dataset.num_tasks])
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=pred, label=train_label) * train_weight
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
for epoch in range(1, args.epochs + 1):
print("Epoch", epoch)
train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator,
train_loss_t, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid",
evaluator, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test",
evaluator, sigmoid_pred)


if __name__ == "__main__":
main()
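A note on the label masking in train() above: OGB molecular property labels may contain NaN for tasks a molecule is not annotated with, and since NaN never compares equal to itself, `batch_label == batch_label` doubles as a validity mask. A minimal NumPy sketch with hypothetical values:

import numpy as np

# Hypothetical 2-molecule, 3-task label matrix with missing entries.
labels = np.array([[1.0, np.nan, 0.0],
                   [0.0, 1.0, np.nan]])

# NaN != NaN, so self-comparison flags valid entries as 1.0 and NaN as 0.0.
valid = (labels == labels).astype("float32")     # [[1. 0. 1.] [1. 1. 0.]]
clean = np.nan_to_num(labels).astype("float32")  # NaNs become 0.0

# Multiplying the per-entry sigmoid cross-entropy by `valid`, as the train
# program does with its "weight" input, silences the missing labels.
print(valid, clean)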
...
@@ -14,10 +14,13 @@
 """test ogb
 """
 import argparse
-import pgl
+import time
+import logging
 import numpy as np
 import paddle.fluid as fluid
+import pgl
 from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
 from pgl.utils import paddle_helper
 from ogb.linkproppred import Evaluator
@@ -44,12 +47,12 @@ class GNNModel(object):
         self.src_nodes = fluid.layers.data(
             name='src_nodes',
-            shape=[None, 1],
+            shape=[None],
             dtype='int64', )
         self.dst_nodes = fluid.layers.data(
             name='dst_nodes',
-            shape=[None, 1],
+            shape=[None],
             dtype='int64', )
         self.edge_label = fluid.layers.data(
@@ -63,7 +66,6 @@ class GNNModel(object):
             shape=[self.num_nodes, self.emb_dim],
             dtype="float32",
             name=self.name + "_embedding")
-        # edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)

         for layer in range(self.num_layers):
             msg = graph.send(
@@ -83,8 +85,8 @@ class GNNModel(object):
                 name=self.name + '_bias_%s' % layer)
             h = fluid.layers.elementwise_add(h, bias, act="relu")

-        src = fluid.layers.gather(h, self.src_nodes)
-        dst = fluid.layers.gather(h, self.dst_nodes)
+        src = fluid.layers.gather(h, self.src_nodes, overwrite=False)
+        dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False)
         edge_embed = src * dst
         pred = fluid.layers.fc(input=edge_embed,
                                size=1,
@@ -107,17 +109,22 @@ def main():
     parser.add_argument(
         '--epochs',
         type=int,
-        default=100,
+        default=4,
         help='number of epochs to train (default: 100)')
     parser.add_argument(
         '--dataset',
         type=str,
         default="ogbl-ppa",
         help='dataset name (default: protein protein associations)')
+    parser.add_argument('--use_cuda', action='store_true')
+    parser.add_argument('--batch_size', type=int, default=5120)
+    parser.add_argument('--embed_dim', type=int, default=64)
+    parser.add_argument('--num_layers', type=int, default=2)
+    parser.add_argument('--lr', type=float, default=0.001)
     args = parser.parse_args()
+    print(args)

-    #place = fluid.CUDAPlace(0)
-    place = fluid.CPUPlace()  # Dataset too big to use GPU
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

     ### automatic dataloading and splitting
     print("loadding dataset")
@@ -135,19 +142,20 @@ def main():
     train_program = fluid.Program()
     startup_program = fluid.Program()
-    test_program = fluid.Program()

     # degree normalize
     indegree = graph_data.indegree()
     norm = np.zeros_like(indegree, dtype="float32")
     norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
     graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
-    # graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1)

     with fluid.program_guard(train_program, startup_program):
         model = GNNModel(
             name="gnn",
             num_nodes=graph_data.num_nodes,
-            emb_dim=64,
-            num_layers=2)
+            emb_dim=args.embed_dim,
+            num_layers=args.num_layers)

         gw = pgl.graph_wrapper.GraphWrapper(
             "graph",
             place,
@@ -158,28 +166,57 @@ def main():
     val_program = train_program.clone(for_test=True)

     with fluid.program_guard(train_program, startup_program):
+        global_steps = int(splitted_edge['train_edge'].shape[0] /
+                           args.batch_size * 2)
+        learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps,
+                                                      0.00005)
         adam = fluid.optimizer.Adam(
-            learning_rate=1e-2,
+            learning_rate=learning_rate,
             regularization=fluid.regularizer.L2DecayRegularizer(
                 regularization_coeff=0.0005))
         adam.minimize(loss)

     exe = fluid.Executor(place)
     exe.run(startup_program)
     feed = gw.to_feed(graph_data)
+
+    print("evaluate result before training: ")
+    result = test(exe, val_program, prob, evaluator, feed, splitted_edge)
+    print(result)
+
+    print("training")
+    cc = 0
     for epoch in range(1, args.epochs + 1):
-        feed['src_nodes'] = splitted_edge["train_edge"][:, 0].reshape(-1, 1)
-        feed['dst_nodes'] = splitted_edge["train_edge"][:, 1].reshape(-1, 1)
-        feed['edge_label'] = splitted_edge["train_edge_label"].astype(
-            "float32").reshape(-1, 1)
-        res_loss, y_pred = exe.run(train_program,
-                                   feed=feed,
-                                   fetch_list=[loss, prob])
-        print("Loss %s" % res_loss[0])
+        for batch_data, batch_label in data_generator(
+                graph_data,
+                splitted_edge["train_edge"],
+                splitted_edge["train_edge_label"],
+                batch_size=args.batch_size):
+            feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1)
+            feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1)
+            feed['edge_label'] = batch_label.astype("float32")
+            res_loss, y_pred, b_lr = exe.run(
+                train_program,
+                feed=feed,
+                fetch_list=[loss, prob, learning_rate])
+            if cc % 1 == 0:
+                print("epoch %d | step %d | lr %s | Loss %s" %
+                      (epoch, cc, b_lr[0], res_loss[0]))
+            cc += 1
+            if cc % 20 == 0:
+                print("Evaluating...")
+                result = test(exe, val_program, prob, evaluator, feed,
+                              splitted_edge)
+                print("epoch %d | step %d" % (epoch, cc))
+                print(result)

-    result = {}
-    print("Evaluating...")
+
+def test(exe, val_program, prob, evaluator, feed, splitted_edge):
+    """Evaluation"""
+    result = {}
     feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
     feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
     feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
@@ -201,7 +238,34 @@ def main():
         "y_pred": y_pred.reshape(-1, ),
     }
     result["test"] = evaluator.eval(input_dict)
-    print(result)
+    return result
+
+
+def data_generator(graph, data, label_data, batch_size, shuffle=True):
+    """Data Generator"""
+    perm = np.arange(0, len(data))
+    if shuffle:
+        np.random.shuffle(perm)
+    offset = 0
+    while offset < len(perm):
+        batch_index = perm[offset:(offset + batch_size)]
+        offset += batch_size
+        pos_data = data[batch_index]
+        pos_label = label_data[batch_index]
+        neg_src_node = pos_data[:, 0]
+        neg_dst_node = np.random.choice(
+            pos_data.reshape(-1, ), size=len(neg_src_node))
+        neg_data = np.hstack(
+            [neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)])
+        exists = graph.has_edges_between(neg_src_node, neg_dst_node)
+        neg_data = neg_data[np.invert(exists)]
+        neg_label = np.zeros(shape=len(neg_data), dtype=np.int64)
+        batch_data = np.vstack([pos_data, neg_data])
+        label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)])
+        yield batch_data, label
+

 if __name__ == "__main__":
...
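The new data_generator pairs every batch of positive edges with randomly corrupted negatives and drops negatives that happen to be real edges; in the script that check is delegated to graph.has_edges_between. A self-contained NumPy sketch of the same idea, with a hypothetical edge_set standing in for the graph lookup:

import numpy as np

# Hypothetical positive edges (src, dst) and a set of true edges for lookup.
pos_data = np.array([[0, 1], [2, 3], [4, 5]], dtype=np.int64)
edge_set = {tuple(e) for e in pos_data}

# Corrupt destinations by sampling endpoints seen in this batch.
neg_src = pos_data[:, 0]
neg_dst = np.random.choice(pos_data.reshape(-1), size=len(neg_src))
neg_data = np.stack([neg_src, neg_dst], axis=1)

# Filter out "negatives" that are actually true edges.
exists = np.array([tuple(e) in edge_set for e in neg_data])
neg_data = neg_data[~exists]

batch_data = np.vstack([pos_data, neg_data])
label = np.concatenate([np.ones(len(pos_data)), np.zeros(len(neg_data))])

One consequence of the filtering is that batches shrink when collisions occur, so the positive/negative ratio fluctuates slightly from step to step.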
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MolEncoder for ogb
"""
import paddle.fluid as fluid
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims


class AtomEncoder(object):
"""AtomEncoder for encoding node features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
atom_feature = get_atom_feature_dims()
atom_input = fluid.layers.split(
x, num_or_sections=len(atom_feature), dim=-1)
outputs = None
count = 0
for _x, _atom_input_dim in zip(atom_input, atom_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_atom_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_atom_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs


class BondEncoder(object):
    """BondEncoder for encoding edge features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
bond_feature = get_bond_feature_dims()
bond_input = fluid.layers.split(
x, num_or_sections=len(bond_feature), dim=-1)
outputs = None
count = 0
for _x, _bond_input_dim in zip(bond_input, bond_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_bond_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_bond_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
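Both encoders split the integer feature matrix column-wise, look each column up in its own embedding table, and sum the results; summing per-field embeddings is equivalent to one linear layer applied to the concatenation of the fields' one-hot encodings. A NumPy sketch of the lookup-and-sum with hypothetical field sizes:

import numpy as np

emb_dim = 4
field_dims = [5, 3]  # hypothetical vocabulary size per feature column
tables = [np.random.randn(d, emb_dim) for d in field_dims]

# Two atoms, one integer id per field (mirrors the split along dim=-1).
x = np.array([[2, 0],
              [4, 2]])

# Look each column up in its own table and sum, as AtomEncoder does.
h = sum(table[x[:, i]] for i, table in enumerate(tables))
print(h.shape)  # (2, 4)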
...
@@ -60,7 +60,7 @@ class PglLinkPropPredDataset(object):
         """pre_process downlaoding data
         """
         processed_dir = osp.join(self.root, 'processed')
-        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')
+        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

         if osp.exists(pre_processed_file_path):
             #TODO: Reload Preprocess files
...
...
@@ -18,7 +18,10 @@ from pgl.layers import conv
 from pgl.layers.conv import *
 from pgl.layers import set2set
 from pgl.layers.set2set import *
+from pgl.layers import graph_pool
+from pgl.layers.graph_pool import *

 __all__ = []
 __all__ += conv.__all__
 __all__ += set2set.__all__
+__all__ += graph_pool.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This package implements common layers to help building
graph neural networks.
"""
import paddle.fluid as fluid
from pgl import graph_wrapper
from pgl.utils import paddle_helper
from pgl.utils import op
__all__ = ['graph_pooling']


def graph_pooling(gw, node_feat, pool_type):
    """Implementation of graph pooling.

    Pools node features into one vector per graph in the batch.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        node_feat: A tensor with shape (num_nodes, feature_size).

        pool_type: The type of pooling ("sum", "average", "min").

    Return:
        A tensor with shape (num_graph, feature_size).
    """
graph_feat = op.nested_lod_reset(node_feat, gw.graph_lod)
graph_feat = fluid.layers.sequence_pool(graph_feat, pool_type)
return graph_feat
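Here gw.graph_lod holds the cumulative node offsets of the graphs in a batch, so resetting the LoD and applying sequence_pool reduces each contiguous node segment to one row. The same computation in plain NumPy, a sketch assuming a hypothetical offsets array:

import numpy as np

# Hypothetical batch: graph 0 owns nodes 0-2, graph 1 owns nodes 3-4.
graph_lod = np.array([0, 3, 5])
node_feat = np.arange(10, dtype="float32").reshape(5, 2)

# "average" pooling: mean over each graph's node segment.
pooled = np.stack([
    node_feat[start:end].mean(axis=0)
    for start, end in zip(graph_lod[:-1], graph_lod[1:])
])
print(pooled.shape)  # (num_graphs, feature_size) == (2, 2)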