提交 59098f85 编写于 作者: W Webbley

update ogb

上级 a10bb833
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
from pgl.utils import paddle_helper
from ogb.graphproppred import Evaluator
from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
evaluator, fetch_loss, fetch_pred):
"""Train"""
graphs, labels = dataset[splitted_idx["train"]]
perm = np.arange(0, len(graphs))
np.random.shuffle(perm)
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
batch_label = labels[batch_index]
batch_valid = (batch_label == batch_label).astype("float32")
batch_label = np.nan_to_num(batch_label).astype("float32")
feed_dict = graph_wrapper.to_feed(batch_graph)
feed_dict["label"] = batch_label
feed_dict["weight"] = batch_valid
loss, pred = exe.run(train_program,
feed=feed_dict,
fetch_list=[fetch_loss, fetch_pred])
pred_output[batch_index] = pred
batch_no += 1
print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
dataset, mode, evaluator, fetch_pred):
"""Eval"""
graphs, labels = dataset[splitted_idx[mode]]
perm = np.arange(0, len(graphs))
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
feed_dict = graph_wrapper.to_feed(batch_graph)
pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
pred_output[batch_index] = pred[0]
batch_no += 1
print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def send_func(src_feat, dst_feat, edge_feat):
"""Send"""
return src_feat["h"] + edge_feat["h"]
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)
def forward(self, graph):
"""foward"""
h_node = self.atom_encoder(graph.node_feat['feat'])
h_edge = self.bond_encoder(graph.edge_feat['feat'])
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h_node)],
efeat_list=[("h", h_edge)])
h_node = graph.recv(msg, 'sum') + h_node
h_node = fluid.layers.fc(h_node,
size=self.emb_dim,
name=self.name + '_%s' % layer,
act="relu")
graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
return graph_pred
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbg-mol-tox21",
help='dataset name (default: proteinfunc)')
args = parser.parse_args()
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglGraphPropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[:2]
batch_graph = pgl.graph.MultiGraph(graph_data)
graph_data = batch_graph
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64")
graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place=place,
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_label = fluid.layers.data(
name="label", dtype="float32", shape=[None, dataset.num_tasks])
train_weight = fluid.layers.data(
name="weight", dtype="float32", shape=[None, dataset.num_tasks])
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=pred, label=train_label) * train_weight
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
for epoch in range(1, args.epochs + 1):
print("Epoch", epoch)
train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator,
train_loss_t, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid",
evaluator, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test",
evaluator, sigmoid_pred)
if __name__ == "__main__":
main()
......@@ -14,10 +14,13 @@
"""test ogb
"""
import argparse
import pgl
import time
import logging
import numpy as np
import paddle.fluid as fluid
import pgl
from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
from pgl.utils import paddle_helper
from ogb.linkproppred import Evaluator
......@@ -44,12 +47,12 @@ class GNNModel(object):
self.src_nodes = fluid.layers.data(
name='src_nodes',
shape=[None, 1],
shape=[None],
dtype='int64', )
self.dst_nodes = fluid.layers.data(
name='dst_nodes',
shape=[None, 1],
shape=[None],
dtype='int64', )
self.edge_label = fluid.layers.data(
......@@ -63,7 +66,6 @@ class GNNModel(object):
shape=[self.num_nodes, self.emb_dim],
dtype="float32",
name=self.name + "_embedding")
# edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
for layer in range(self.num_layers):
msg = graph.send(
......@@ -83,8 +85,8 @@ class GNNModel(object):
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
src = fluid.layers.gather(h, self.src_nodes)
dst = fluid.layers.gather(h, self.dst_nodes)
src = fluid.layers.gather(h, self.src_nodes, overwrite=False)
dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False)
edge_embed = src * dst
pred = fluid.layers.fc(input=edge_embed,
size=1,
......@@ -107,17 +109,22 @@ def main():
parser.add_argument(
'--epochs',
type=int,
default=100,
default=4,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbl-ppa",
help='dataset name (default: protein protein associations)')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--batch_size', type=int, default=5120)
parser.add_argument('--embed_dim', type=int, default=64)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--lr', type=float, default=0.001)
args = parser.parse_args()
print(args)
#place = fluid.CUDAPlace(0)
place = fluid.CPUPlace() # Dataset too big to use GPU
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
### automatic dataloading and splitting
print("loadding dataset")
......@@ -135,19 +142,20 @@ def main():
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
# graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1)
with fluid.program_guard(train_program, startup_program):
model = GNNModel(
name="gnn",
num_nodes=graph_data.num_nodes,
emb_dim=64,
num_layers=2)
emb_dim=args.embed_dim,
num_layers=args.num_layers)
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place,
......@@ -158,50 +166,106 @@ def main():
val_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
global_steps = int(splitted_edge['train_edge'].shape[0] /
args.batch_size * 2)
learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps,
0.00005)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
learning_rate=learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(loss)
exe = fluid.Executor(place)
exe.run(startup_program)
feed = gw.to_feed(graph_data)
print("evaluate result before training: ")
result = test(exe, val_program, prob, evaluator, feed, splitted_edge)
print(result)
print("training")
cc = 0
for epoch in range(1, args.epochs + 1):
feed['src_nodes'] = splitted_edge["train_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["train_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["train_edge_label"].astype(
"float32").reshape(-1, 1)
res_loss, y_pred = exe.run(train_program,
feed=feed,
fetch_list=[loss, prob])
print("Loss %s" % res_loss[0])
result = {}
print("Evaluating...")
feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["valid_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["valid"] = evaluator.eval(input_dict)
feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["test_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["test_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["test"] = evaluator.eval(input_dict)
print(result)
for batch_data, batch_label in data_generator(
graph_data,
splitted_edge["train_edge"],
splitted_edge["train_edge_label"],
batch_size=args.batch_size):
feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1)
feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1)
feed['edge_label'] = batch_label.astype("float32")
res_loss, y_pred, b_lr = exe.run(
train_program,
feed=feed,
fetch_list=[loss, prob, learning_rate])
if cc % 1 == 0:
print("epoch %d | step %d | lr %s | Loss %s" %
(epoch, cc, b_lr[0], res_loss[0]))
cc += 1
if cc % 20 == 0:
print("Evaluating...")
result = test(exe, val_program, prob, evaluator, feed,
splitted_edge)
print("epoch %d | step %d" % (epoch, cc))
print(result)
def test(exe, val_program, prob, evaluator, feed, splitted_edge):
"""Evaluation"""
result = {}
feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["valid_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["valid"] = evaluator.eval(input_dict)
feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["test_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["test_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["test"] = evaluator.eval(input_dict)
return result
def data_generator(graph, data, label_data, batch_size, shuffle=True):
"""Data Generator"""
perm = np.arange(0, len(data))
if shuffle:
np.random.shuffle(perm)
offset = 0
while offset < len(perm):
batch_index = perm[offset:(offset + batch_size)]
offset += batch_size
pos_data = data[batch_index]
pos_label = label_data[batch_index]
neg_src_node = pos_data[:, 0]
neg_dst_node = np.random.choice(
pos_data.reshape(-1, ), size=len(neg_src_node))
neg_data = np.hstack(
[neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)])
exists = graph.has_edges_between(neg_src_node, neg_dst_node)
neg_data = neg_data[np.invert(exists)]
neg_label = np.zeros(shape=len(neg_data), dtype=np.int64)
batch_data = np.vstack([pos_data, neg_data])
label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)])
yield batch_data, label
if __name__ == "__main__":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MolEncoder for ogb
"""
import paddle.fluid as fluid
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims
class AtomEncoder(object):
"""AtomEncoder for encoding node features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
atom_feature = get_atom_feature_dims()
atom_input = fluid.layers.split(
x, num_or_sections=len(atom_feature), dim=-1)
outputs = None
count = 0
for _x, _atom_input_dim in zip(atom_input, atom_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_atom_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_atom_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
class BondEncoder(object):
"""Bond for encoding edge features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
bond_feature = get_bond_feature_dims()
bond_input = fluid.layers.split(
x, num_or_sections=len(bond_feature), dim=-1)
outputs = None
count = 0
for _x, _bond_input_dim in zip(bond_input, bond_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_bond_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_bond_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
......@@ -60,7 +60,7 @@ class PglLinkPropPredDataset(object):
"""pre_process downlaoding data
"""
processed_dir = osp.join(self.root, 'processed')
pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if osp.exists(pre_processed_file_path):
#TODO: Reload Preprocess files
......
......@@ -18,7 +18,10 @@ from pgl.layers import conv
from pgl.layers.conv import *
from pgl.layers import set2set
from pgl.layers.set2set import *
from pgl.layers import graph_pool
from pgl.layers.graph_pool import *
__all__ = []
__all__ += conv.__all__
__all__ += set2set.__all__
__all__ += graph_pool.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This package implements common layers to help building
graph neural networks.
"""
import paddle.fluid as fluid
from pgl import graph_wrapper
from pgl.utils import paddle_helper
from pgl.utils import op
__all__ = ['graph_pooling']
def graph_pooling(gw, node_feat, pool_type):
"""Implementation of graph pooling
This is an implementation of graph pooling
Args:
gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)
node_feat: A tensor with shape (num_nodes, feature_size).
pool_type: The type of pooling ("sum", "average" , "min")
Return:
A tensor with shape (num_graph, hidden_size)
"""
graph_feat = op.nested_lod_reset(node_feat, gw.graph_lod)
graph_feat = fluid.layers.sequence_pool(graph_feat, pool_type)
return graph_feat
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册