提交 bcbed7f4 编写于 作者: Z Zhong Hui

split code

上级 4fde0157
# Relational Graph Convolutional Neural Network
RGCN shows that the GCN framework can be applied to modeling relational data in knowledge bases. To learn more about the study of RGCN, see [Modeling Relational Data with Graph Convolutional Networks](https://arxiv.org/pdf/1703.06103.pdf) for more details.
### Datasets
In this repo, we use RGCN to deal with the ogbn-mag dataset. ogbn-mag dataset is a heterogeneous network composed of a subset of the Microsoft Academic Graph. In addition, we adopt GraphSAINT sampler in the training phase.
### Dependencies
- paddlepaddle>=1.7
- pgl>=1.1
- ogb>=1.2.0
### How to run
> CUDA_VISIBLE_DEVICES=0 python main.py --use_cuda
### Hyperparameters
- epoch: Number of epochs default (40)
- use_cuda: Use gpu if assign use_cuda.
- sample_workers: The number of workers for multiprocessing subgraph sample.
- lr: Learning rate.
- batch_size: Batch size.
- hidden_size: The hidden size of the RGCN models.
- test_samples: sample num of each layers
- test_batch_size: batch_size in the test phase
### Performance
We evaluate 8 times on the ogbn-mag dataset. Here is the result.
Dataset| Accuracy| std|
--|--|--|
ogbn-mag | 0.4727 | 0.0031 |
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from pgl.utils.mp_mapper import mp_reader_mapper
from sample import graph_saint_hetero, k_hop_sampler
def dataloader(source_node, label, batch_size=1024):
    """Build a re-iterable mini-batch loader over a shuffled sample order.

    Args:
        source_node: 1-D array of node ids.
        label: labels aligned element-wise with ``source_node``.
        batch_size: maximum number of samples per batch (default 1024).

    Returns:
        A zero-argument callable; iterating it yields
        ``(node_batch, label_batch)`` pairs that together cover every
        sample exactly once, in the shuffled order.
    """
    total = len(source_node)
    # One shuffled permutation is fixed up front and shared by every
    # pass over the returned loader.
    order = np.arange(total)
    np.random.shuffle(order)

    def loader():
        for begin in range(0, total, batch_size):
            chunk = order[begin:begin + batch_size]
            yield source_node[chunk], label[chunk]

    return loader
def sample_loader(args, phase, homograph, hetergraph, gw, source_node, label):
    """Yield graph-wrapper feed dicts for one pass over ``source_node``.

    The sampler is chosen by phase: GraphSAINT random-walk sampling for
    training, k-hop neighborhood sampling for evaluation. Subgraph
    construction runs in ``args.sample_workers`` worker processes via
    ``mp_reader_mapper``.

    Args:
        args: parsed CLI namespace (uses batch_size, test_batch_size,
            test_samples, sample_workers).
        phase: 'train' or an evaluation phase name.
        homograph: homogeneous view of the graph used for walking.
        hetergraph: heterogeneous graph the subgraphs are cut from.
        gw: HeterGraphWrapper that converts a subgraph to a feed dict.
        source_node: node ids to iterate over.
        label: labels aligned with ``source_node``.

    Yields:
        dict: feed dict extended with 'label', 'train_index' and
        'sub_node_index' entries.
    """
    if phase == 'train':
        sample_func = graph_saint_hetero
        batch_size = args.batch_size
        # walk depth for the deepwalk-style random walks
        other_args = len(args.test_samples)
    else:
        sample_func = k_hop_sampler
        batch_size = args.test_batch_size
        # per-hop fan-outs for k-hop sampling
        other_args = args.test_samples

    def map_fun(node_label):
        # Runs inside a worker process: build the subgraph for one batch.
        node, label = node_label
        subgraph, train_index, sample_nodes, train_label = sample_func(
            homograph, hetergraph, node, other_args)
        feed_dict = gw.to_feed(subgraph)
        # k-hop sampling returns train_label=None; fall back to batch labels.
        feed_dict['label'] = label if train_label is None else train_label
        feed_dict['train_index'] = train_index
        feed_dict['sub_node_index'] = sample_nodes
        return feed_dict

    loader = dataloader(source_node, label, batch_size)
    reader = mp_reader_mapper(
        loader, func=map_fun, num_works=args.sample_workers)
    for feed_dict in reader():
        yield feed_dict
......@@ -13,241 +13,77 @@
# limitations under the License.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ['CPU_NUM'] = str(20)
import numpy as np
import argparse
import copy
import paddle
import paddle.fluid as fluid
import numpy as np
import pgl
import paddle.fluid as fluid
#from pgl.sample import graph_saint_random_walk_sample
from pgl.sample import deepwalk_sample
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
from rgcn import RGCNModel, softmax_loss, paper_mask
from pgl.utils.mp_mapper import mp_reader_mapper
from paddle.fluid.contrib import summary
from pgl.utils.logger import log
from pgl.utils.share_numpy import ToShareMemGraph
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
from rgcn import RGCNModel, cross_entropy_loss
from dataloader import sample_loader
def hetero2homo(heterograph):
    """Merge every edge type of ``heterograph`` into one homogeneous pgl Graph.

    NOTE(review): this span comes from a diff rendering — the ``print`` and
    ``log.info`` statements are old/new versions of the same output lines
    shown together; only one set belongs in the real file.
    """
    edge = []
    for edge_type in heterograph.edge_types_info():
        print('edge_type: ', edge_type, ' shape of edges : ',
              heterograph[edge_type].edges.shape)
        log.info('edge_type: {}, shape of edges : {}'.format(
            edge_type, heterograph[edge_type].edges.shape))
        edge.append(heterograph[edge_type].edges)
    # Stack the per-type edge arrays into one edge list for the homograph.
    edges = np.vstack(edge)
    g = pgl.graph.Graph(num_nodes=heterograph.num_nodes, edges=edges)
    print('homo graph nodes ', g.num_nodes)
    print('homo graph edges ', g.num_edges)
    log.info('homo graph nodes %d' % g.num_nodes)
    log.info('homo graph edges %d' % g.num_edges)
    # Pre-build degree caches; ToShareMemGraph presumably moves the arrays
    # to shared memory for the sampler worker processes — TODO confirm.
    g.outdegree()
    ToShareMemGraph(g)
    return g
def extract_edges_from_nodes(hetergraph, sample_nodes):
    """Collect, per edge type, the ids of edges touching ``sample_nodes``.

    Returns a dict mapping each edge-type key of ``hetergraph`` to the
    edge-id array produced by the pgl graph kernel.
    """

    def _edge_ids(typed_graph):
        index = typed_graph.adj_src_index
        return pgl.graph_kernel.extract_edges_from_nodes(
            index._indptr, index._sorted_v, index._sorted_eid, sample_nodes)

    return {
        edge_type: _edge_ids(hetergraph[edge_type])
        for edge_type in hetergraph.edge_types_info()
    }
def graph_saint_random_walk_sample(graph,
                                   hetergraph,
                                   nodes,
                                   max_depth,
                                   alias_name=None,
                                   events_name=None):
    """GraphSAINT random-walk sampling over the homograph/hetergraph pair.

    Random walks of ``max_depth`` are run on the homogeneous ``graph``;
    every visited node becomes part of one heterogeneous subgraph.
    Reference Paper: https://arxiv.org/abs/1907.04931

    Args:
        graph: homogeneous pgl graph instance used for walking.
        hetergraph: heterogeneous graph the subgraph is cut from.
        nodes: walk starting nodes — NOTE(review): currently ignored and
            replaced by a fresh uniform draw of 20000 roots below.
        max_depth: max walking depth.

    Return:
        (subgraph, sample_nodes, train_index, train_label).
    """
    # the seed of multiprocess for numpy should be reset.
    np.random.seed()
    graph.outdegree()
    # try sample from random nodes
    # nodes=np.random.choice(np.arange(graph.num_nodes, dtype='int64'), size=len(nodes), replace=False)
    nodes = np.random.choice(
        np.arange(
            graph.num_nodes, dtype='int64'), size=20000, replace=False)
    walks = deepwalk_sample(graph, nodes, max_depth, alias_name, events_name)
    sample_nodes = []
    for walk in walks:
        sample_nodes.extend(walk)
    #print("length of sample_nodes ", len(sample_nodes))
    sample_nodes = np.unique(sample_nodes)
    #print("length of unique sample_nodes ", len(sample_nodes))
    eids = extract_edges_from_nodes(hetergraph, sample_nodes)
    subgraph = hetergraph.subgraph(
        nodes=sample_nodes, eid=eids, with_node_feat=True, with_edge_feat=True)
    #subgraph.node_feat["index"] = np.array(sample_nodes, dtype="int64")
    # Rows with train_label == -1 are unlabeled; keep only labeled rows.
    all_label = graph._node_feat['train_label'][sample_nodes]
    train_index = np.where(all_label > -1)[0]
    train_label = all_label[train_index]
    #print("sample", train_index.shape)
    #print("sample", train_label.shape)
    return subgraph, sample_nodes, train_index, train_label
def graph_saint_hetero(graph, hetergraph, batch_nodes, max_depth=2):
    """Adapter: run GraphSAINT sampling and reorder outputs for sample_loader."""
    subgraph, sample_nodes, train_index, train_label = graph_saint_random_walk_sample(
        graph, hetergraph, batch_nodes, max_depth)
    # train_index = subgraph.reindex_from_parrent_nodes(batch_nodes)
    return subgraph, train_index, sample_nodes, train_label
def traverse(item):
    """Depth-first flattening of arbitrarily nested lists/ndarrays.

    Yields every leaf element; a non-list, non-ndarray argument is
    yielded as-is.
    """
    if isinstance(item, (list, np.ndarray)):
        for element in item:
            yield from traverse(element)
    else:
        yield item
def flat_node_and_edge(nodes):
    """Flatten a nested node-id collection and drop duplicates.

    Returns a plain list (order unspecified, as with the original
    set-based implementation).
    """
    unique_nodes = set(traverse(nodes))
    return list(unique_nodes)
def k_hop_sampler(graph, hetergraph, batch_nodes, samples=[30, 30]):
    """Sample a k-hop neighborhood subgraph around ``batch_nodes``.

    Each entry of ``samples`` is the predecessor fan-out for one hop.
    Returns (subgraph, train_index, nodes, None) — the final None keeps
    the return shape aligned with graph_saint_hetero, which does return
    labels.

    NOTE(review): ``samples=[30, 30]`` is a mutable default argument; it
    is only read here so this is benign, but ``samples=None`` with an
    in-body default would be safer.
    """
    # for batch_train_samples, batch_train_labels in batch_info:
    # Reset the numpy seed — this runs inside multiprocessing workers.
    np.random.seed()
    start_nodes = copy.deepcopy(batch_nodes)
    nodes = start_nodes
    edges = []
    for max_deg in samples:
        pred_nodes = graph.sample_predecessor(start_nodes, max_degree=max_deg)
        for dst_node, src_nodes in zip(start_nodes, pred_nodes):
            for src_node in src_nodes:
                edges.append((src_node, dst_node))
        last_nodes = nodes
        nodes = [nodes, pred_nodes]
        nodes = flat_node_and_edge(nodes)
        # Find new nodes
        start_nodes = list(set(nodes) - set(last_nodes))
        if len(start_nodes) == 0:
            break
    nodes = np.unique(np.array(nodes, dtype='int64'))
    eids = extract_edges_from_nodes(hetergraph, nodes)
    subgraph = hetergraph.subgraph(
        nodes=nodes, eid=eids, with_node_feat=True, with_edge_feat=True)
    #sub_node_index = subgraph.reindex_from_parrent_nodes(batch_nodes)
    # Map the original batch node ids to their indices inside the subgraph.
    train_index = subgraph.reindex_from_parrent_nodes(batch_nodes)
    return subgraph, train_index, np.array(nodes, dtype='int64'), None
def dataloader(source_node, label, batch_size=1024):
    """Return a callable that yields shuffled (node, label) mini-batches.

    The shuffle order is drawn once when ``dataloader`` is called, so
    repeated iterations of the returned loader replay the same order.
    """
    num = len(source_node)
    perm = np.arange(num)
    np.random.shuffle(perm)

    def loader():
        lo = 0
        while lo < num:
            hi = lo + batch_size
            sel = perm[lo:hi]
            yield source_node[sel], label[sel]
            lo = hi

    return loader
def sample_loader(phase, homograph, hetergraph, gw, source_node, label):
    """Older sample_loader variant with hard-coded batch sizes and workers.

    NOTE(review): superseded by the args-driven ``sample_loader`` earlier in
    this dump; kept verbatim as it appears in the diff.
    """
    #print(source_node)
    #print(label)
    if phase == 'train':
        sample_func = graph_saint_hetero
        batch_size = 20000
    else:
        sample_func = k_hop_sampler
        batch_size = 512

    def map_fun(node_label):
        # Runs in a worker process: build the subgraph for one batch.
        node, label = node_label
        subgraph, train_index, sample_nodes, train_label = sample_func(
            homograph, hetergraph, node)
        #print(train_index.shape)
        #print(sample_nodes.shape)
        #print(sum(subgraph['p2p'].edges[:,0] * subgraph['p2p'].edges[:, 1] == 0) /len(subgraph['p2p'].edges) )
        feed_dict = gw.to_feed(subgraph)
        # k-hop sampling returns train_label=None; fall back to batch labels.
        feed_dict['label'] = label if train_label is None else train_label
        feed_dict['train_index'] = train_index
        feed_dict['sub_node_index'] = sample_nodes
        return feed_dict

    loader = dataloader(source_node, label, batch_size)
    reader = mp_reader_mapper(loader, func=map_fun, num_works=6)
    for feed_dict in reader():
        yield feed_dict
# NOTE(review): this span is a web diff rendering with the OLD and NEW
# versions of run_epoch interleaved line-by-line (two `def` headers, two
# `best_acc` initializations, duplicated exe.run calls and log/print pairs).
# It is not runnable as-is. The newer variant takes
# (args, exe, fetch_list, ...), iterates args.epoch epochs, logs through
# `log.info`, and checkpoints `test_program` on a new best validation
# accuracy. Restore from version control before editing.
def run_epoch(exe, loss, acc, homograph, hetergraph, gw, train_program,
def run_epoch(args, exe, fetch_list, homograph, hetergraph, gw, train_program,
test_program, all_label, split_idx, split_real_idx):
best_acc = 1.0
for epoch in range(1000):
best_acc = 0.0
for epoch in range(args.epoch):
for phase in ['train', 'valid', 'test']:
# if phase == 'train':
# continue
running_loss = []
running_acc = []
for feed_dict in sample_loader(
phase, homograph, hetergraph, gw,
args, phase, homograph, hetergraph, gw,
split_real_idx[phase]['paper'],
all_label['paper'][split_idx[phase]['paper']]):
#print("train_shape\t", feed_dict['train_index'].shape)
#print("allnode_shape\t", feed_dict['sub_node_index'].shape)
res = exe.run(
train_program if phase == 'train' else test_program,
# test_program,
feed=feed_dict,
fetch_list=[loss.name, acc.name],
use_prune=True)
# print("train_shape\t", feed_dict['train_index'].shape)
# print("allnode_shape\t", feed_dict['sub_node_index'].shape)
res = exe.run(train_program
if phase == 'train' else test_program,
feed=feed_dict,
fetch_list=fetch_list,
use_prune=True)
running_loss.append(res[0])
running_acc.append(res[1])
if phase == 'train':
print("training_acc %f" % res[1])
log.info("training_acc %f" % res[1])
avg_loss = sum(running_loss) / len(running_loss)
avg_acc = sum(running_acc) / len(running_acc)
if phase == 'valid':
if avg_acc > best_acc:
fluid.io.save_persistables(exe, './output/checkpoint',
train_program)
test_program)
best_acc = avg_acc
print('new best_acc %f' % best_acc)
print("%d, %s %f %f" % (epoch, phase, avg_loss, avg_acc))
log.info('new best_acc %f' % best_acc)
log.info("%d, %s %f %f" % (epoch, phase, avg_loss, avg_acc))
# NOTE(review): this span is a web diff rendering with the OLD and NEW
# versions of main() interleaved, including two `def` headers, duplicated
# constant/dataset/feature/optimizer lines, and `......@@` hunk markers that
# elide further lines. It is not runnable as-is. The newer variant takes
# `args`, builds the heterograph wrapper + RGCN model inside a Paddle
# program guard, optionally loads a pretrained checkpoint, and delegates
# training to run_epoch. The dangling statements after `return None`
# (feed_dict = gw.to_feed(g), commented rand_label lines) look like old
# full-batch code that the diff moved into full_batch() below — verify
# against version control.
def main():
def main(args):
num_class = 349
num_nodes = 1939743
start_paper_index = 1203354
hidden_size = 128
dataset = PglNodePropPredDataset('ogbn-mag')
embedding_size = 128
dataset = PglNodePropPredDataset('ogbn-papers100M')
g, all_label = dataset[0]
num_nodes = g.num_nodes
homograph = hetero2homo(g)
for key in g.edge_types_info():
g[key].outdegree()
......@@ -255,43 +91,56 @@ def main():
split_idx = dataset.get_idx_split()
split_real_idx = copy.deepcopy(split_idx)
start_paper_index = g.num_node_dict['paper'][1]
# reindex the original idx of each type of node
for t, idx in split_real_idx.items():
for k, v in idx.items():
for k in idx.keys():
split_real_idx[t][k] += g.num_node_dict[k][1]
# the num_node_dict record the node nums and the start index of each type of node.
start_paper_index = g.num_node_dict['paper'][1]
end_paper_index = start_paper_index + g.num_node_dict['paper'][0]
# additional feat of paper node, and corresponding index
additional_paper_feature = g.node_feat_dict[
'paper'][:, :embedding_size].astype('float32')
extract_index = (np.arange(start_paper_index,
end_paper_index)).astype('int32')
# extract the train label feat as node feat of homograph
homograph._node_feat['train_label'] = -1 * np.ones(
[homograph.num_nodes, 1], dtype='int64')
# copy train label to homograph node feat
train_label = all_label['paper'][split_idx['train']['paper']]
train_index = split_real_idx['train']['paper']
homograph._node_feat['train_label'][train_index] = train_label
place = fluid.CUDAPlace(0)
#place = fluid.CPUPlace()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
additional_paper_feature = g.node_feat_dict[
'paper'][:, :hidden_size].astype('float32')
extact_index = (np.arange(start_paper_index, num_nodes)).astype('int32')
with fluid.program_guard(train_program, startup_program):
gw = pgl.heter_graph_wrapper.HeterGraphWrapper(
name="heter_graph",
edge_types=g.edge_types_info(),
node_feat=g.node_feat_info(),
edge_feat=g.edge_feat_info())
# set the paper node feature
paper_feature = fluid.layers.create_parameter(
shape=additional_paper_feature.shape,
dtype='float32',
default_initializer=fluid.initializer.NumpyArrayInitializer(
additional_paper_feature),
name='paper_feature')
name="paper_feature")
paper_index = fluid.layers.create_parameter(
shape=extact_index.shape,
shape=extract_index.shape,
dtype='int32',
default_initializer=fluid.initializer.NumpyArrayInitializer(
extact_index),
name='paper_index')
extract_index),
name="paper_index")
paper_feature.stop_gradient = True
paper_index.stop_gradient = True
......@@ -302,66 +151,101 @@ def main():
label = fluid.layers.data(shape=[-1], dtype="int64", name='label')
label = fluid.layers.reshape(label, [-1, 1])
label.stop_gradient = True
gw = pgl.heter_graph_wrapper.HeterGraphWrapper(
name="heter_graph",
edge_types=g.edge_types_info(),
node_feat=g.node_feat_info(),
edge_feat=g.edge_feat_info())
feat = fluid.layers.create_parameter(
shape=[num_nodes, hidden_size], dtype='float32')
# TODO: the paper feature replaced the total feat, not add
shape=[num_nodes, embedding_size], dtype='float32')
# NOTE: the paper feature replaced the total feat, not add
feat = fluid.layers.scatter(
feat, paper_index, paper_feature, overwrite=False)
sub_node_feat = fluid.layers.gather(feat, sub_node_index)
model = RGCNModel(gw, 2, num_class, num_nodes, g.edge_types_info())
model = RGCNModel(
graph_wrapper=gw,
num_layers=args.num_layers,
hidden_size=args.hidden_size,
num_class=num_class,
edge_types=g.edge_types_info())
feat = model.forward(sub_node_feat)
#feat = paper_mask(feat, gw, start_paper_index)
feat = fluid.layers.gather(feat, train_index)
loss, logit, acc = softmax_loss(feat, label, num_class)
opt = fluid.optimizer.AdamOptimizer(learning_rate=0.005)
loss, acc = cross_entropy_loss(feat, label)
opt = fluid.optimizer.Adam(learning_rate=args.lr)
opt.minimize(loss)
test_program = train_program.clone(for_test=True)
from paddle.fluid.contrib import summary
summary(train_program)
summary(train_program)
exe = fluid.Executor(place)
exe.run(startup_program)
# fluid.io.load_persistables(executor=exe, dirname='./output/checkpoint',
# main_program=train_program)
run_epoch(exe, loss, acc, homograph, g, gw, train_program, test_program,
all_label, split_idx, split_real_idx)
if args.load_pretrain:
fluid.io.load_persistables(
executor=exe,
dirname=os.path.join(args.output_path, 'checkpoint'),
main_program=test_program)
fetch_list = [loss.name, acc.name]
run_epoch(args, exe, fetch_list, homograph, g, gw, train_program,
test_program, all_label, split_idx, split_real_idx)
return None
feed_dict = gw.to_feed(g)
#rand_label = (np.random.rand(num_nodes - start_paper_index) >
# 0.5).astype('int64')
#feed_dict['label'] = rand_label
# NOTE(review): partially interleaved old/new diff lines below (the
# print-based exe.run block and the log.info-based one are two versions of
# the same code). Not runnable as-is.
def full_batch(g, gw, all_label, split_idx, split_real_idx, exe, train_program,
test_program, fetch_list):
""" The full-batch version of rgcn. No sufficient gpu memory for full batch!

Trains on the whole graph in one feed dict instead of sampled
subgraphs; kept for reference only.
"""
feed_dict = gw.to_feed(g)
feed_dict['label'] = all_label['paper'][split_idx['train']['paper']]
feed_dict['train_index'] = split_real_idx['train']['paper']
#feed_dict['sub_node_index'] = np.arange(num_nodes).astype('int64')
#feed_dict['paper_index'] = extact_index
#feed_dict['paper_feature'] = additional_paper_feature
feed_dict['sub_node_index'] = np.arange(g.num_nodes).astype('int64')
for epoch in range(10):
feed_dict['label'] = all_label['paper'][split_idx['train']['paper']]
feed_dict['train_index'] = split_real_idx['train']['paper']
for step in range(10):
res = exe.run(train_program,
feed=feed_dict,
fetch_list=[loss.name, acc.name])
print("%d,%d %f %f" % (epoch, step, res[0], res[1]))
#print(res[1])
res = exe.run(train_program, feed=feed_dict, fetch_list=fetch_list)
log.info("Train %d, %f %f" % (epoch, res[0], res[1]))
feed_dict['label'] = all_label['paper'][split_idx['valid']['paper']]
feed_dict['train_index'] = split_real_idx['valid']['paper']
res = exe.run(test_program,
feed=feed_dict,
fetch_list=[loss.name, acc.name])
print("Test %d, %f %f" % (epoch, res[0], res[1]))
res = exe.run(test_program, feed=feed_dict, fetch_list=fetch_list)
log.info("Valid %d, %f %f" % (epoch, res[0], res[1]))
feed_dict['label'] = all_label['paper'][split_idx['test']['paper']]
feed_dict['train_index'] = split_real_idx['test']['paper']
res = exe.run(test_program, feed=feed_dict, fetch_list=fetch_list)
log.info("Test %d, %f %f" % (epoch, res[0], res[1]))
# NOTE(review): diff artifact — the bare old entry point `main()` is shown
# immediately before the new argparse-driven entry point; only the argparse
# version belongs in the real file.
if __name__ == "__main__":
main()
# CLI for GraphSAINT + RGCN training; defaults match the README.
parser = argparse.ArgumentParser(description='graphsaint with rgcn')
parser.add_argument(
"--output_path",
type=str,
default=None,
help="output path to save model")
parser.add_argument(
"--load_pretrain", action='store_true', help="load pretrained mode")
parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
parser.add_argument("--sample_workers", type=int, default=6)
parser.add_argument("--epoch", type=int, default=40)
parser.add_argument("--num_layers", type=int, default=2)
parser.add_argument("--hidden_size", type=int, default=64)
parser.add_argument("--batch_size", type=int, default=20000)
parser.add_argument("--lr", type=float, default=0.005)
parser.add_argument(
"--test_batch_size",
type=int,
default=512,
help="sample nums of k-hop of test phase.")
parser.add_argument(
"--test_samples",
type=int,
nargs='+',
default=[30, 30],
help="sample nums of k-hop.")
args = parser.parse_args()
log.info(args)
main(args)
......@@ -14,9 +14,9 @@
import paddle
import pgl
import paddle.fluid as fluid
import numpy as np
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
import paddle.fluid as fluid
from paddle.fluid.contrib import summary
# NOTE(review): the parameter list and parts of the body are elided by
# `......@@` diff hunk markers, and several statements appear in both old
# (`gw`) and new (`graph_wrapper`) form. Not runnable as-is. The visible
# intent: one FC per edge type transforms node features, messages are sent
# and mean-pooled per edge type, and the per-type results are accumulated
# into `output`.
def rgcn_conv(graph_wrapper,
......@@ -34,14 +34,11 @@ def rgcn_conv(graph_wrapper,
"""
return fluid.layers.sequence_pool(feat, pool_type='average')
gw = graph_wrapper
if not isinstance(edge_types, list):
edge_types = [edge_types]
#output = fluid.layers.zeros((feature.shape[0], hidden_size), dtype='float32')
output = None
for i in range(len(edge_types)):
assert feature is not None
tmp_feat = fluid.layers.fc(
feature,
size=hidden_size,
......@@ -50,8 +47,9 @@ def rgcn_conv(graph_wrapper,
act=None)
if output is None:
output = fluid.layers.zeros_like(tmp_feat)
msg = gw[edge_types[i]].send(__message, nfeat_list=[('h', feature)])
neigh_feat = gw[edge_types[i]].recv(msg, __reduce)
msg = graph_wrapper[edge_types[i]].send(
__message, nfeat_list=[('h', tmp_feat)])
neigh_feat = graph_wrapper[edge_types[i]].recv(msg, __reduce)
# The weight of FC should be the same for the same type of node
# The edge type str should be `A2B`(from type A to type B)
neigh_feat = fluid.layers.fc(
......@@ -60,24 +58,25 @@ def rgcn_conv(graph_wrapper,
param_attr=fluid.ParamAttr(name='%s_edge_fc_%s' %
(name, edge_types[i])),
act=None)
# TODO: the tmp_feat and neigh_feat should be add togather.
output = output + neigh_feat * tmp_feat
#output = fluid.layers.relu(out)
return output
# NOTE(review): two versions of __init__ are interleaved by the diff (the
# old (gw, layers, num_class, num_nodes, edge_types) signature and the new
# (graph_wrapper, num_layers, hidden_size, num_class, edge_types) one), and
# a `......@@` marker elides part of forward(). Not runnable as-is. Intent:
# forward() stacks num_layers rgcn_conv layers with relu+dropout between
# hidden layers and a final num_class-wide layer producing logits.
class RGCNModel:
def __init__(self, gw, layers, num_class, num_nodes, edge_types):
self.hidden_size = 64
self.layers = layers
self.num_nodes = num_nodes
self.edge_types = edge_types
self.gw = gw
def __init__(self, graph_wrapper, num_layers, hidden_size, num_class,
edge_types):
self.graph_wrapper = graph_wrapper
self.num_layers = num_layers
self.hidden_size = hidden_size
self.num_class = num_class
self.edge_types = edge_types
def forward(self, feat):
for i in range(self.layers - 1):
for i in range(self.num_layers - 1):
feat = rgcn_conv(
self.gw,
self.graph_wrapper,
feat,
self.hidden_size,
self.edge_types,
......@@ -85,49 +84,44 @@ class RGCNModel:
feat = fluid.layers.relu(feat)
feat = fluid.layers.dropout(feat, dropout_prob=0.5)
feat = rgcn_conv(
self.gw,
self.graph_wrapper,
feat,
self.num_class,
self.edge_types,
name="rgcn_%d" % (self.layers - 1))
name="rgcn_%d" % (self.num_layers - 1))
return feat
# NOTE(review): diff artifact — the old softmax_loss/paper_mask functions
# and the new cross_entropy_loss are fused into one unrunnable span. The
# surviving function is cross_entropy_loss(logit, label) -> (mean softmax
# cross-entropy loss, accuracy).
def softmax_loss(feat, label, class_num):
#logit = fluid.layers.fc(feat, class_num)
logit = feat
def cross_entropy_loss(logit, label):
loss = fluid.layers.softmax_with_cross_entropy(logit, label)
loss = fluid.layers.mean(loss)
acc = fluid.layers.accuracy(fluid.layers.softmax(logit), label)
return loss, logit, acc
def paper_mask(feat, gw, start_index):
mask = fluid.layers.cast(gw[0].node_feat['index'] > start_index)
feat = fluid.layers.mask_select(feat, mask)
return feat
return loss, acc
# NOTE(review): smoke-test entry point for RGCNModel on a tiny hand-built
# heterograph; old and new diff lines are interleaved and `......@@` markers
# elide some lines, so this span is not runnable as-is.
# NOTE(review): in the newer call `RGCNModel(gw, num_layers, num_class,
# hidden_size, g.edge_types_info())` the num_class and hidden_size arguments
# appear swapped relative to the new __init__(graph_wrapper, num_layers,
# hidden_size, num_class, edge_types) signature — verify.
if __name__ == "__main__":
#PglNodePropPredDataset('ogbn-mag')
num_nodes = 4
num_class = 2
feat_dim = 16
hidden_size = 16
num_layers = 2
node_types = [(0, 'user'), (1, 'user'), (2, 'item'), (3, 'item')]
edges = {
'U2U': [(0, 1), (1, 0)],
'U2I': [(1, 2), (0, 3), (1, 3)],
'I2I': [(2, 3), (3, 2)],
}
node_feat = {'feature': np.random.randn(4, 16)}
node_feat = {'feature': np.random.randn(4, feat_dim)}
edges_feat = {
'U2U': {
'h': np.random.randn(2, 16)
'h': np.random.randn(2, feat_dim)
},
'U2I': {
'h': np.random.randn(3, 16)
'h': np.random.randn(3, feat_dim)
},
'I2I': {
'h': np.random.randn(2, 16)
'h': np.random.randn(2, feat_dim)
},
}
g = pgl.heter_graph.HeterGraph(
......@@ -136,14 +130,13 @@ if __name__ == "__main__":
node_types=node_types,
node_feat=node_feat,
edge_feat=edges_feat)
place = fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
label = fluid.layers.data(shape=[-1], dtype="int64", name='label')
#label = fluid.layers.create_global_var(shape=[4], value=1, dtype="int64")
label = fluid.layers.reshape(label, [-1, 1])
label.stop_gradient = True
gw = pgl.heter_graph_wrapper.HeterGraphWrapper(
......@@ -153,23 +146,24 @@ if __name__ == "__main__":
edge_feat=g.edge_feat_info())
feat = fluid.layers.create_parameter(
shape=[num_nodes, 16], dtype='float32')
shape=[num_nodes, feat_dim], dtype='float32')
model = RGCNModel(gw, 3, num_class, num_nodes, g.edge_types_info())
feat = model.forward(feat)
loss, logit, acc = softmax_loss(feat, label, 2)
opt = fluid.optimizer.AdamOptimizer(learning_rate=0.001)
model = RGCNModel(gw, num_layers, num_class, hidden_size,
g.edge_types_info())
logit = model.forward(feat)
loss, acc = cross_entropy_loss(logit, label)
opt = fluid.optimizer.SGD(learning_rate=0.1)
opt.minimize(loss)
from paddle.fluid.contrib import summary
summary(train_program)
summary(train_program)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_program)
feed_dict = gw.to_feed(g)
feed_dict['label'] = np.array([1, 0, 1, 1]).astype('int64')
for i in range(100):
res = exe.run(train_program,
feed=feed_dict,
fetch_list=[loss.name, logit.name, acc.name])
print("%d %f %f" % (i, res[0], res[2]))
#print(res[1])
fetch_list=[loss.name, acc.name])
print("STEP: %d LOSS: %f ACC: %f" % (i, res[0], res[1]))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pgl
import copy
import numpy as np
from pgl.sample import deepwalk_sample, traverse
from pgl.sample import extract_edges_from_nodes
def graph_saint_random_walk_sample(graph,
                                   hetergraph,
                                   nodes,
                                   max_depth,
                                   alias_name=None,
                                   events_name=None,
                                   sample_size=20000):
    """Implement of graph saint random walk sample for hetergraph.

    Walks are run on the homogeneous ``graph``; every visited node is
    collected into one heterogeneous subgraph cut from ``hetergraph``.

    Args:
        graph: A pgl graph instance, homograph for hetergraph
        hetergraph: A pgl hetergraph instance
        nodes: Walk starting from nodes. NOTE: currently ignored — the
            walk roots are re-drawn uniformly at random below, which is
            the GraphSAINT training regime.
        max_depth: Max walking depth
        alias_name/events_name: optional alias-table feature names
            forwarded to ``deepwalk_sample``.
        sample_size: number of random walk roots to draw (previously a
            hard-coded 20000; kept as the default for compatibility).

    Return:
        (subgraph, sample_nodes): the sampled hetero subgraph and the
        sorted unique parent-node ids it contains.
    """
    # the seed for numpy should be reset in multiprocessing.
    np.random.seed()
    graph.outdegree()
    # sample random walk roots uniformly without replacement
    nodes = np.random.choice(
        np.arange(
            graph.num_nodes, dtype='int64'),
        size=sample_size,
        replace=False)
    walks = deepwalk_sample(graph, nodes, max_depth, alias_name, events_name)
    sample_nodes = []
    for walk in walks:
        sample_nodes.extend(walk)
    # np.unique also sorts, giving a deterministic node order.
    sample_nodes = np.unique(sample_nodes)
    eids = extract_edges_from_nodes(hetergraph, sample_nodes)
    subgraph = hetergraph.subgraph(
        nodes=sample_nodes, eid=eids, with_node_feat=True, with_edge_feat=True)
    return subgraph, sample_nodes
def graph_saint_hetero(graph, hetergraph, batch_nodes, max_depth=2):
    """GraphSAINT sampling plus extraction of the labeled training rows.

    Returns (subgraph, train_index, sample_nodes, train_label), where
    train_index addresses rows inside the subgraph (sampled nodes are
    renumbered 0..len(sample_nodes)-1) and train_label holds their labels.
    """
    subgraph, sample_nodes = graph_saint_random_walk_sample(
        graph, hetergraph, batch_nodes, max_depth)
    labels_of_sampled = graph._node_feat['train_label'][sample_nodes]
    # Only rows whose label is not the -1 sentinel take part in the loss.
    labeled_rows = np.where(labels_of_sampled > -1)[0]
    return (subgraph, labeled_rows, sample_nodes,
            labels_of_sampled[labeled_rows])
def k_hop_sampler(graph, hetergraph, batch_nodes, samples=None):
    """Sample a k-hop neighborhood subgraph around ``batch_nodes``.

    Args:
        graph: homogeneous pgl graph used for predecessor sampling.
        hetergraph: heterogeneous graph the subgraph is cut from.
        batch_nodes: the evaluation nodes at the center of the neighborhood.
        samples: per-hop predecessor fan-outs; defaults to [30, 30].
            (Was a mutable default argument ``samples=[30, 30]``; replaced
            with a None sentinel — behavior is unchanged.)

    Return:
        (subgraph, train_index, nodes, None): the subgraph, the indices of
        ``batch_nodes`` inside it, the sampled parent-node ids, and None so
        the return shape matches graph_saint_hetero (which returns labels).
    """
    if samples is None:
        samples = [30, 30]
    # the seed for numpy should be reset in multiprocessing.
    np.random.seed()
    start_nodes = copy.deepcopy(batch_nodes)
    nodes = start_nodes
    edges = []
    for max_deg in samples:
        pred_nodes = graph.sample_predecessor(start_nodes, max_degree=max_deg)
        for dst_node, src_nodes in zip(start_nodes, pred_nodes):
            for src_node in src_nodes:
                edges.append((src_node, dst_node))
        last_nodes = nodes
        nodes = [nodes, pred_nodes]
        nodes = list(set(traverse(nodes)))
        # Find new frontier nodes; stop early once a hop adds none.
        start_nodes = list(set(nodes) - set(last_nodes))
        if len(start_nodes) == 0:
            break
    # TODO: Only use certain sampled edges to construct subgraph
    # (``edges`` is collected above but not used yet).
    nodes = np.unique(np.array(nodes, dtype='int64'))
    eids = extract_edges_from_nodes(hetergraph, nodes)
    subgraph = hetergraph.subgraph(
        nodes=nodes, eid=eids, with_node_feat=True, with_edge_feat=True)
    # Map the original batch node ids to their indices inside the subgraph.
    train_index = subgraph.reindex_from_parrent_nodes(batch_nodes)
    return subgraph, train_index, nodes, None
......@@ -479,10 +479,34 @@ def pinsage_sample(graph,
# NOTE(review): diff artifact — the old homograph-only body (the first
# four lines after the def, ending in an early `return eids`) is shown
# before the new version's docstring and type-dispatching body. Only the
# new version (docstring onward) belongs in the real file.
def extract_edges_from_nodes(graph, sample_nodes):
eids = graph_kernel.extract_edges_from_nodes(
graph.adj_src_index._indptr, graph.adj_src_index._sorted_v,
graph.adj_src_index._sorted_eid, sample_nodes)
return eids
""" Tools for extracting all edges of certain nodes.
Note: Before calling this method, please build adj_src_index first.
Args:
graph: A pgl graph instance or hetergraph instance
sample_nodes: the given nodes
Return:
eids array for a graph instance,
or a dict of eids arrays for a hetergraph instance.
"""
sample_nodes = np.array(sample_nodes, dtype='int64')
if isinstance(graph, pgl.graph.Graph):
eids = graph_kernel.extract_edges_from_nodes(
graph.adj_src_index._indptr, graph.adj_src_index._sorted_v,
graph.adj_src_index._sorted_eid, sample_nodes)
return eids
elif isinstance(graph, pgl.heter_graph.HeterGraph):
eids = {}
for key in graph.edge_types_info():
eids[key] = pgl.graph_kernel.extract_edges_from_nodes(
graph[key].adj_src_index._indptr,
graph[key].adj_src_index._sorted_v,
graph[key].adj_src_index._sorted_eid, sample_nodes)
return eids
else:
raise ValueError("Please pass a graph instance to this function!")
def graph_saint_random_walk_sample(graph,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册