Unverified commit 03cb3621, authored by H Huang Zhengjie, committed by GitHub

Merge pull request #4 from PaddlePaddle/develop

Develop
......@@ -21,7 +21,7 @@ import tqdm
import numpy as np
import logging
import random
from pgl.contrib import heter_graph
from pgl import heter_graph
import pickle as pkl
......
......@@ -21,7 +21,7 @@ import logging
import paddle.fluid as fluid
import paddle.fluid.layers as fl
from pgl.contrib import heter_graph_wrapper
from pgl import heter_graph_wrapper
class GATNE(object):
......
# Distributed metapath2vec in PGL
[metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is an algorithmic framework for representation learning in heterogeneous networks, which contain multiple types of nodes and links. Given a heterogeneous graph, metapath2vec first generates meta-path-based random walks and then trains a skip-gram model on them to learn node embeddings. Based on PGL, we reproduce the metapath2vec algorithm in distributed mode.
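To make the walk stage concrete, below is a minimal, PGL-independent sketch of a meta-path-guided random walk on a toy graph. It is only illustrative: the real pipeline uses ```pgl.sample.metapath_randomwalk``` on the heterogeneous graph built in ```graph.py```, and the toy adjacency lists here are made up.
```python
import random

# Toy adjacency lists keyed by edge type; the types mirror the
# "c2p-p2a-a2p-p2c" meta path in config.yaml (c=conference, p=paper, a=author).
toy_edges = {
    "c2p": {0: [10, 11]},
    "p2a": {10: [20], 11: [21]},
    "a2p": {20: [10], 21: [11]},
    "p2c": {10: [0], 11: [0]},
}

def metapath_walk(start_node, meta_path, walk_len):
    """Collect walk_len nodes, cycling through the edge types of the meta path."""
    edge_types = meta_path.split("-")
    walk = [start_node]
    while len(walk) < walk_len:
        e_type = edge_types[(len(walk) - 1) % len(edge_types)]
        candidates = toy_edges[e_type].get(walk[-1], [])
        if not candidates:
            break
        walk.append(random.choice(candidates))
    return walk

print(metapath_walk(0, "c2p-p2a-a2p-p2c", walk_len=9))
```
Skip-gram training pairs are then extracted from such walks with a sliding window of size ```win_size``` (see ```walker.py```).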
## Datasets
DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T), i.e. 37791 nodes in total.
You can download the dataset from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding).
We use the ```DBLP``` dataset as an example. After downloading the dataset, put it in, say, ```./data/DBLP/```.
## Dependencies
- paddlepaddle>=1.6
- pgl>=1.0.0
## How to run
Before training, run the command below to preprocess the data.
```sh
python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed
```
We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. ```config.yaml``` is the configuration file for the metapath2vec hyperparameters and ```local_config``` is the configuration file for the PaddlePaddle parameter servers. By default, we use 2 pservers and 2 trainers. You can use ```cloud_run.sh``` to start up the parameter servers and model trainers.
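The default ```local_config``` simply exports the cluster layout as environment variables (2 pservers on ports 6184/6185 and 2 trainers on a single machine); edit it to change the number of pservers/trainers or the ports:
```sh
export PADDLE_TRAINERS_NUM=2
export PADDLE_PSERVERS_NUM=2
export PADDLE_PORT=6184,6185
export PADDLE_PSERVERS="127.0.0.1"
```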
For example, to train metapath2vec in distributed mode on the DBLP dataset:
```sh
# train metapath2vec in distributed mode.
sh cloud_run.sh
# multiclass task example
python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_path ./checkpoints/2000 --num_nodes 37791
```
## Hyperparameters
All hyperparameters are saved in the ```config.yaml``` file, so before training you can edit config.yaml to adjust them as you like (a loading snippet follows the list below).
Some important hyperparameters in config.yaml:
- **edge_path**: the directory of graph data that you want to load
- **lr**: learning rate
- **neg_num**: number of negative samples.
- **num_walks**: number of walks started from each node
- **walk_len**: walk length
- **meta_path**: meta path scheme
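For reference, the training scripts load ```config.yaml``` through ```load_config``` in ```utils.py```, which wraps the parsed YAML in an attribute dict, so hyperparameters are read as attributes:
```python
from utils import load_config  # returns an AttrDict (see utils.py)

config = load_config("./config.yaml")
print(config.lr, config.neg_num, config.walk_len, config.meta_path)
```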
#!/bin/bash
set -x
mode=${1}
source ./utils.sh
unset http_proxy https_proxy
source ./local_config
if [ ! -d ${log_dir} ]; then
mkdir ${log_dir}
fi
for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
do
echo "start ps server: ${i}"
echo $log_dir
TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh &> $log_dir/pserver.$i.log &
done
sleep 10s
for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
do
echo "start ps work: ${j}"
TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh &> $log_dir/worker.$j.log &
done
tail -f $log_dir/worker.0.log
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import os
import math
import numpy as np
import paddle.fluid as F
import paddle.fluid.layers as L
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from pgl.utils.logger import log
from model import Metapath2vecModel
from graph import m2vGraph
from utils import load_config
from walker import multiprocess_data_generator
def init_role():
# reset the place according to role of parameter server
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
paddle_role = role_maker.Role.WORKER
place = F.CPUPlace()
if training_role == "PSERVER":
paddle_role = role_maker.Role.SERVER
    # set the fleet runtime environment according to the config
ports = os.getenv("PADDLE_PORT", "6174").split(",")
pserver_ips = os.getenv("PADDLE_PSERVERS").split(",") # ip,ip...
eplist = []
if len(ports) > 1:
# local debug mode, multi port
for port in ports:
eplist.append(':'.join([pserver_ips[0], port]))
else:
# distributed mode, multi ip
for ip in pserver_ips:
eplist.append(':'.join([ip, ports[0]]))
pserver_endpoints = eplist # ip:port,ip:port...
worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
role = role_maker.UserDefinedRoleMaker(
current_id=trainer_id,
role=paddle_role,
worker_num=worker_num,
server_endpoints=pserver_endpoints)
fleet.init(role)
def optimization(base_lr, loss, train_steps, optimizer='sgd'):
decayed_lr = L.learning_rate_scheduler.polynomial_decay(
learning_rate=base_lr,
decay_steps=train_steps,
end_learning_rate=0.0001 * base_lr,
power=1.0,
cycle=False)
if optimizer == 'sgd':
optimizer = F.optimizer.SGD(decayed_lr)
elif optimizer == 'adam':
optimizer = F.optimizer.Adam(decayed_lr, lazy_mode=True)
else:
raise ValueError
log.info('learning rate:%f' % (base_lr))
    # create the DistributeTranspiler configuration
config = DistributeTranspilerConfig()
config.sync_mode = False
#config.runtime_split_send_recv = False
config.slice_var_up = False
#create the distributed optimizer
optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(loss)
def build_complied_prog(train_program, model_loss):
num_threads = int(os.getenv("CPU_NUM", 10))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
#exec_strategy.use_experimental_executor = True
build_strategy = F.BuildStrategy()
build_strategy.enable_inplace = True
#build_strategy.memory_optimize = True
build_strategy.memory_optimize = False
build_strategy.remove_unnecessary_lock = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
compiled_prog = F.compiler.CompiledProgram(
train_program).with_data_parallel(loss_name=model_loss.name)
return compiled_prog
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
step = 0
if not os.path.exists(args.save_path):
os.makedirs(args.save_path)
while True:
try:
begin_time = time.time()
loss_val, = exe.run(program, fetch_list=[loss])
log.info("step %s: loss %.5f speed: %.5f s/step" %
(step, np.mean(loss_val), time.time() - begin_time))
step += 1
except F.core.EOFException:
node2vec_pyreader.reset()
if step % args.steps_per_save == 0 or step == train_steps:
save_path = args.save_path
if trainer_id == 0:
model_path = os.path.join(save_path, "%s" % step)
fleet.save_persistables(exe, model_path)
if step == train_steps:
break
def main(args):
log.info("start")
worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
num_devices = int(os.getenv("CPU_NUM", 10))
model = Metapath2vecModel(config=args)
pyreader = model.pyreader
loss = model.forward()
# init fleet
init_role()
train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size /
num_devices / worker_num)
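    # With the defaults shipped here (num_nodes=37791, epochs=10, batch_size=4,
    # CPU_NUM=16, PADDLE_TRAINERS_NUM=2) this evaluates to
    # ceil(37791 * 10 / 4 / 16 / 2) = 2953 steps per trainer.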
log.info("Train step: %s" % train_steps)
real_batch_size = args.batch_size * args.walk_len * args.win_size
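    # With the defaults batch_size=4, walk_len=24 and win_size=5, real_batch_size
    # is 4 * 24 * 5 = 480, so the SGD learning rate of 1.0 from config.yaml is
    # scaled to 480 in the branch below.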
if args.optimizer == "sgd":
args.lr *= real_batch_size
optimization(args.lr, loss, train_steps, args.optimizer)
# init and run server or worker
if fleet.is_server():
fleet.init_server(args.warm_start_from_dir)
fleet.run_server()
if fleet.is_worker():
log.info("start init worker done")
fleet.init_worker()
        # only the worker side loads the training samples
log.info("init worker done")
exe = F.Executor(F.CPUPlace())
exe.run(fleet.startup_program)
log.info("Startup done")
dataset = m2vGraph(args)
log.info("Build graph done.")
data_generator = multiprocess_data_generator(args, dataset)
cur_time = time.time()
for idx, _ in enumerate(data_generator()):
log.info("iter %s: %s s" % (idx, time.time() - cur_time))
cur_time = time.time()
if idx == 100:
break
pyreader.decorate_tensor_provider(data_generator)
pyreader.start()
compiled_prog = build_complied_prog(fleet.main_program, loss)
train_prog(exe, compiled_prog, loss, pyreader, args, train_steps)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='metapath2vec')
parser.add_argument("-c", "--config", type=str, default="./config.yaml")
args = parser.parse_args()
config = load_config(args.config)
log.info(config)
main(config)
# graph data config
edge_path: "./data/data_processed"
edge_files: "p2a:paper_author.txt,p2c:paper_conference.txt,p2t:paper_type.txt"
node_types_file: "node_types.txt"
num_nodes: 37791
symmetry: True
# skipgram pair data config
win_size: 5
neg_num: 5
# average; m2v_plus
neg_sample_type: "average"
# random walk config
# m2v; multi_m2v;
walk_mode: "m2v"
meta_path: "c2p-p2a-a2p-p2c"
first_node_type: "c"
walk_len: 24
batch_size: 4
node_shuffle: True
node_files: null
num_sample_workers: 2
# model config
embed_dim: 64
is_sparse: True
# only used when num_nodes > 100,000,000; slower than normal embedding
is_distributed: False
# training config
epochs: 10
optimizer: "sgd"
lr: 1.0
warm_start_from_dir: null
walkpath_files: "None"
train_files: "None"
steps_per_save: 1000
save_path: "./checkpoints"
log_dir: "./logs"
CPU_NUM: 16
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data preprocessing for DBLP dataset"""
import sys
import os
import argparse
import numpy as np
from collections import OrderedDict
AUTHOR = 14475
PAPER = 14376
CONF = 20
TYPE = 8920
LABEL = 4
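# The remapping below assigns node ids contiguously in the order a, p, c, t
# (see meta_node in __main__): authors 0-14474, papers 14475-28850,
# conferences 28851-28870, terms 28871-37790, i.e. 37791 nodes in total
# (the num_nodes value in config.yaml).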
def build_node_types(meta_node, outfile):
"""build_node_types"""
nt_ori2new = {}
with open(outfile, 'w') as writer:
offset = 0
for node_type, num_nodes in meta_node.items():
ori_id2new_id = {}
for i in range(num_nodes):
writer.write("%d\t%s\n" % (offset + i, node_type))
ori_id2new_id[i + 1] = offset + i
nt_ori2new[node_type] = ori_id2new_id
offset += num_nodes
return nt_ori2new
def remapping_index(args, src_dict, dst_dict, ori_file, new_file):
"""remapping_index"""
ori_file = os.path.join(args.data_path, ori_file)
new_file = os.path.join(args.output_path, new_file)
with open(ori_file, 'r') as reader, open(new_file, 'w') as writer:
for line in reader:
slots = line.strip().split()
s = int(slots[0])
d = int(slots[1])
new_s = src_dict[s]
new_d = dst_dict[d]
writer.write("%d\t%d\n" % (new_s, new_d))
def author_label(args, ori_id2pgl_id, ori_file, real_file, new_file):
"""author_label"""
ori_file = os.path.join(args.data_path, ori_file)
real_file = os.path.join(args.data_path, real_file)
new_file = os.path.join(args.output_path, new_file)
real_id2pgl_id = {}
with open(ori_file, 'r') as reader:
for line in reader:
slots = line.strip().split()
ori_id = int(slots[0])
real_id = int(slots[1])
pgl_id = ori_id2pgl_id[ori_id]
real_id2pgl_id[real_id] = pgl_id
with open(real_file, 'r') as reader, open(new_file, 'w') as writer:
for line in reader:
slots = line.strip().split()
real_id = int(slots[0])
label = int(slots[1])
pgl_id = real_id2pgl_id[real_id]
writer.write("%d\t%d\n" % (pgl_id, label))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='DBLP data preprocessing')
parser.add_argument(
'--data_path',
default=None,
type=str,
help='original data path(default: None)')
parser.add_argument(
'--output_path',
default=None,
type=str,
help='output path(default: None)')
args = parser.parse_args()
meta_node = OrderedDict()
meta_node['a'] = AUTHOR
meta_node['p'] = PAPER
meta_node['c'] = CONF
meta_node['t'] = TYPE
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
node_types_file = os.path.join(args.output_path, "node_types.txt")
nt_ori2new = build_node_types(meta_node, node_types_file)
remapping_index(args, nt_ori2new['p'], nt_ori2new['a'], 'paper_author.dat',
'paper_author.txt')
remapping_index(args, nt_ori2new['p'], nt_ori2new['c'],
'paper_conference.dat', 'paper_conference.txt')
remapping_index(args, nt_ori2new['p'], nt_ori2new['t'], 'paper_type.dat',
'paper_type.txt')
author_label(args, nt_ori2new['a'], 'author_map_id.dat',
'author_label.dat', 'author_label.txt')
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import sys
import os
import numpy as np
import pickle as pkl
import tqdm
import time
import random
from pgl.utils.logger import log
from pgl import heter_graph
class m2vGraph(object):
"""Implemetation of graph in order to sample metapath random walk.
"""
def __init__(self, config):
self.edge_path = config.edge_path
self.num_nodes = config.num_nodes
self.symmetry = config.symmetry
edge_files = config.edge_files
node_types_file = config.node_types_file
self.edge_file_list = []
for pair in edge_files.split(','):
e_type, filename = pair.split(':')
filename = os.path.join(self.edge_path, filename)
self.edge_file_list.append((e_type, filename))
self.node_types_file = os.path.join(self.edge_path, node_types_file)
self.build_graph()
def build_graph(self):
"""Build pgl heterogeneous graph.
"""
edges_by_types = {}
npy = self.edge_file_list[0][1] + ".npy"
if os.path.exists(npy):
log.info("load data from numpy file")
for pair in self.edge_file_list:
edges_by_types[pair[0]] = np.load(pair[1] + ".npy")
else:
log.info("load data from txt file")
for pair in self.edge_file_list:
edges_by_types[pair[0]] = self.load_edges(pair[1])
# np.save(pair[1] + ".npy", edges_by_types[pair[0]])
for e_type, edges in edges_by_types.items():
log.info(["number of %s edges: " % e_type, len(edges)])
if self.symmetry:
tmp = {}
for key, edges in edges_by_types.items():
n_list = key.split('2')
re_key = n_list[1] + '2' + n_list[0]
tmp[re_key] = edges_by_types[key][:, [1, 0]]
edges_by_types.update(tmp)
log.info(["finished loadding symmetry edges."])
node_types = self.load_node_types(self.node_types_file)
assert len(node_types) == self.num_nodes, \
"num_nodes should be equal to the length of node_types"
log.info(["number of nodes: ", len(node_types)])
node_features = {
'index': np.array([i for i in range(self.num_nodes)]).reshape(
-1, 1).astype(np.int64)
}
self.graph = heter_graph.HeterGraph(
num_nodes=self.num_nodes,
edges=edges_by_types,
node_types=node_types,
node_feat=node_features)
def load_edges(self, file_, symmetry=False):
"""Load edges from file.
"""
edges = []
with open(file_, 'r') as reader:
for line in reader:
items = line.strip().split()
src, dst = int(items[0]), int(items[1])
edges.append((src, dst))
if symmetry:
edges.append((dst, src))
edges = np.array(list(set(edges)), dtype=np.int64)
# edges = list(set(edges))
return edges
def load_node_types(self, file_):
"""Load node types
"""
node_types = []
log.info("node_types_file name: %s" % file_)
with open(file_, 'r') as reader:
for line in reader:
items = line.strip().split()
node_id = int(items[0])
n_type = items[1]
node_types.append((node_id, n_type))
return node_types
#!/bin/bash
set -x
source ./utils.sh
export CPU_NUM=$CPU_NUM
export FLAGS_rpc_deadline=3000000
export FLAGS_communicator_send_queue_size=1
export FLAGS_communicator_min_send_grad_num_before_recv=0
export FLAGS_communicator_max_merge_var_num=1
export FLAGS_communicator_merge_sparse_grad=0
python -u cluster_train.py -c config.yaml
#!/bin/bash
export PADDLE_TRAINERS_NUM=2
export PADDLE_PSERVERS_NUM=2
export PADDLE_PORT=6184,6185
export PADDLE_PSERVERS="127.0.0.1"
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
metapath2vec model.
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import math
import paddle.fluid.layers as L
import paddle.fluid as F
def distributed_embedding(input,
dict_size,
hidden_size,
initializer,
name,
num_part=16,
is_sparse=False,
learning_rate=1.0):
_part_size = hidden_size // num_part
if hidden_size % num_part != 0:
_part_size += 1
output_embedding = []
p_num = 0
while hidden_size > 0:
_part_size = min(_part_size, hidden_size)
hidden_size -= _part_size
print("part", p_num, "size=", (dict_size, _part_size))
part_embedding = L.embedding(
input=input,
size=(dict_size, int(_part_size)),
is_sparse=is_sparse,
is_distributed=False,
param_attr=F.ParamAttr(
name=name + '_part%s' % p_num,
initializer=initializer,
learning_rate=learning_rate))
p_num += 1
output_embedding.append(part_embedding)
return L.concat(output_embedding, -1)
class Metapath2vecModel(object):
def __init__(self, config, embedding_lr=1.0):
self.config = config
self.neg_num = self.config.neg_num
self.num_nodes = self.config.num_nodes
self.embed_dim = self.config.embed_dim
self.is_sparse = self.config.is_sparse
self.is_distributed = self.config.is_distributed
self.embedding_lr = embedding_lr
self.pyreader = L.py_reader(
capacity=70,
shapes=[[-1, 1, 1], [-1, self.neg_num + 1, 1]],
dtypes=['int64', 'int64'],
lod_levels=[0, 0],
name='train',
use_double_buffer=True)
bound = 1. / math.sqrt(self.embed_dim)
self.embed_init = F.initializer.Uniform(low=-bound, high=bound)
self.loss = None
max_hidden_size = int(math.pow(2, 31) / 4 / self.num_nodes)
self.num_part = int(math.ceil(1. * self.embed_dim / max_hidden_size))
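        # For the DBLP config here (num_nodes=37791, embed_dim=64) max_hidden_size
        # is ~14206, so num_part == 1 and forward() takes the plain L.embedding
        # branch; the column-partitioned distributed_embedding is only needed for
        # very large graphs.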
def forward(self):
src, dsts = L.read_file(self.pyreader)
if self.is_sparse:
src = L.reshape(src, [-1, 1])
dsts = L.reshape(dsts, [-1, 1])
if self.num_part is not None and self.num_part != 1 and not self.is_distributed:
src_embed = distributed_embedding(
src,
self.num_nodes,
self.embed_dim,
self.embed_init,
"weight",
self.num_part,
self.is_sparse,
learning_rate=self.embedding_lr)
dsts_embed = distributed_embedding(
dsts,
self.num_nodes,
self.embed_dim,
self.embed_init,
"weight",
self.num_part,
self.is_sparse,
learning_rate=self.embedding_lr)
else:
src_embed = L.embedding(
src, (self.num_nodes, self.embed_dim),
self.is_sparse,
self.is_distributed,
param_attr=F.ParamAttr(
name="weight",
learning_rate=self.embedding_lr,
initializer=self.embed_init))
dsts_embed = L.embedding(
dsts, (self.num_nodes, self.embed_dim),
self.is_sparse,
self.is_distributed,
param_attr=F.ParamAttr(
name="weight",
learning_rate=self.embedding_lr,
initializer=self.embed_init))
if self.is_sparse:
src_embed = L.reshape(src_embed, [-1, 1, self.embed_dim])
dsts_embed = L.reshape(dsts_embed,
[-1, self.neg_num + 1, self.embed_dim])
logits = L.matmul(
src_embed, dsts_embed,
transpose_y=True) # [batch_size, 1, neg_num+1]
pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
"float32", 1)
neg_label = L.fill_constant_batch_size_like(
logits, [-1, 1, self.neg_num], "float32", 0)
label = L.concat([pos_label, neg_label], -1)
pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
"float32", self.neg_num)
neg_weight = L.fill_constant_batch_size_like(
logits, [-1, 1, self.neg_num], "float32", 1)
weight = L.concat([pos_weight, neg_weight], -1)
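        # The positive pair is weighted by neg_num and each negative by 1, so after
        # reduce_mean and the (neg_num + 1) / (2 * neg_num) rescaling below, the
        # loss equals 0.5 * pos_loss + 0.5 * mean(neg_loss).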
weight.stop_gradient = True
label.stop_gradient = True
loss = L.sigmoid_cross_entropy_with_logits(logits, label)
loss = loss * weight
loss = L.reduce_mean(loss)
loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
loss.persistable = True
self.loss = loss
return loss
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimized Multiprocessing Reader for PaddlePaddle
"""
import multiprocessing
import numpy as np
import time
import paddle.fluid as fluid
import pyarrow
def _serialize_serializable(obj):
"""Serialize Feed Dict
"""
return {"type": type(obj), "data": obj.__dict__}
def _deserialize_serializable(obj):
"""Deserialize Feed Dict
"""
val = obj["type"].__new__(obj["type"])
val.__dict__.update(obj["data"])
return val
context = pyarrow.default_serialization_context()
context.register_type(
object,
"object",
custom_serializer=_serialize_serializable,
custom_deserializer=_deserialize_serializable)
def serialize_data(data):
"""serialize_data"""
return pyarrow.serialize(data, context=context).to_buffer().to_pybytes()
def deserialize_data(data):
"""deserialize_data"""
return pyarrow.deserialize(data, context=context)
def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
"""
    multiprocess_reader uses Python multiprocessing to read data from the given
    readers in parallel and then merges it through a multiprocessing.Queue or
    multiprocessing.Pipe. The number of processes equals the number of input
    readers; each process runs one reader.
    multiprocessing.Queue requires read/write access to /dev/shm, which some
    platforms do not support.
    You need to create multiple readers first; these readers should be
    independent of each other so that each process can work independently.
An example:
.. code-block:: python
reader0 = reader(["file01", "file02"])
reader1 = reader(["file11", "file12"])
reader1 = reader(["file21", "file22"])
reader = multiprocess_reader([reader0, reader1, reader2],
queue_size=100, use_pipe=False)
"""
assert type(readers) is list and len(readers) > 0
def _read_into_queue(reader, queue):
"""read_into_queue"""
for sample in reader():
if sample is None:
raise ValueError("sample has None")
queue.put(serialize_data(sample))
queue.put(serialize_data(None))
def queue_reader():
"""queue_reader"""
queue = multiprocessing.Queue(queue_size)
for reader in readers:
p = multiprocessing.Process(
target=_read_into_queue, args=(reader, queue))
p.start()
reader_num = len(readers)
finish_num = 0
while finish_num < reader_num:
sample = deserialize_data(queue.get())
if sample is None:
finish_num += 1
else:
yield sample
def _read_into_pipe(reader, conn):
"""read_into_pipe"""
for sample in reader():
if sample is None:
raise ValueError("sample has None!")
conn.send(serialize_data(sample))
conn.send(serialize_data(None))
conn.close()
def pipe_reader():
"""pipe_reader"""
conns = []
for reader in readers:
parent_conn, child_conn = multiprocessing.Pipe()
conns.append(parent_conn)
p = multiprocessing.Process(
target=_read_into_pipe, args=(reader, child_conn))
p.start()
reader_num = len(readers)
finish_num = 0
conn_to_remove = []
finish_flag = np.zeros(len(conns), dtype="int32")
while finish_num < reader_num:
for conn_id, conn in enumerate(conns):
if finish_flag[conn_id] > 0:
continue
buff = conn.recv()
now = time.time()
sample = deserialize_data(buff)
out = time.time() - now
if sample is None:
finish_num += 1
conn.close()
finish_flag[conn_id] = 1
else:
yield sample
if use_pipe:
return pipe_reader
else:
return queue_reader
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file provides the multi-class classification task for evaluating the embeddings learned by the metapath2vec model.
"""
import argparse
import sys
import os
import tqdm
import time
import math
import logging
import random
import pickle as pkl
import numpy as np
import sklearn.metrics
from sklearn.metrics import f1_score
import pgl
import paddle.fluid as fluid
import paddle.fluid.layers as fl
def load_data(file_):
"""Load data for node classification.
"""
words_label = []
line_count = 0
with open(file_, 'r') as reader:
for line in reader:
line_count += 1
tokens = line.strip().split()
word, label = int(tokens[0]), int(tokens[1]) - 1
words_label.append((word, label))
words_label = np.array(words_label, dtype=np.int64)
np.random.shuffle(words_label)
logging.info('%d/%d word_label pairs have been loaded' %
(len(words_label), line_count))
return words_label
def node_classify_model(config):
"""Build node classify model.
"""
nodes = fl.data('nodes', shape=[None, 1], dtype='int64')
labels = fl.data('labels', shape=[None, 1], dtype='int64')
embed_nodes = fl.embedding(
input=nodes,
size=[config.num_nodes, config.embed_dim],
param_attr=fluid.ParamAttr(name='weight'))
embed_nodes.stop_gradient = True
probs = fl.fc(input=embed_nodes, size=config.num_labels, act='softmax')
predict = fl.argmax(probs, axis=-1)
loss = fl.cross_entropy(input=probs, label=labels)
loss = fl.reduce_mean(loss)
return {
'loss': loss,
'probs': probs,
'predict': predict,
'labels': labels,
}
def run_epoch(exe, prog, model, feed_dict, lr):
"""Run training process of every epoch.
"""
if lr is None:
loss, predict = exe.run(prog,
feed=feed_dict,
fetch_list=[model['loss'], model['predict']],
return_numpy=True)
lr_ = 0
else:
loss, predict, lr_ = exe.run(
prog,
feed=feed_dict,
fetch_list=[model['loss'], model['predict'], lr],
return_numpy=True)
macro_f1 = f1_score(feed_dict['labels'], predict, average="macro")
micro_f1 = f1_score(feed_dict['labels'], predict, average="micro")
return {
'loss': loss,
'pred': predict,
'lr': lr_,
'macro_f1': macro_f1,
'micro_f1': micro_f1
}
def main(args):
"""main function for training node classification task.
"""
words_label = load_data(args.dataset)
# split data for training and testing
split_position = int(words_label.shape[0] * args.train_percent)
train_words_label = words_label[0:split_position, :]
test_words_label = words_label[split_position:, :]
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
train_prog = fluid.Program()
test_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
model = node_classify_model(args)
test_prog = train_prog.clone(for_test=True)
with fluid.program_guard(train_prog, startup_prog):
lr = fl.polynomial_decay(args.lr, 1000, 0.001)
adam = fluid.optimizer.Adam(lr)
adam.minimize(model['loss'])
exe = fluid.Executor(place)
exe.run(startup_prog)
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(args.ckpt_path, var.name))
fluid.io.load_vars(
exe, args.ckpt_path, main_program=train_prog, predicate=existed_params)
# load_param(args.ckpt_path, ['content'])
feed_dict = {}
X = train_words_label[:, 0].reshape(-1, 1)
labels = train_words_label[:, 1].reshape(-1, 1)
logging.info('%d/%d data to train' %
(labels.shape[0], words_label.shape[0]))
test_feed_dict = {}
test_X = test_words_label[:, 0].reshape(-1, 1)
test_labels = test_words_label[:, 1].reshape(-1, 1)
logging.info('%d/%d data to test' %
(test_labels.shape[0], words_label.shape[0]))
for epoch in range(args.epochs):
feed_dict['nodes'] = X
feed_dict['labels'] = labels
train_result = run_epoch(exe, train_prog, model, feed_dict, lr)
test_feed_dict['nodes'] = test_X
test_feed_dict['labels'] = test_labels
test_result = run_epoch(exe, test_prog, model, test_feed_dict, lr=None)
logging.info(
'epoch %d | lr %.4f | train_loss %.5f | train_macro_F1 %.4f | train_micro_F1 %.4f | test_loss %.5f | test_macro_F1 %.4f | test_micro_F1 %.4f'
% (epoch, train_result['lr'], train_result['loss'],
train_result['macro_f1'], train_result['micro_f1'],
test_result['loss'], test_result['macro_f1'],
test_result['micro_f1']))
logging.info(
'final_test_macro_f1 score: %.4f | final_test_micro_f1 score: %.4f' %
(test_result['macro_f1'], test_result['micro_f1']))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='multi_class')
parser.add_argument(
'--dataset',
default=None,
type=str,
help='training and testing data file(default: None)')
parser.add_argument(
        '--ckpt_path', default=None, type=str, help='checkpoint path(default: None)')
parser.add_argument("--use_cuda", action='store_true', help="use_cuda")
parser.add_argument(
'--train_percent',
default=0.5,
type=float,
help='train_percent(default: 0.5)')
parser.add_argument(
'--num_labels',
default=4,
type=int,
help='number of labels(default: 4)')
parser.add_argument(
'--epochs',
default=100,
type=int,
help='number of epochs for training(default: 100)')
parser.add_argument(
'--lr',
default=0.025,
type=float,
help='learning rate(default: 0.025)')
parser.add_argument(
'--num_nodes', default=0, type=int, help='number of nodes')
parser.add_argument(
'--embed_dim',
default=64,
type=int,
help='dimension of embedding(default: 64)')
args = parser.parse_args()
log_format = '%(asctime)s-%(levelname)s-%(name)s: %(message)s'
logging.basicConfig(level='INFO', format=log_format)
main(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of some helper functions"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import os
import time
import yaml
import numpy as np
from pgl.utils.logger import log
class AttrDict(dict):
"""Attr dict
"""
def __init__(self, d):
self.dict = d
def __getattr__(self, attr):
value = self.dict[attr]
if isinstance(value, dict):
return AttrDict(value)
else:
return value
def __str__(self):
return str(self.dict)
def load_config(config_file):
"""Load config file"""
with open(config_file) as f:
if hasattr(yaml, 'FullLoader'):
config = yaml.load(f, Loader=yaml.FullLoader)
else:
config = yaml.load(f)
return AttrDict(config)
# parse yaml file
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
eval $(parse_yaml "$(dirname "${BASH_SOURCE}")"/config.yaml)
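# Example: for the config.yaml in this directory, the eval above defines shell
# variables such as log_dir="./logs" and CPU_NUM="16", which cloud_run.sh and
# job.sh read as ${log_dir} and $CPU_NUM.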
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""doc
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import time
import io
import os
import numpy as np
import random
from pgl.utils.logger import log
from pgl.sample import metapath_randomwalk
from pgl.graph_kernel import skip_gram_gen_pair
from pgl.graph_kernel import alias_sample_build_table
from utils import load_config
from graph import m2vGraph
import mp_reader
class NodeGenerator(object):
"""Node generator"""
def __init__(self, config, graph):
self.config = config
self.graph = graph
self.batch_size = self.config.batch_size
self.shuffle = self.config.node_shuffle
self.node_files = self.config.node_files
self.first_node_type = self.config.first_node_type
self.walk_mode = self.config.walk_mode
def __call__(self):
if self.walk_mode == "m2v":
generator = self.m2v_node_generate
log.info("node gen mode is : %s" % (self.walk_mode))
elif self.walk_mode == "multi_m2v":
generator = self.multi_m2v_node_generate
log.info("node gen mode is : %s" % (self.walk_mode))
elif self.walk_mode == "files":
generator = self.files_node_generate
log.info("node gen mode is : %s" % (self.walk_mode))
else:
generator = self.m2v_node_generate
log.info("node gen mode is : %s" % (self.walk_mode))
while True:
for nodes in generator():
yield nodes
def m2v_node_generate(self):
"""m2v_node_generate"""
for nodes in self.graph.node_batch_iter(
batch_size=self.batch_size,
n_type=self.first_node_type,
shuffle=self.shuffle):
yield nodes
def multi_m2v_node_generate(self):
"""multi_m2v_node_generate"""
n_type_list = self.first_node_type.split(';')
num_n_type = len(n_type_list)
node_types = np.unique(self.graph.node_types).tolist()
node_generators = {}
for n_type in node_types:
node_generators[n_type] = \
self.graph.node_batch_iter(self.batch_size, n_type=n_type)
cc = 0
while True:
idx = cc % num_n_type
n_type = n_type_list[idx]
try:
                nodes = next(node_generators[n_type])
except StopIteration as e:
log.info("exception when iteration")
break
yield (nodes, idx)
cc += 1
def files_node_generate(self):
"""files_node_generate"""
nodes = []
for filename in self.node_files:
with io.open(filename) as inf:
for line in inf:
node = int(line.strip('\n\t'))
nodes.append(node)
if len(nodes) == self.batch_size:
yield nodes
nodes = []
if len(nodes):
yield nodes
class WalkGenerator(object):
"""Walk generator"""
def __init__(self, config, dataset):
self.config = config
self.dataset = dataset
self.graph = self.dataset.graph
self.walk_mode = self.config.walk_mode
self.node_generator = NodeGenerator(self.config, self.graph)
if self.walk_mode == "multi_m2v":
num_path = len(self.config.meta_path.split(';'))
num_first_node_type = len(self.config.first_node_type.split(';'))
assert num_first_node_type == num_path, \
"In [multi_m2v] walk_mode, the number of metapath should be the same \
as the number of first_node_type"
assert num_path > 1, "In [multi_m2v] walk_mode, the number of metapath\
should be greater than 1"
def __call__(self):
np.random.seed(os.getpid())
if self.walk_mode == "m2v":
walk_generator = self.m2v_walk
log.info("walk mode is : %s" % (self.walk_mode))
elif self.walk_mode == "multi_m2v":
walk_generator = self.multi_m2v_walk
log.info("walk mode is : %s" % (self.walk_mode))
else:
raise ValueError("walk_mode [%s] is not matched" % self.walk_mode)
for walks in walk_generator():
yield walks
def m2v_walk(self):
"""Metapath2vec walker"""
for nodes in self.node_generator():
walks = metapath_randomwalk(
self.graph, nodes, self.config.meta_path, self.config.walk_len)
yield walks
def multi_m2v_walk(self):
"""Multi metapath2vec walker"""
meta_paths = self.config.meta_path.split(';')
for nodes, idx in self.node_generator():
walks = metapath_randomwalk(self.graph, nodes, meta_paths[idx],
self.config.walk_len)
yield walks
class DataGenerator(object):
def __init__(self, config, dataset):
self.config = config
self.dataset = dataset
self.graph = self.dataset.graph
self.walk_generator = WalkGenerator(self.config, self.dataset)
def __call__(self):
generator = self.pair_generate
for src, pos, negs in generator():
dst = np.concatenate([pos, negs], 1)
yield src, dst
def pair_generate(self):
for walks in self.walk_generator():
try:
src_list, pos_list = [], []
for walk in walks:
s, p = skip_gram_gen_pair(walk, self.config.win_size)
src_list.append(s), pos_list.append(p)
src = [s for x in src_list for s in x]
pos = [s for x in pos_list for s in x]
if len(src) == 0:
continue
negs = self.negative_sample(
src,
pos,
neg_num=self.config.neg_num,
neg_sample_type=self.config.neg_sample_type)
src = np.array(src, dtype=np.int64).reshape(-1, 1, 1)
pos = np.array(pos, dtype=np.int64).reshape(-1, 1, 1)
yield src, pos, negs
except Exception as e:
log.exception(e)
def negative_sample(self, src, pos, neg_num, neg_sample_type):
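        # "average": negatives are drawn uniformly from all nodes in the graph;
        # "m2v_plus": negatives are drawn from nodes of the same type as each source node.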
if neg_sample_type == "average":
neg_sample_size = [len(pos), neg_num, 1]
negs = np.random.randint(
low=0, high=self.graph.num_nodes, size=neg_sample_size)
elif neg_sample_type == "m2v_plus":
negs = []
for s in src:
neg = self.graph.sample_nodes(
sample_num=neg_num, n_type=self.graph.node_types[s])
negs.append(neg)
negs = np.vstack(negs).reshape(-1, neg_num, 1)
else: # equal to "average"
neg_sample_size = [len(pos), neg_num, 1]
negs = np.random.randint(
low=0, high=self.graph.num_nodes, size=neg_sample_size)
negs = negs.astype(np.int64)
return negs
def multiprocess_data_generator(config, dataset):
"""Multiprocess data generator.
"""
if config.num_sample_workers == 1:
data_generator = DataGenerator(config, dataset)
else:
pool = [
DataGenerator(config, dataset)
for i in range(config.num_sample_workers)
]
data_generator = mp_reader.multiprocess_reader(
pool, use_pipe=True, queue_size=100)
return data_generator
if __name__ == "__main__":
config_file = "./config.yaml"
config = load_config(config_file)
dataset = m2vGraph(config)
data_generator = multiprocess_data_generator(config, dataset)
start = time.time()
cc = 0
for src, dst in data_generator():
log.info(src.shape)
log.info("time: %.6f" % (time.time() - start))
start = time.time()
cc += 1
if cc == 100:
break
......@@ -19,8 +19,8 @@ import pgl
import time
from pgl.utils import mp_reader
from pgl.utils.logger import log
import train
import time
import copy
def node_batch_iter(nodes, node_label, batch_size):
......@@ -46,12 +46,11 @@ def traverse(item):
yield item
def flat_node_and_edge(nodes, eids):
def flat_node_and_edge(nodes):
"""flat_node_and_edge
"""
nodes = list(set(traverse(nodes)))
eids = list(set(traverse(eids)))
return nodes, eids
return nodes
def worker(batch_info, graph, graph_wrapper, samples):
......@@ -61,31 +60,42 @@ def worker(batch_info, graph, graph_wrapper, samples):
def work():
"""work
"""
first = True
_graph_wrapper = copy.copy(graph_wrapper)
_graph_wrapper.node_feat_tensor_dict = {}
for batch_train_samples, batch_train_labels in batch_info:
start_nodes = batch_train_samples
nodes = start_nodes
eids = []
edges = []
for max_deg in samples:
pred, pred_eid = graph.sample_predecessor(
start_nodes, max_degree=max_deg, return_eids=True)
pred_nodes = graph.sample_predecessor(
start_nodes, max_degree=max_deg)
for dst_node, src_nodes in zip(start_nodes, pred_nodes):
for src_node in src_nodes:
edges.append((src_node, dst_node))
last_nodes = nodes
nodes = [nodes, pred]
eids = [eids, pred_eid]
nodes, eids = flat_node_and_edge(nodes, eids)
nodes = [nodes, pred_nodes]
nodes = flat_node_and_edge(nodes)
# Find new nodes
start_nodes = list(set(nodes) - set(last_nodes))
if len(start_nodes) == 0:
break
subgraph = graph.subgraph(nodes=nodes, eid=eids)
subgraph = graph.subgraph(
nodes=nodes,
edges=edges,
with_node_feat=False,
with_edge_feat=False)
sub_node_index = subgraph.reindex_from_parrent_nodes(
batch_train_samples)
feed_dict = graph_wrapper.to_feed(subgraph)
feed_dict = _graph_wrapper.to_feed(subgraph)
feed_dict["node_label"] = np.expand_dims(
np.array(
batch_train_labels, dtype="int64"), -1)
feed_dict["node_index"] = sub_node_index
feed_dict["parent_node_index"] = np.array(nodes, dtype="int64")
yield feed_dict
return work
......@@ -97,23 +107,25 @@ def multiprocess_graph_reader(graph,
node_index,
batch_size,
node_label,
with_parent_node_index=False,
num_workers=4):
"""multiprocess_graph_reader
"""
def parse_to_subgraph(rd):
def parse_to_subgraph(rd, prefix, node_feat, _with_parent_node_index):
"""parse_to_subgraph
"""
def work():
"""work
"""
last = time.time()
for data in rd():
this = time.time()
feed_dict = data
now = time.time()
last = now
for key in node_feat:
feed_dict[prefix + '/node_feat/' + key] = node_feat[key][
feed_dict["parent_node_index"]]
if not _with_parent_node_index:
del feed_dict["parent_node_index"]
yield feed_dict
return work
......@@ -129,46 +141,17 @@ def multiprocess_graph_reader(graph,
reader_pool.append(
worker(batch_info[block_size * i:block_size * (i + 1)], graph,
graph_wrapper, samples))
multi_process_sample = mp_reader.multiprocess_reader(
reader_pool, use_pipe=True, queue_size=1000)
r = parse_to_subgraph(multi_process_sample)
return paddle.reader.buffered(r, 1000)
return reader()
def graph_reader(graph, graph_wrapper, samples, node_index, batch_size,
node_label):
"""graph_reader"""
def reader():
"""reader"""
for batch_train_samples, batch_train_labels in node_batch_iter(
node_index, node_label, batch_size=batch_size):
start_nodes = batch_train_samples
nodes = start_nodes
eids = []
for max_deg in samples:
pred, pred_eid = graph.sample_predecessor(
start_nodes, max_degree=max_deg, return_eids=True)
last_nodes = nodes
nodes = [nodes, pred]
eids = [eids, pred_eid]
nodes, eids = flat_node_and_edge(nodes, eids)
# Find new nodes
start_nodes = list(set(nodes) - set(last_nodes))
if len(start_nodes) == 0:
break
subgraph = graph.subgraph(nodes=nodes, eid=eids)
feed_dict = graph_wrapper.to_feed(subgraph)
sub_node_index = subgraph.reindex_from_parrent_nodes(
batch_train_samples)
if len(reader_pool) == 1:
r = parse_to_subgraph(reader_pool[0],
repr(graph_wrapper), graph.node_feat,
with_parent_node_index)
else:
multi_process_sample = mp_reader.multiprocess_reader(
reader_pool, use_pipe=True, queue_size=1000)
r = parse_to_subgraph(multi_process_sample,
repr(graph_wrapper), graph.node_feat,
with_parent_node_index)
return paddle.reader.buffered(r, num_workers)
feed_dict["node_label"] = np.expand_dims(
np.array(
batch_train_labels, dtype="int64"), -1)
feed_dict["node_index"] = np.array(sub_node_index, dtype="int32")
yield feed_dict
return paddle.reader.buffered(reader, 1000)
return reader()
......@@ -63,10 +63,7 @@ def load_data(normalize=True, symmetry=True):
log.info("Feature shape %s" % (repr(feature.shape)))
graph = pgl.graph.Graph(
num_nodes=feature.shape[0],
edges=list(zip(src, dst)),
node_feat={"index": np.arange(
0, len(feature), dtype="int64")})
num_nodes=feature.shape[0], edges=list(zip(src, dst)))
return {
"graph": graph,
......@@ -89,7 +86,13 @@ def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type,
node_label = fluid.layers.data(
"node_label", shape=[None, 1], dtype="int64", append_batch_size=False)
feature = fluid.layers.gather(feature, graph_wrapper.node_feat['index'])
parent_node_index = fluid.layers.data(
"parent_node_index",
shape=[None],
dtype="int64",
append_batch_size=False)
feature = fluid.layers.gather(feature, parent_node_index)
feature.stop_gradient = True
for i in range(k_hop):
......@@ -221,59 +224,35 @@ def main(args):
exe.run(startup_program)
feature_init(place)
if args.sample_workers > 1:
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
else:
train_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
if args.sample_workers > 1:
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
else:
val_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
if args.sample_workers > 1:
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
else:
test_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
with_parent_node_index=True,
node_index=data['train_index'],
node_label=data["train_label"])
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
with_parent_node_index=True,
node_index=data['val_index'],
node_label=data["val_label"])
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
with_parent_node_index=True,
node_index=data['test_index'],
node_label=data["test_label"])
for epoch in range(args.epoch):
run_epoch(
......
......@@ -195,7 +195,7 @@ def run_epoch(batch_iter,
if num_trainer > 1:
num_samples = sum(
[len(batch["node_index"]) for batch in batch_feed_dict])
[len(_batch["node_index"]) for _batch in batch_feed_dict])
else:
num_samples = len(batch_feed_dict["node_index"])
total_loss += batch_loss * num_samples
......@@ -262,59 +262,32 @@ def main(args):
else:
train_exe = exe
if args.sample_workers > 1:
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
else:
train_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
if args.sample_workers > 1:
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
else:
val_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
if args.sample_workers > 1:
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
else:
test_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
for epoch in range(args.epoch):
run_epoch(
......
......@@ -97,11 +97,7 @@ def load_data(normalize=True, symmetry=True, scale=1):
graph = pgl.graph.Graph(
num_nodes=feature.shape[0],
edges=edges,
node_feat={
"index": np.arange(
0, len(feature), dtype="int64"),
"feature": feature
})
node_feat={"feature": feature})
return {
"graph": graph,
......@@ -244,59 +240,32 @@ def main(args):
test_program = train_program.clone(for_test=True)
if args.sample_workers > 1:
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
else:
train_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
if args.sample_workers > 1:
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
else:
val_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
if args.sample_workers > 1:
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
else:
test_iter = reader.graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
train_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['train_index'],
node_label=data["train_label"])
val_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['val_index'],
node_label=data["val_label"])
test_iter = reader.multiprocess_graph_reader(
data['graph'],
graph_wrapper,
samples=samples,
num_workers=args.sample_workers,
batch_size=args.batch_size,
node_index=data['test_index'],
node_label=data["test_label"])
with fluid.program_guard(train_program, startup_program):
adam = fluid.optimizer.Adam(learning_rate=args.lr)
......
......@@ -23,7 +23,7 @@ import tqdm
import time
import logging
import random
from pgl.contrib import heter_graph
from pgl import heter_graph
import pickle as pkl
......@@ -40,8 +40,10 @@ class Dataset(object):
def __init__(self, config):
self.config = config
self.walk_files = config['input_path'] + config['walk_path']
self.word2id_file = config['input_path'] + config['word2id_file']
self.walk_files = os.path.join(config['input_path'],
config['walk_path'])
self.word2id_file = os.path.join(config['input_path'],
config['word2id_file'])
self.word2freq = {}
self.word2id = {}
......@@ -65,12 +67,16 @@ class Dataset(object):
for walk_file in glob.glob(self.walk_files):
with open(walk_file, 'r') as reader:
for walk in reader:
walk = walk.strip().split(' ')
walk = walk.strip().split()
if len(walk) > 1:
self.sentences_count += 1
for word in walk:
self.token_count += 1
word_freq[word] = word_freq.get(word, 0) + 1
if int(word) >= self.config[
'paper_start_index']: # remove paper
continue
else:
self.token_count += 1
word_freq[word] = word_freq.get(word, 0) + 1
wid = 0
logging.info('Read %d sentences.' % self.sentences_count)
......@@ -123,7 +129,11 @@ class Dataset(object):
for filename in walkpath_files:
with open(filename) as reader:
for line in reader:
words = line.strip().split(' ')
words = line.strip().split()
words = [
w for w in words
if int(w) < self.config['paper_start_index']
]
if len(words) > 1:
word_ids = [
self.word2id[w] for w in words if w in self.word2id
......
......@@ -13,9 +13,12 @@ sampler:
new_author_label_file: author_label.txt
new_venue_label_file: venue_label.txt
walk_saved_path: walks/
walk_batch_size: 1000
num_walks: 1000
walk_length: 100
metapath: conf-paper-author-paper-conf
num_sample_workers: 16
first_node_type: conf
metapath: c2p-p2a-a2p-p2c #conf-paper-author-paper-conf
optimizer:
type: Adam
......@@ -39,9 +42,10 @@ data_loader:
walk_path: walks/*
word2id_file: word2id.pkl
batch_size: 32
win_size: 7 # default: 7
win_size: 5 # default: 7
neg_num: 5
min_count: 10
paper_start_index: 1697414
model:
type: SkipgramModel
......
......@@ -101,7 +101,7 @@ class SkipgramModel(object):
pos_score = fl.squeeze(pos_logits, axes=[1])
pos_score = fl.clip(pos_score, min=-10, max=10)
pos_score = -1.0 * fl.logsigmoid(pos_score)
pos_score = -self.neg_num * fl.logsigmoid(pos_score)
neg_logits = fl.matmul(
embed_src, weight_negs,
......@@ -111,4 +111,4 @@ class SkipgramModel(object):
neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)
neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)
self.loss = fl.reduce_mean(pos_score + neg_score)
self.loss = fl.reduce_mean(pos_score + neg_score) / self.neg_num / 2
......@@ -18,6 +18,7 @@ training metapath2vec model.
import multiprocessing
from multiprocessing import Pool
from multiprocessing import Process
import argparse
import sys
import os
......@@ -27,7 +28,7 @@ import tqdm
import time
import logging
import random
from pgl.contrib import heter_graph
from pgl import heter_graph
from pgl.sample import metapath_randomwalk
from utils import *
......@@ -77,9 +78,14 @@ class Sampler(object):
self.config['data_path'] + 'paper_conf.txt', self.paper_id2index,
self.conf_id2index)
edges_by_types['edge'] = paper_author_edges + paper_conf_edges
logging.info('%d edges have been loaded.' %
(len(edges_by_types['edge'])))
# edges_by_types['edge'] = paper_author_edges + paper_conf_edges
edges_by_types['p2c'] = paper_conf_edges
edges_by_types['c2p'] = [(dst, src) for src, dst in paper_conf_edges]
edges_by_types['p2a'] = paper_author_edges
edges_by_types['a2p'] = [(dst, src) for src, dst in paper_author_edges]
# logging.info('%d edges have been loaded.' %
# (len(edges_by_types['edge'])))
node_features = {
'index': np.array([i for i in range(num_nodes)]).reshape(
......@@ -110,7 +116,7 @@ class Sampler(object):
return id2index, name2index, node_types
def load_edges(self, file_, src2index, dst2index, symmetry=True):
def load_edges(self, file_, src2index, dst2index, symmetry=False):
"""Load edges from file.
"""
edges = []
......@@ -143,41 +149,65 @@ class Sampler(object):
return index_label_list
def generate_walks(args):
"""Generate metapath random walk and save to file.
def walk_generator(graph, batch_size, metapath, n_type, walk_length):
"""Generate metapath random walk.
"""
g, meta_path, filename, walk_length = args
walks = []
node_types = g._node_types
first_type = meta_path.split('-')[0]
nodes = np.where(node_types == first_type)[0]
if len(nodes) > 4000:
nodes = np.random.choice(nodes, 4000, replace=False)
logging.info('%d number of start nodes' % (len(nodes)))
logging.info('save walks in file: %s' % (filename))
np.random.seed(os.getpid())
while True:
for start_nodes in graph.node_batch_iter(
batch_size=batch_size, n_type=n_type):
walks = metapath_randomwalk(
graph=graph,
start_nodes=start_nodes,
metapath=metapath,
walk_length=walk_length)
yield walks
def walk_to_files(g, batch_size, metapath, n_type, walk_length, max_num,
filename):
"""Generate metapath randomwalk and save in files"""
# g, batch_size, metapath, n_type, walk_length, max_num, filename = args
with open(filename, 'w') as writer:
for start_node in nodes:
walk = metapath_randomwalk(g, start_node, meta_path, walk_length)
walk = [str(walk[i]) for i in range(0, len(walk), 2)] # skip paper
writer.write(' '.join(walk) + '\n')
cc = 0
for walks in walk_generator(g, batch_size, metapath, n_type,
walk_length):
for walk in walks:
writer.write("%s\n" % "\t".join([str(i) for i in walk]))
cc += 1
if cc == max_num:
return
return
def multiprocess_generate_walks_to_files(graph, n_type, meta_path, num_walks,
walk_length, batch_size,
num_sample_workers, saved_path):
"""Use multiprocess to generate metapath random walk to files.
"""
num_nodes_by_type = graph.num_nodes_by_type(n_type)
logging.info("num_nodes_by_type: %s" % num_nodes_by_type)
max_num = (num_walks * num_nodes_by_type // num_sample_workers) + 1
logging.info("max sample number of every worker: %s" % max_num)
def multiprocess_generate_walks(sampler, edge_type, meta_path, num_walks,
walk_length, saved_path):
"""Use multiprocess to generate metapath random walk.
"""
args = []
for i in range(num_walks):
filename = saved_path + '%04d' % (i)
args.append(
(sampler.graph[edge_type], meta_path, filename, walk_length))
pool = Pool(16)
pool.map(generate_walks, args)
pool.close()
pool.join()
for i in range(num_sample_workers):
filename = os.path.join(saved_path, 'part-%05d' % (i))
args.append((graph, batch_size, meta_path, n_type, walk_length,
max_num, filename))
ps = []
for i in range(num_sample_workers):
p = Process(target=walk_to_files, args=args[i])
p.start()
ps.append(p)
for i in range(num_sample_workers):
ps[i].join()
# pool = Pool(num_sample_workers)
# pool.map(walk_to_files, args)
# pool.close()
# pool.join()
if __name__ == "__main__":
......@@ -220,13 +250,15 @@ if __name__ == "__main__":
begin = time.time()
logging.info('multi process sampling')
multiprocess_generate_walks(
sampler=sampler,
edge_type='edge',
multiprocess_generate_walks_to_files(
graph=sampler.graph,
n_type=config['first_node_type'],
meta_path=config['metapath'],
num_walks=config['num_walks'],
walk_length=config['walk_length'],
saved_path=config['walk_saved_path'])
batch_size=config['walk_batch_size'],
num_sample_workers=config['num_sample_workers'],
saved_path=config['walk_saved_path'], )
logging.info('total time: %.4f' % (time.time() - begin))
logging.info('generating multi class data')
......
# STGCN: Spatio-Temporal Graph Convolutional Network
[Spatio-Temporal Graph Convolutional Network \(STGCN\)](https://arxiv.org/pdf/1709.04875.pdf) is a novel deep learning framework for time series prediction on graphs. Based on PGL, we reproduce the STGCN algorithm to predict newly confirmed patients in several cities from historical migration records.
### Datasets
You can build a customized dataset in the following format (a minimal sketch for generating placeholder files is shown after the list):
* input.csv: Historical migration records with shape of [num\_time\_steps * num\_cities].
* output.csv: Newly confirmed patient records with shape of [num\_time\_steps * num\_cities].
* W.csv: Weighted Adjacency Matrix with shape of [num\_cities * num\_cities].
* city.csv: Each line is a number and the corresponding city name.
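Below is a minimal sketch (not part of the repo) that writes placeholder files with these shapes. It assumes `numpy` and an existing `dataset/` directory; the exact header and column conventions (for example a leading `date` column) should follow your own preprocessing.

```python
import numpy as np

num_time_steps, num_cities = 30, 74
# input.csv : historical migration records, [num_time_steps, num_cities]
np.savetxt("dataset/input.csv",
           np.random.randint(0, 1000, (num_time_steps, num_cities)), fmt="%d", delimiter=",")
# output.csv: newly confirmed patient records, [num_time_steps, num_cities]
np.savetxt("dataset/output.csv",
           np.random.randint(0, 50, (num_time_steps, num_cities)), fmt="%d", delimiter=",")
# W.csv     : weighted adjacency matrix, [num_cities, num_cities]
np.savetxt("dataset/W.csv", np.random.rand(num_cities, num_cities), delimiter=",")
# city.csv  : one "number,city_name" pair per city
with open("dataset/city.csv", "w") as f:
    f.write("num,city\n")
    for i in range(num_cities):
        f.write("%d,city_%d\n" % (i + 1, i))
```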
### Dependencies
- paddlepaddle 1.6
- pgl 1.0.0
### How to run
For example, to train STGCN on your dataset with GPU:
```
python main.py --use_cuda --input_file dataset/input.csv --label_file dataset/output.csv --adj_mat_file dataset/W.csv --city_file dataset/city.csv
```
#### Hyperparameters
- n\_route: Number of cities.
- n\_his: Number of previous time steps of migration records used as model input.
- n\_pred: Number of future time steps of newly confirmed patient records to predict.
- Ks: Number of GCN layers.
- Kt: Kernel size of the temporal convolution.
- use\_cuda: Train on GPU if this flag is set.
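As a rough illustration (an assumption based on the data loader in this PR, which stacks `n_his + n_pred` frames per sample), `n_his` and `n_pred` partition each sample into observed history and prediction targets:

```python
import numpy as np

n_his, n_pred, n_route = 23, 3, 74
sample = np.random.rand(1, n_his + n_pred, n_route, 1)  # one training sample
x = sample[:, :n_his, :, :]   # n_his history frames of migration records
y = sample[:, n_his:, :, :]   # remaining n_pred frames used as prediction targets
print(x.shape, y.shape)       # (1, 23, 74, 1) (1, 3, 74, 1)
```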
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""data processing
"""
import numpy as np
import pandas as pd
from utils.math_utils import z_score
class Dataset(object):
"""Dataset
"""
def __init__(self, data, stats):
self.__data = data
self.mean = stats['mean']
self.std = stats['std']
def get_data(self, type): # type: train, val or test
return self.__data[type]
def get_stats(self):
return {'mean': self.mean, 'std': self.std}
def get_len(self, type):
return len(self.__data[type])
def z_inverse(self, type):
return self.__data[type] * self.std + self.mean
def seq_gen(len_seq, data_seq, offset, n_frame, n_route, day_slot, C_0=1):
"""Generate data in the form of standard sequence unit."""
n_slot = day_slot - n_frame + 1
tmp_seq = np.zeros((len_seq * n_slot, n_frame, n_route, C_0))
for i in range(len_seq):
for j in range(n_slot):
sta = (i + offset) * day_slot + j
end = sta + n_frame
tmp_seq[i * n_slot + j, :, :, :] = np.reshape(
data_seq[sta:end, :], [n_frame, n_route, C_0])
return tmp_seq
def adj_matrx_gen_custom(input_file, city_file):
"""genenrate Adjacency Matrix from file
"""
print("generate adj_matrix data (take long time)...")
# data
df = pd.read_csv(
input_file,
sep='\t',
names=['date', '迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数'])
# keep only the 2020 records
df['date'] = pd.to_datetime(df['date'], format="%Y%m%d")
df = df.set_index('date')
df = df['2020']
city_df = pd.read_csv(city_file)
# drop Wuhan (the first row)
city_df = city_df.drop(0)
num = len(city_df)
matrix = np.zeros([num, num])
for i in city_df['city']:
for j in city_df['city']:
if (i == j):
continue
# select the daily migration counts from city i to city j
cut = df[df['迁出城市'].str.contains(i)]
cut = cut[cut['迁入城市'].str.contains(j)]
# use the mean daily count as the edge weight
average = cut['人数'].mean()
# write the weight into the matrix
i_index = int(city_df[city_df['city'] == i]['num']) - 1
j_index = int(city_df[city_df['city'] == j]['num']) - 1
matrix[i_index, j_index] = average
np.savetxt("dataset/W_74.csv", matrix, delimiter=",")
def data_gen_custom(input_file, output_file, city_file, n, n_his, n_pred,
n_config):
"""data_gen_custom"""
print("generate training data...")
# data
df = pd.read_csv(
input_file,
sep='\t',
names=['date', '迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数'])
# keep only the 2020 records
df['date'] = pd.to_datetime(df['date'], format="%Y%m%d")
df = df.set_index('date')
df = df['2020']
city_df = pd.read_csv(city_file)
input_df = pd.DataFrame()
out_df_wuhan = df[df['迁出城市'].str.contains('武汉')]
for i in city_df['city']:
# select records whose destination city is i
in_df_i = out_df_wuhan[out_df_wuhan['迁入城市'].str.contains(i)]
# ensure ascending time order
# in_df_i.sort_values("date",inplace=True)
# reset the index so the series aligns by position when inserted as a column
in_df_i.reset_index(drop=True, inplace=True)
input_df[i] = in_df_i['人数']
# replace NaN values with 0
input_df = input_df.replace(np.nan, 0)
x = input_df
y = pd.read_csv(output_file)
# drop the unnamed index column
x.drop(
x.columns[x.columns.str.contains(
'unnamed', case=False)],
axis=1,
inplace=True)
y = y.drop(columns=['date'])
# drop the Wuhan column (records of migration into Wuhan)
x = x.drop(columns=['武汉'])
y = y.drop(columns=['武汉'])
# param
n_val, n_test = n_config
n_train = len(y) - n_val - n_test - 2
# build samples of shape [num_samples, n_his + n_pred, n_route, 1]
df = pd.DataFrame(columns=x.columns)
for i in range(len(y) - n_pred + 1):
df = df.append(x[i:i + n_his])
df = df.append(y[i:i + n_pred])
data = df.values.reshape(-1, n_his + n_pred, n,
1) # n == num_nodes == city num
x_stats = {'mean': np.mean(data), 'std': np.std(data)}
x_train = data[:n_train]
x_val = data[n_train:n_train + n_val]
x_test = data[n_train + n_val:]
x_data = {'train': x_train, 'val': x_val, 'test': x_test}
dataset = Dataset(x_data, x_stats)
print("generate successfully!")
return dataset
def data_gen_mydata(input_file, label_file, n, n_his, n_pred, n_config):
"""data processing
"""
# data
x = pd.read_csv(input_file)
y = pd.read_csv(label_file)
x = x.drop(columns=['date'])
y = y.drop(columns=['date'])
x = x.drop(columns=['武汉'])
y = y.drop(columns=['武汉'])
# param
n_val, n_test = n_config
n_train = len(y) - n_val - n_test - 2
# build samples of shape [num_samples, n_his + n_pred, n_route, 1]
df = pd.DataFrame(columns=x.columns)
for i in range(len(y) - n_pred + 1):
df = df.append(x[i:i + n_his])
df = df.append(y[i:i + n_pred])
data = df.values.reshape(-1, n_his + n_pred, n, 1)
x_stats = {'mean': np.mean(data), 'std': np.std(data)}
x_train = data[:n_train]
x_val = data[n_train:n_train + n_val]
x_test = data[n_train + n_val:]
x_data = {'train': x_train, 'val': x_val, 'test': x_test}
dataset = Dataset(x_data, x_stats)
return dataset
def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288):
"""Source file load and dataset generation."""
n_train, n_val, n_test = data_config
# generate training, validation and test data
try:
data_seq = pd.read_csv(file_path, header=None).values
except FileNotFoundError:
print(f'ERROR: input file was not found in {file_path}.')
raise
seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route,
day_slot)
# x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation.
x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}
# x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])
x_data = {'train': x_train, 'val': x_val, 'test': x_test}
dataset = Dataset(x_data, x_stats)
return dataset
def gen_batch(inputs, batch_size, dynamic_batch=False, shuffle=False):
"""Data iterator in batch.
Args:
inputs: np.ndarray, [len_seq, n_frame, n_route, C_0], standard sequence units.
batch_size: int, size of batch.
dynamic_batch: bool, whether changes the batch size in the last batch
if its length is less than the default.
shuffle: bool, whether shuffle the batches.
"""
len_inputs = len(inputs)
if shuffle:
idx = np.arange(len_inputs)
np.random.shuffle(idx)
for start_idx in range(0, len_inputs, batch_size):
end_idx = start_idx + batch_size
if end_idx > len_inputs:
if dynamic_batch:
end_idx = len_inputs
else:
break
if shuffle:
slide = idx[start_idx:end_idx]
else:
slide = slice(start_idx, end_idx)
yield inputs[slide]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PGL Graph
"""
import sys
import os
import numpy as np
import pandas as pd
from pgl.graph import Graph
def weight_matrix(file_path, sigma2=0.1, epsilon=0.5, scaling=True):
"""Load weight matrix function."""
try:
W = pd.read_csv(file_path, header=None).values
except FileNotFoundError:
print(f'ERROR: input file was not found in {file_path}.')
raise
# check whether W is a 0/1 matrix.
if set(np.unique(W)) == {0, 1}:
print('The input graph is a 0/1 matrix; set "scaling" to False.')
scaling = False
if scaling:
n = W.shape[0]
W = W / 10000.
W2, W_mask = W * W, np.ones([n, n]) - np.identity(n)
# refer to Eq.10
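# i.e. w_ij = exp(-d_ij^2 / sigma2) if i != j and exp(-d_ij^2 / sigma2) >= epsilon,
# and 0 otherwise -- the thresholded Gaussian kernel of Eq. (10) in the STGCN paper,
# where d_ij is the (scaled) raw weight read from file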
return np.exp(-W2 / sigma2) * (
np.exp(-W2 / sigma2) >= epsilon) * W_mask
else:
return W
class GraphFactory(object):
"""GraphFactory"""
def __init__(self, args):
self.args = args
self.adj_matrix = weight_matrix(self.args.adj_mat_file)
L = np.eye(self.adj_matrix.shape[0]) + self.adj_matrix
D = np.sum(self.adj_matrix, axis=1)
# L = D - self.adj_matrix
# import ipdb; ipdb.set_trace()
edges = []
weights = []
for i in range(self.adj_matrix.shape[0]):
for j in range(self.adj_matrix.shape[1]):
edges.append([i, j])
weights.append(L[i][j])
self.edges = np.array(edges, dtype=np.int64)
self.weights = np.array(weights, dtype=np.float32).reshape(-1, 1)
self.norm = np.zeros_like(D, dtype=np.float32)
self.norm[D > 0] = np.power(D[D > 0], -0.5)
self.norm = self.norm.reshape(-1, 1)
def build_graph(self, x_batch):
"""build graph"""
B, T, n, _ = x_batch.shape
batch = B * T
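# the n-node city graph is replicated B * T times (one copy per sample and time step);
# node ids of copy i are shifted by i * n so all copies live in one batched PGL graph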
batch_edges = []
for i in range(batch):
batch_edges.append(self.edges + (i * n))
batch_edges = np.vstack(batch_edges)
num_nodes = B * T * n
node_feat = {'norm': np.tile(self.norm, [batch, 1])}
edge_feat = {'weights': np.tile(self.weights, [batch, 1])}
graph = Graph(
num_nodes=num_nodes,
edges=batch_edges,
node_feat=node_feat,
edge_feat=edge_feat)
return graph
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implements the training process of the STGCN model.
"""
import os
import sys
import time
import argparse
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils.logger import log
from data_loader.data_utils import data_gen_mydata, gen_batch
from data_loader.graph import GraphFactory
from models.model import STGCNModel
from models.tester import model_inference, model_test
def main(args):
"""main"""
PeMS = data_gen_mydata(args.input_file, args.label_file, args.n_route,
args.n_his, args.n_pred, (args.n_val, args.n_test))
log.info(PeMS.get_stats())
log.info(PeMS.get_len('train'))
gf = GraphFactory(args)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"gw",
place,
node_feat=[('norm', [None, 1], "float32")],
edge_feat=[('weights', [None, 1], "float32")])
model = STGCNModel(args, gw)
train_loss, y_pred = model.forward()
infer_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
epoch_step = int(PeMS.get_len('train') / args.batch_size) + 1
lr = fl.exponential_decay(
learning_rate=args.lr,
decay_steps=5 * epoch_step,
decay_rate=0.7,
staircase=True)
if args.opt == 'RMSProp':
train_op = fluid.optimizer.RMSPropOptimizer(lr).minimize(
train_loss)
elif args.opt == 'ADAM':
train_op = fluid.optimizer.Adam(lr).minimize(train_loss)
exe = fluid.Executor(place)
exe.run(startup_program)
if args.inf_mode == 'sep':
# for inference mode 'sep', the type of step index is int.
step_idx = args.n_pred - 1
tmp_idx = [step_idx]
min_val = min_va_val = np.array([4e1, 1e5, 1e5])
elif args.inf_mode == 'merge':
# for inference mode 'merge', the type of step index is np.ndarray.
step_idx = tmp_idx = np.arange(3, args.n_pred + 1, 3) - 1
min_val = min_va_val = np.array([4e1, 1e5, 1e5]) * len(step_idx)
else:
raise ValueError(f'ERROR: test mode "{args.inf_mode}" is not defined.')
step = 0
for epoch in range(1, args.epochs + 1):
for idx, x_batch in enumerate(
gen_batch(
PeMS.get_data('train'),
args.batch_size,
dynamic_batch=True,
shuffle=True)):
x = np.array(x_batch[:, 0:args.n_his, :, :], dtype=np.float32)
graph = gf.build_graph(x)
feed = gw.to_feed(graph)
feed['input'] = np.array(
x_batch[:, 0:args.n_his + 1, :, :], dtype=np.float32)
b_loss, b_lr = exe.run(train_program,
feed=feed,
fetch_list=[train_loss, lr])
if idx % 5 == 0:
log.info("epoch %d | step %d | lr %.6f | loss %.6f" %
(epoch, idx, b_lr[0], b_loss[0]))
min_va_val, min_val = \
model_inference(exe, gw, gf, infer_program, y_pred, PeMS, args, \
step_idx, min_va_val, min_val)
for ix in tmp_idx:
va, te = min_va_val[ix - 2:ix + 1], min_val[ix - 2:ix + 1]
print(f'Time Step {ix + 1}: '
f'MAPE {va[0]:7.3%}, {te[0]:7.3%}; '
f'MAE {va[1]:4.3f}, {te[1]:4.3f}; '
f'RMSE {va[2]:6.3f}, {te[2]:6.3f}.')
if epoch % 5 == 0:
model_test(exe, gw, gf, infer_program, y_pred, PeMS, args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--n_route', type=int, default=74)
parser.add_argument('--n_his', type=int, default=23)
parser.add_argument('--n_pred', type=int, default=3)
parser.add_argument('--batch_size', type=int, default=10)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--save', type=int, default=10)
parser.add_argument('--Ks', type=int, default=3) #equal to num_layers
parser.add_argument('--Kt', type=int, default=3)
parser.add_argument('--lr', type=float, default=1e-2)
parser.add_argument('--keep_prob', type=float, default=1.0)
parser.add_argument('--opt', type=str, default='RMSProp')
parser.add_argument('--inf_mode', type=str, default='sep')
parser.add_argument('--input_file', type=str, default='dataset/input.csv')
parser.add_argument('--label_file', type=str, default='dataset/output.csv')
parser.add_argument(
'--city_file', type=str, default='dataset/crawl_list.csv')
parser.add_argument('--adj_mat_file', type=str, default='dataset/W_74.csv')
parser.add_argument('--output_path', type=str, default='./outputs/')
parser.add_argument('--n_val', type=str, default=1)
parser.add_argument('--n_test', type=str, default=1)
parser.add_argument('--use_cuda', action='store_true')
args = parser.parse_args()
blocks = [[1, 32, 64], [64, 32, 128]]
args.blocks = blocks
log.info(args)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file implement the STGCN model.
"""
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
class STGCNModel(object):
"""Implementation of Spatio-Temporal Graph Convolutional Networks"""
def __init__(self, args, gw):
self.args = args
self.gw = gw
self.input = fl.data(
name="input",
shape=[None, args.n_his + 1, args.n_route, 1],
dtype="float32")
def forward(self):
"""forward"""
x = self.input[:, 0:self.args.n_his, :, :]
# Ko>0: kernel size of temporal convolution in the output layer.
Ko = self.args.n_his
# ST-Block
for i, channels in enumerate(self.args.blocks):
x = self.st_conv_block(
x,
self.args.Ks,
self.args.Kt,
channels,
"st_conv_%d" % i,
self.args.keep_prob,
act_func='GLU')
# output layer
if Ko > 1:
y = self.output_layer(x, Ko, 'output_layer')
else:
raise ValueError(f'ERROR: kernel size Ko must be greater than 1, \
but received "{Ko}".')
label = self.input[:, self.args.n_his:self.args.n_his + 1, :, :]
train_loss = fl.reduce_sum((y - label) * (y - label))
single_pred = y[:, 0, :, :] # shape: [batch, n, 1]
return train_loss, single_pred
def st_conv_block(self,
x,
Ks,
Kt,
channels,
name,
keep_prob,
act_func='GLU'):
"""Spatio-Temporal convolution block"""
c_si, c_t, c_oo = channels
x_s = self.temporal_conv_layer(
x, Kt, c_si, c_t, "%s_tconv_in" % name, act_func=act_func)
x_t = self.spatio_conv_layer(x_s, Ks, c_t, c_t, "%s_sonv" % name)
x_o = self.temporal_conv_layer(x_t, Kt, c_t, c_oo,
"%s_tconv_out" % name)
x_ln = fl.layer_norm(x_o)
return fl.dropout(x_ln, dropout_prob=(1.0 - keep_prob))
def temporal_conv_layer(self, x, Kt, c_in, c_out, name, act_func='relu'):
"""Temporal convolution layer"""
_, T, n, _ = x.shape
if c_in > c_out:
x_input = fl.conv2d(
input=x,
num_filters=c_out,
filter_size=[1, 1],
stride=[1, 1],
padding="SAME",
data_format="NHWC",
param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name))
elif c_in < c_out:
# if the input has fewer channels than the output,
# pad x with zeros up to the output channel size.
pad = fl.fill_constant_batch_size_like(
input=x,
shape=[-1, T, n, c_out - c_in],
dtype="float32",
value=0.0)
x_input = fl.concat([x, pad], axis=3)
else:
x_input = x
# x_input = x_input[:, Kt - 1:T, :, :]
if act_func == 'GLU':
# gated linear unit (GLU)
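# the convolution below outputs 2 * c_out channels: the first c_out form the value path
# (added to the residual x_input), the last c_out are passed through a sigmoid gate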
bt_init = fluid.initializer.ConstantInitializer(value=0.0)
bt = fl.create_parameter(
shape=[2 * c_out],
dtype="float32",
attr=fluid.ParamAttr(
name="%s_bt" % name, trainable=True, initializer=bt_init),
)
x_conv = fl.conv2d(
input=x,
num_filters=2 * c_out,
filter_size=[Kt, 1],
stride=[1, 1],
padding="SAME",
data_format="NHWC",
param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name))
x_conv = x_conv + bt
return (x_conv[:, :, :, 0:c_out] + x_input
) * fl.sigmoid(x_conv[:, :, :, -c_out:])
else:
bt_init = fluid.initializer.ConstantInitializer(value=0.0)
bt = fl.create_parameter(
shape=[c_out],
dtype="float32",
attr=fluid.ParamAttr(
name="%s_bt" % name, trainable=True, initializer=bt_init),
)
x_conv = fl.conv2d(
input=x,
num_filters=c_out,
filter_size=[Kt, 1],
stride=[1, 1],
padding="SAME",
data_format="NHWC",
param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name))
x_conv = x_conv + bt
if act_func == "linear":
return x_conv
elif act_func == "sigmoid":
return fl.sigmoid(x_conv)
elif act_func == "relu":
return fl.relu(x_conv + x_input)
else:
raise ValueError(
f'ERROR: activation function "{act_func}" is not defined.')
def spatio_conv_layer(self, x, Ks, c_in, c_out, name):
"""Spatio convolution layer"""
_, T, n, _ = x.shape
if c_in > c_out:
x_input = fl.conv2d(
input=x,
num_filters=c_out,
filter_size=[1, 1],
stride=[1, 1],
padding="SAME",
data_format="NHWC",
param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name))
elif c_in < c_out:
# if the input has fewer channels than the output,
# pad x with zeros up to the output channel size.
pad = fl.fill_constant_batch_size_like(
input=x,
shape=[-1, T, n, c_out - c_in],
dtype="float32",
value=0.0)
x_input = fl.concat([x, pad], axis=3)
else:
x_input = x
for i in range(Ks):
# x_input shape: [B,T, num_nodes, c_out]
x_input = fl.reshape(x_input, [-1, c_out])
x_input = self.message_passing(
self.gw,
x_input,
name="%s_mp_%d" % (name, i),
norm=self.gw.node_feat["norm"])
x_input = fl.fc(x_input,
size=c_out,
bias_attr=False,
param_attr=fluid.ParamAttr(name="%s_gcn_fc_%d" %
(name, i)))
bias = fluid.layers.create_parameter(
shape=[c_out],
dtype='float32',
is_bias=True,
name='%s_gcn_bias_%d' % (name, i))
x_input = fluid.layers.elementwise_add(x_input, bias, act="relu")
x_input = fl.reshape(x_input, [-1, T, n, c_out])
return x_input
def message_passing(self, gw, feature, name, norm=None):
"""Message passing layer"""
def send_src_copy(src_feat, dst_feat, edge_feat):
"""send function"""
return src_feat["h"] * edge_feat['w']
if norm is not None:
feature = feature * norm
msg = gw.send(
send_src_copy,
nfeat_list=[("h", feature)],
efeat_list=[('w', gw.edge_feat['weights'])])
output = gw.recv(msg, "sum")
if norm is not None:
output = output * norm
return output
def output_layer(self, x, T, name, act_func='GLU'):
"""Output layer"""
_, _, n, channel = x.shape
# maps multi-steps to one.
x_i = self.temporal_conv_layer(
x=x,
Kt=T,
c_in=channel,
c_out=channel,
name="%s_in" % name,
act_func=act_func)
x_ln = fl.layer_norm(x_i)
x_o = self.temporal_conv_layer(
x=x_ln,
Kt=1,
c_in=channel,
c_out=channel,
name="%s_out" % name,
act_func='sigmoid')
# maps multi-channels to one.
x_fc = self.fully_con_layer(
x=x_o, n=n, channel=channel, name="%s_fc" % name)
return x_fc
def fully_con_layer(self, x, n, channel, name):
"""Fully connected layer"""
bt_init = fluid.initializer.ConstantInitializer(value=0.0)
bt = fl.create_parameter(
shape=[n, 1],
dtype="float32",
attr=fluid.ParamAttr(
name="%s_bt" % name, trainable=True, initializer=bt_init), )
x_conv = fl.conv2d(
input=x,
num_filters=1,
filter_size=[1, 1],
stride=[1, 1],
padding="SAME",
data_format="NHWC",
param_attr=fluid.ParamAttr(name="%s_conv2d" % name))
x_conv = x_conv + bt
return x_conv
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file implement the testing process of STGCN model.
"""
import os
import sys
import time
import argparse
import numpy as np
import pandas as pd
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils.logger import log
from data_loader.data_utils import gen_batch
from utils.math_utils import evaluation
def multi_pred(exe, gw, gf, program, y_pred, seq, batch_size, \
n_his, n_pred, step_idx, dynamic_batch=True):
"""multi step prediction"""
pred_list = []
for i in gen_batch(
seq, min(batch_size, len(seq)), dynamic_batch=dynamic_batch):
# Note: use np.copy() to avoid the modification of source data.
test_seq = np.copy(i[:, 0:n_his + 1, :, :]).astype(np.float32)
graph = gf.build_graph(i[:, 0:n_his, :, :])
feed = gw.to_feed(graph)
step_list = []
for j in range(n_pred):
feed['input'] = test_seq
pred = exe.run(program, feed=feed, fetch_list=[y_pred])
if isinstance(pred, list):
pred = np.array(pred[0])
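# autoregressive rollout: shift the observation window one frame forward and insert the
# latest prediction as the newest frame before predicting the next step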
test_seq[:, 0:n_his - 1, :, :] = test_seq[:, 1:n_his, :, :]
test_seq[:, n_his - 1, :, :] = pred
step_list.append(pred)
pred_list.append(step_list)
# pred_array -> [n_pred, len(seq), n_route, C_0]
pred_array = np.concatenate(pred_list, axis=1)
return pred_array, pred_array.shape[1]
def model_inference(exe, gw, gf, program, pred, inputs, args, step_idx,
min_va_val, min_val):
"""inference model"""
x_val, x_test, x_stats = inputs.get_data('val'), inputs.get_data(
'test'), inputs.get_stats()
if args.n_his + args.n_pred > x_val.shape[1]:
raise ValueError(
f'ERROR: the value of n_pred "{args.n_pred}" exceeds the length limit.'
)
# y_val shape: [n_pred, len(x_val), n_route, C_0]
y_val, len_val = multi_pred(exe, gw, gf, program, pred, \
x_val, args.batch_size, args.n_his, args.n_pred, step_idx)
evl_val = evaluation(x_val[0:len_val, step_idx + args.n_his, :, :],
y_val[step_idx], x_stats)
# chks: boolean mask marking the metrics on which the validation result improves over the best value seen so far.
chks = evl_val < min_va_val
# update the test-set metrics if the model improved on the validation set.
if sum(chks):
min_va_val[chks] = evl_val[chks]
y_pred, len_pred = multi_pred(exe, gw, gf, program, pred, \
x_test, args.batch_size, args.n_his, args.n_pred, step_idx)
evl_pred = evaluation(x_test[0:len_pred, step_idx + args.n_his, :, :],
y_pred[step_idx], x_stats)
min_val = evl_pred
return min_va_val, min_val
def model_test(exe, gw, gf, program, pred, inputs, args):
"""test model"""
if args.inf_mode == 'sep':
# for inference mode 'sep', the type of step index is int.
step_idx = args.n_pred - 1
tmp_idx = [step_idx]
elif args.inf_mode == 'merge':
# for inference mode 'merge', the type of step index is np.ndarray.
step_idx = tmp_idx = np.arange(3, args.n_pred + 1, 3) - 1
print(step_idx)
else:
raise ValueError(f'ERROR: test mode "{args.inf_mode}" is not defined.')
x_test, x_stats = inputs.get_data('test'), inputs.get_stats()
y_test, len_test = multi_pred(exe, gw, gf, program, pred, \
x_test, args.batch_size, args.n_his, args.n_pred, step_idx)
# save result
gt = x_test[0:len_test, args.n_his:, :, :].reshape(-1, args.n_route)
y_pred = y_test.reshape(-1, args.n_route)
city_df = pd.read_csv(args.city_file)
city_df = city_df.drop(0)
np.savetxt(
os.path.join(args.output_path, "groundtruth.csv"),
gt.astype(np.int32),
fmt='%d',
delimiter=',',
header=",".join(city_df['city']))
np.savetxt(
os.path.join(args.output_path, "prediction.csv"),
y_pred.astype(np.int32),
fmt='%d',
delimiter=",",
header=",".join(city_df['city']))
for i in range(step_idx + 1):
evl = evaluation(x_test[0:len_test, step_idx + args.n_his, :, :],
y_test[i], x_stats)
for ix in tmp_idx:
te = evl[ix - 2:ix + 1]
print(
f'Time Step {i + 1}: MAPE {te[0]:7.3%}; MAE {te[1]:4.3f}; RMSE {te[2]:6.3f}.'
)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation"""
import os
import sys
import time
import argparse
import numpy as np
def z_score(x, mean, std):
"""z_score"""
return (x - mean) / std
def z_inverse(x, mean, std):
"""The inverse of function z_score"""
return x * std + mean
def MAPE(v, v_):
"""Mean absolute percentage error."""
return np.mean(np.abs(v_ - v) / (v + 1e-5))
def RMSE(v, v_):
"""Mean squared error."""
return np.sqrt(np.mean((v_ - v)**2))
def MAE(v, v_):
"""Mean absolute error."""
return np.mean(np.abs(v_ - v))
def evaluation(y, y_, x_stats):
"""Calculate MAPE, MAE and RMSE between ground truth and prediction."""
dim = len(y_.shape)
if dim == 3:
# single_step case
v = z_inverse(y, x_stats['mean'], x_stats['std'])
v_ = z_inverse(y_, x_stats['mean'], x_stats['std'])
return np.array([MAPE(v, v_), MAE(v, v_), RMSE(v, v_)])
else:
# multi_step case
tmp_list = []
# y -> [time_step, batch_size, n_route, 1]
y = np.swapaxes(y, 0, 1)
# recursively call
for i in range(y_.shape[0]):
tmp_res = evaluation(y[i], y_[i], x_stats)
tmp_list.append(tmp_res)
return np.concatenate(tmp_list, axis=-1)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
from pgl.utils import paddle_helper
from ogb.graphproppred import Evaluator
from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
evaluator, fetch_loss, fetch_pred):
"""Train"""
graphs, labels = dataset[splitted_idx["train"]]
perm = np.arange(0, len(graphs))
np.random.shuffle(perm)
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
batch_label = labels[batch_index]
batch_valid = (batch_label == batch_label).astype("float32")
batch_label = np.nan_to_num(batch_label).astype("float32")
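# NaN labels mark tasks with no ground truth for this molecule; batch_valid is 1 only
# where a label exists and is fed as a per-entry loss weight to mask missing tasks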
feed_dict = graph_wrapper.to_feed(batch_graph)
feed_dict["label"] = batch_label
feed_dict["weight"] = batch_valid
loss, pred = exe.run(train_program,
feed=feed_dict,
fetch_list=[fetch_loss, fetch_pred])
pred_output[batch_index] = pred
batch_no += 1
print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
dataset, mode, evaluator, fetch_pred):
"""Eval"""
graphs, labels = dataset[splitted_idx[mode]]
perm = np.arange(0, len(graphs))
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
feed_dict = graph_wrapper.to_feed(batch_graph)
pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
pred_output[batch_index] = pred[0]
batch_no += 1
print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def send_func(src_feat, dst_feat, edge_feat):
"""Send"""
return src_feat["h"] + edge_feat["h"]
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)
def forward(self, graph):
"""foward"""
h_node = self.atom_encoder(graph.node_feat['feat'])
h_edge = self.bond_encoder(graph.edge_feat['feat'])
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h_node)],
efeat_list=[("h", h_edge)])
h_node = graph.recv(msg, 'sum') + h_node
h_node = fluid.layers.fc(h_node,
size=self.emb_dim,
name=self.name + '_%s' % layer,
act="relu")
graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
return graph_pred
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbg-mol-tox21",
help='dataset name (default: ogbg-mol-tox21)')
args = parser.parse_args()
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglGraphPropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[:2]
batch_graph = pgl.graph.MultiGraph(graph_data)
graph_data = batch_graph
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64")
graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place=place,
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_label = fluid.layers.data(
name="label", dtype="float32", shape=[None, dataset.num_tasks])
train_weight = fluid.layers.data(
name="weight", dtype="float32", shape=[None, dataset.num_tasks])
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=pred, label=train_label) * train_weight
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
for epoch in range(1, args.epochs + 1):
print("Epoch", epoch)
train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator,
train_loss_t, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid",
evaluator, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test",
evaluator, sigmoid_pred)
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import time
import logging
import numpy as np
import paddle.fluid as fluid
import pgl
from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
from pgl.utils import paddle_helper
from ogb.linkproppred import Evaluator
def send_func(src_feat, dst_feat, edge_feat):
"""send_func"""
return src_feat["h"]
def recv_func(feat):
"""recv_func"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, num_nodes, emb_dim, num_layers):
self.num_nodes = num_nodes
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.src_nodes = fluid.layers.data(
name='src_nodes',
shape=[None],
dtype='int64', )
self.dst_nodes = fluid.layers.data(
name='dst_nodes',
shape=[None],
dtype='int64', )
self.edge_label = fluid.layers.data(
name='edge_label',
shape=[None, 1],
dtype='float32', )
def forward(self, graph):
"""forward"""
h = fluid.layers.create_parameter(
shape=[self.num_nodes, self.emb_dim],
dtype="float32",
name=self.name + "_embedding")
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h)], )
h = graph.recv(msg, recv_func)
h = fluid.layers.fc(
h,
size=self.emb_dim,
bias_attr=False,
param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
h = h * graph.node_feat["norm"]
bias = fluid.layers.create_parameter(
shape=[self.emb_dim],
dtype='float32',
is_bias=True,
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
src = fluid.layers.gather(h, self.src_nodes, overwrite=False)
dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False)
edge_embed = src * dst
pred = fluid.layers.fc(input=edge_embed,
size=1,
name=self.name + "_pred_output")
prob = fluid.layers.sigmoid(pred)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred,
self.edge_label)
loss = fluid.layers.reduce_mean(loss)
return pred, prob, loss
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=4,
help='number of epochs to train (default: 4)')
parser.add_argument(
'--dataset',
type=str,
default="ogbl-ppa",
help='dataset name (default: protein protein associations)')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--batch_size', type=int, default=5120)
parser.add_argument('--embed_dim', type=int, default=64)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--lr', type=float, default=0.001)
args = parser.parse_args()
print(args)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
### automatic dataloading and splitting
print("loadding dataset")
dataset = PglLinkPropPredDataset(name=args.dataset)
splitted_edge = dataset.get_edge_split()
print(splitted_edge['train_edge'].shape)
print(splitted_edge['train_edge_label'].shape)
print("building evaluator")
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data = dataset[0]
print("num_nodes: %d" % graph_data.num_nodes)
train_program = fluid.Program()
startup_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
# graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1)
with fluid.program_guard(train_program, startup_program):
model = GNNModel(
name="gnn",
num_nodes=graph_data.num_nodes,
emb_dim=args.embed_dim,
num_layers=args.num_layers)
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
place,
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred, prob, loss = model.forward(gw)
val_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
global_steps = int(splitted_edge['train_edge'].shape[0] /
args.batch_size * 2)
learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps,
0.00005)
adam = fluid.optimizer.Adam(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(loss)
exe = fluid.Executor(place)
exe.run(startup_program)
feed = gw.to_feed(graph_data)
print("evaluate result before training: ")
result = test(exe, val_program, prob, evaluator, feed, splitted_edge)
print(result)
print("training")
cc = 0
for epoch in range(1, args.epochs + 1):
for batch_data, batch_label in data_generator(
graph_data,
splitted_edge["train_edge"],
splitted_edge["train_edge_label"],
batch_size=args.batch_size):
feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1)
feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1)
feed['edge_label'] = batch_label.astype("float32")
res_loss, y_pred, b_lr = exe.run(
train_program,
feed=feed,
fetch_list=[loss, prob, learning_rate])
if cc % 1 == 0:
print("epoch %d | step %d | lr %s | Loss %s" %
(epoch, cc, b_lr[0], res_loss[0]))
cc += 1
if cc % 20 == 0:
print("Evaluating...")
result = test(exe, val_program, prob, evaluator, feed,
splitted_edge)
print("epoch %d | step %d" % (epoch, cc))
print(result)
def test(exe, val_program, prob, evaluator, feed, splitted_edge):
"""Evaluation"""
result = {}
feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["valid_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["valid"] = evaluator.eval(input_dict)
feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["test_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_true": splitted_edge["test_edge_label"],
"y_pred": y_pred.reshape(-1, ),
}
result["test"] = evaluator.eval(input_dict)
return result
def data_generator(graph, data, label_data, batch_size, shuffle=True):
"""Data Generator"""
perm = np.arange(0, len(data))
if shuffle:
np.random.shuffle(perm)
offset = 0
while offset < len(perm):
batch_index = perm[offset:(offset + batch_size)]
offset += batch_size
pos_data = data[batch_index]
pos_label = label_data[batch_index]
neg_src_node = pos_data[:, 0]
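# negative sampling: keep each positive source and pair it with a destination drawn at
# random from the batch's node ids; pairs that turn out to be real edges are filtered below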
neg_dst_node = np.random.choice(
pos_data.reshape(-1, ), size=len(neg_src_node))
neg_data = np.hstack(
[neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)])
exists = graph.has_edges_between(neg_src_node, neg_dst_node)
neg_data = neg_data[np.invert(exists)]
neg_label = np.zeros(shape=len(neg_data), dtype=np.int64)
batch_data = np.vstack([pos_data, neg_data])
label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)])
yield batch_data, label
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset
from pgl.utils import paddle_helper
from ogb.nodeproppred import Evaluator
def train():
pass
def send_func(src_feat, dst_feat, edge_feat):
return (src_feat["h"] + edge_feat["h"]) * src_feat["norm"]
class GNNModel(object):
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
def forward(self, graph):
h = fluid.layers.embedding(
graph.node_feat["x"],
size=(2, self.emb_dim)) # name=self.name + "_embedding")
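# node_feat["x"] is all zeros (set in main), so this embedding learns a single shared
# input vector for every node; the discriminative signal comes from edge features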
edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h), ("norm", graph.node_feat["norm"])],
efeat_list=[("h", edge_attr)])
h = graph.recv(msg, "sum")
h = fluid.layers.fc(
h,
size=self.emb_dim,
bias_attr=False,
param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
h = h * graph.node_feat["norm"]
bias = fluid.layers.create_parameter(
shape=[self.emb_dim],
dtype='float32',
is_bias=True,
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
pred = fluid.layers.fc(h,
self.num_task,
act=None,
name=self.name + "_pred_output")
return pred
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbn-proteins",
help='dataset name (default: ogbn-proteins)')
args = parser.parse_args()
#device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
#place = fluid.CUDAPlace(0)
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglNodePropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[0]
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
"float32")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_node_index, init = paddle_helper.constant(
"train_node_index", dtype="int64", value=splitted_idx["train"])
initializer.append(init)
train_node_label, init = paddle_helper.constant(
"train_node_label",
dtype="float32",
value=label[splitted_idx["train"]].astype("float32"))
initializer.append(init)
train_pred_t = fluid.layers.gather(pred, train_node_index)
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=train_pred_t, label=train_node_label)
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
train_pred_t = fluid.layers.sigmoid(train_pred_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
gw.initialize(place)
for init in initializer:
init(place)
for epoch in range(1, args.epochs + 1):
loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
print("Loss %s" % loss[0])
print("Evaluating...")
y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]
result = {}
input_dict = {
"y_true": label[splitted_idx["train"]],
"y_pred": y_pred[splitted_idx["train"]]
}
result["train"] = evaluator.eval(input_dict)
input_dict = {
"y_true": label[splitted_idx["valid"]],
"y_pred": y_pred[splitted_idx["valid"]]
}
result["valid"] = evaluator.eval(input_dict)
input_dict = {
"y_true": label[splitted_idx["test"]],
"y_pred": y_pred[splitted_idx["test"]]
}
result["test"] = evaluator.eval(input_dict)
print(result)
if __name__ == "__main__":
main()
......@@ -13,9 +13,11 @@
# limitations under the License.
"""Generate pgl apis
"""
__version__ = "1.0.1"
__version__ = "1.0.2"
from pgl import layers
from pgl import graph_wrapper
from pgl import graph
from pgl import data_loader
from pgl import heter_graph
from pgl import heter_graph_wrapper
from pgl import contrib
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,8 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate Contrib api
"""
from pgl.contrib import heter_graph
from pgl.contrib import heter_graph_wrapper
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PglGraphPropPredDataset
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.graphproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglGraphPropPredDataset(object):
"""PglGraphPropPredDataset"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbg-mol-tox21
self.dir_name = "_".join(
name.split("-")
) + "_pgl" ## replace hyphen with underline, e.g., ogbg_mol_tox21_dgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(
#os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
print(self.name)
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., tox21
self.num_tasks = int(self.meta_info[self.name]["num tasks"])
self.task_type = self.meta_info[self.name]["task type"]
super(PglGraphPropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""Pre-processing"""
processed_dir = osp.join(self.root, 'processed')
raw_dir = osp.join(self.root, 'raw')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if os.path.exists(pre_processed_file_path):
# TODO: Load Preprocessed
pass
else:
### download
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete folder if there exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
### preprocess
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graphs = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
self.graphs = np.array(self.graphs)
self.labels = np.array(
pd.read_csv(
osp.join(raw_dir, "graph-label.csv.gz"),
compression="gzip",
header=None).values)
# TODO: Load Graph
### load preprocessed files
def get_idx_split(self):
"""Train/Valid/Test split"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values.T[0]
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values.T[0]
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values.T[0]
return {
"train": np.array(
train_idx, dtype="int64"),
"valid": np.array(
valid_idx, dtype="int64"),
"test": np.array(
test_idx, dtype="int64")
}
def __getitem__(self, idx):
"""Get datapoint with index"""
return self.graphs[idx], self.labels[idx]
def __len__(self):
"""Length of the dataset
Returns
-------
int
Length of Dataset
"""
return len(self.graphs)
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
splitted_index = pgl_dataset.get_idx_split()
print(pgl_dataset)
print(pgl_dataset[3:20])
#print(pgl_dataset[splitted_index["train"]])
#print(pgl_dataset[splitted_index["valid"]])
#print(pgl_dataset[splitted_index["test"]])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MolEncoder for ogb
"""
import paddle.fluid as fluid
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims
class AtomEncoder(object):
"""AtomEncoder for encoding node features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
atom_feature = get_atom_feature_dims()
atom_input = fluid.layers.split(
x, num_or_sections=len(atom_feature), dim=-1)
outputs = None
count = 0
for _x, _atom_input_dim in zip(atom_input, atom_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_atom_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_atom_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
class BondEncoder(object):
"""Bond for encoding edge features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
bond_feature = get_bond_feature_dims()
bond_input = fluid.layers.split(
x, num_or_sections=len(bond_feature), dim=-1)
outputs = None
count = 0
for _x, _bond_input_dim in zip(bond_input, bond_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_bond_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_bond_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
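if __name__ == "__main__":
    # A minimal usage sketch, not part of the original file: wire both encoders
    # into a fluid program. `emb_dim=64` and the tensor names are illustrative
    # assumptions; the feature widths come from the ogb helpers imported above,
    # and fluid.data assumes paddlepaddle>=1.6.
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        node_feat = fluid.data(
            name="node_feat",
            shape=[None, len(get_atom_feature_dims())],
            dtype="int64")
        edge_feat = fluid.data(
            name="edge_feat",
            shape=[None, len(get_bond_feature_dims())],
            dtype="int64")
        # Each encoder sums one embedding table per feature column.
        node_emb = AtomEncoder(name="atom_encoder", emb_dim=64)(node_feat)
        edge_emb = BondEncoder(name="bond_encoder", emb_dim=64)(edge_feat)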
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""pgl read_csv_graph for ogb
"""
import pandas as pd
import os.path as osp
import numpy as np
import pgl
from ogb.io.read_graph_raw import read_csv_graph_raw
def read_csv_graph_pgl(raw_dir, add_inverse_edge=False):
"""Read CSV data and build PGL Graph
"""
graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge)
pgl_graph_list = []
for graph in graph_list:
edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)
if graph["edge_feat"] is not None:
g.edge_feat["feat"] = graph["edge_feat"]
if graph["node_feat"] is not None:
g.node_feat["feat"] = graph["node_feat"]
pgl_graph_list.append(g)
return pgl_graph_list
if __name__ == "__main__":
# graph_list = read_csv_graph_dgl('dataset/proteinfunc_v2/raw', add_inverse_edge = True)
graph_list = read_csv_graph_pgl(
'dataset/ogbn_proteins_pgl/raw', add_inverse_edge=True)
print(graph_list)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LinkPropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.linkproppred import make_master_file
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglLinkPropPredDataset(object):
"""PglLinkPropPredDataset
"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbl-ppa
self.dir_name = "_".join(name.split(
"-")) + "_pgl" ## replace hyphen with underline, e.g., ogbl_ppa_pgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
print(self.name)
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., ppassoc
self.task_type = self.meta_info[self.name]["task type"]
super(PglLinkPropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""pre_process downlaoding data
"""
processed_dir = osp.join(self.root, 'processed')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if osp.exists(pre_processed_file_path):
#TODO: Reload Preprocess files
pass
else:
### check download
if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete the folder if it already exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
raw_dir = osp.join(self.root, "raw")
### pre-process and save
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graph = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
#TODO: SAVE preprocess graph
def get_edge_split(self):
"""Train/Validation/Test split
"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values
if self.task_type == "link prediction":
target_type = np.int64
else:
target_type = np.float32
return {
"train_edge": np.array(
train_idx[:, :2], dtype="int64"),
"train_edge_label": np.array(
train_idx[:, 2], dtype=target_type),
"valid_edge": np.array(
valid_idx[:, :2], dtype="int64"),
"valid_edge_label": np.array(
valid_idx[:, 2], dtype=target_type),
"test_edge": np.array(
test_idx[:, :2], dtype="int64"),
"test_edge_label": np.array(
test_idx[:, 2], dtype=target_type)
}
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self.graph[0]
def __len__(self):
return 1
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglLinkPropPredDataset(name="ogbl-ppa")
splitted_edge = pgl_dataset.get_edge_split()
print(pgl_dataset[0])
print(splitted_edge)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NodePropPredDataset for pgl
"""
import pandas as pd
import shutil, os
import os.path as osp
import numpy as np
from ogb.utils.url import decide_download, download_url, extract_zip
from ogb.nodeproppred import make_master_file # create master.csv
from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
def to_bool(value):
"""to_bool"""
return np.array([value], dtype="bool")[0]
class PglNodePropPredDataset(object):
"""PglNodePropPredDataset
"""
def __init__(self, name, root="dataset"):
self.name = name ## original name, e.g., ogbn-proteins
self.dir_name = "_".join(
name.split("-")
) + "_pgl" ## replace hyphen with underline, e.g., ogbn_proteins_pgl
self.original_root = root
self.root = osp.join(root, self.dir_name)
self.meta_info = make_master_file.df #pd.read_csv(
#os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
if not self.name in self.meta_info:
error_mssg = "Invalid dataset name {}.\n".format(self.name)
error_mssg += "Available datasets are as follows:\n"
error_mssg += "\n".join(self.meta_info.keys())
raise ValueError(error_mssg)
self.download_name = self.meta_info[self.name][
"download_name"] ## name of downloaded file, e.g., tox21
self.num_tasks = int(self.meta_info[self.name]["num tasks"])
self.task_type = self.meta_info[self.name]["task type"]
super(PglNodePropPredDataset, self).__init__()
self.pre_process()
def pre_process(self):
"""pre_process downlaoding data
"""
processed_dir = osp.join(self.root, 'processed')
pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
if osp.exists(pre_processed_file_path):
# TODO: Reload Preprocess files
pass
else:
### check download
if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
url = self.meta_info[self.name]["url"]
if decide_download(url):
path = download_url(url, self.original_root)
extract_zip(path, self.original_root)
os.unlink(path)
# delete the folder if it already exists
try:
shutil.rmtree(self.root)
except:
pass
shutil.move(
osp.join(self.original_root, self.download_name),
self.root)
else:
print("Stop download.")
exit(-1)
raw_dir = osp.join(self.root, "raw")
### pre-process and save
add_inverse_edge = to_bool(self.meta_info[self.name][
"add_inverse_edge"])
self.graph = read_csv_graph_pgl(
raw_dir, add_inverse_edge=add_inverse_edge)
### adding prediction target
node_label = pd.read_csv(
osp.join(raw_dir, 'node-label.csv.gz'),
compression="gzip",
header=None).values
if "classification" in self.task_type:
node_label = np.array(node_label, dtype=np.int64)
else:
node_label = np.array(node_label, dtype=np.float32)
label_dict = {"labels": node_label}
# TODO: SAVE preprocess graph
self.labels = label_dict['labels']
def get_idx_split(self):
"""Train/Validation/Test split
"""
split_type = self.meta_info[self.name]["split"]
path = osp.join(self.root, "split", split_type)
train_idx = pd.read_csv(
osp.join(path, "train.csv.gz"), compression="gzip",
header=None).values.T[0]
valid_idx = pd.read_csv(
osp.join(path, "valid.csv.gz"), compression="gzip",
header=None).values.T[0]
test_idx = pd.read_csv(
osp.join(path, "test.csv.gz"), compression="gzip",
header=None).values.T[0]
return {
"train": np.array(
train_idx, dtype="int64"),
"valid": np.array(
valid_idx, dtype="int64"),
"test": np.array(
test_idx, dtype="int64")
}
def __getitem__(self, idx):
assert idx == 0, "This dataset has only one graph"
return self.graph[idx], self.labels
def __len__(self):
return 1
def __repr__(self): # pragma: no cover
return '{}({})'.format(self.__class__.__name__, len(self))
if __name__ == "__main__":
pgl_dataset = PglNodePropPredDataset(name="ogbn-proteins")
splitted_index = pgl_dataset.get_idx_split()
print(pgl_dataset[0])
print(splitted_index)
......@@ -15,12 +15,14 @@
This package implements the Graph structure for handling graph data.
"""
import os
import numpy as np
import pickle as pkl
import time
import pgl.graph_kernel as graph_kernel
from collections import defaultdict
__all__ = ['Graph', 'SubGraph']
__all__ = ['Graph', 'SubGraph', 'MultiGraph']
def _hide_num_nodes(shape):
......@@ -43,8 +45,8 @@ class EdgeIndex(object):
"""
def __init__(self, u, v, num_nodes):
self._v, self._eid, self._degree, self._sorted_u,\
self._sorted_v, self._sorted_eid = graph_kernel.build_index(u, v, num_nodes)
self._degree, self._sorted_v, self._sorted_u, \
self._sorted_eid, self._indptr = graph_kernel.build_index(u, v, num_nodes)
@property
def degree(self):
......@@ -52,23 +54,40 @@ class EdgeIndex(object):
"""
return self._degree
@property
def v(self):
"""Return the compressed v.
def view_v(self, u=None):
"""Return the compressed v for given u.
"""
return self._v
if u is None:
return np.split(self._sorted_v, self._indptr[1:])
else:
u = np.array(u, dtype="int64")
return graph_kernel.slice_by_index(
self._sorted_v, self._indptr, index=u)
@property
def eid(self):
"""Return the edge id.
def view_eid(self, u=None):
"""Return the compressed edge id for given u.
"""
return self._eid
if u is None:
return np.split(self._sorted_eid, self._indptr[1:])
else:
u = np.array(u, dtype="int64")
return graph_kernel.slice_by_index(
self._sorted_eid, self._indptr, index=u)
def triples(self):
"""Return the sorted (u, v, eid) tuples.
"""
return self._sorted_u, self._sorted_v, self._sorted_eid
def dump(self, path):
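"""Dump the CSR-style index (degree, sorted u/v/eid and indptr) to .npy files under ``path``."""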
if not os.path.exists(path):
os.makedirs(path)
np.save(os.path.join(path, 'degree.npy'), self._degree)
np.save(os.path.join(path, 'sorted_u.npy'), self._sorted_u)
np.save(os.path.join(path, 'sorted_v.npy'), self._sorted_v)
np.save(os.path.join(path, 'sorted_eid.npy'), self._sorted_eid)
np.save(os.path.join(path, 'indptr.npy'), self._indptr)
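# Hedged usage sketch (illustrative names): given an EdgeIndex built from the
# edge list, `view_v(u)` returns, for every node id in `u`, the array of its
# neighbor node ids, and `view_eid(u)` the matching edge ids, e.g.
#     index = EdgeIndex(u=edges[:, 0], v=edges[:, 1], num_nodes=num_nodes)
#     neigh = index.view_v([0, 2])    # [neighbors_of_0, neighbors_of_2]
#     eids = index.view_eid([0, 2])   # [edge_ids_of_0, edge_ids_of_2]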
class Graph(object):
"""Implementation of graph structure in pgl.
......@@ -122,21 +141,51 @@ class Graph(object):
self._edges = edges
self._num_nodes = num_nodes
if len(edges) == 0:
raise ValueError("The Graph have no edges.")
self._adj_src_index = None
self._adj_dst_index = None
self.indegree()
self._num_graph = 1
self._graph_lod = np.array([0, self.num_nodes], dtype="int32")
def dump(self, path):
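"""Dump num_nodes, edges, the adjacency indices (if already built) and the node/edge features to .npy files under ``path``, so the graph can be reloaded with MemmapGraph."""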
if not os.path.exists(path):
os.makedirs(path)
np.save(os.path.join(path, 'num_nodes.npy'), self._num_nodes)
np.save(os.path.join(path, 'edges.npy'), self._edges)
if self._adj_src_index:
self._adj_src_index.dump(os.path.join(path, 'adj_src'))
if self._adj_dst_index:
self._adj_dst_index.dump(os.path.join(path, 'adj_dst'))
def dump_feat(feat_path, feat):
"""Dump all features to .npy file.
"""
if len(feat) == 0:
return
if not os.path.exists(feat_path):
os.makedirs(feat_path)
for key in feat:
np.save(os.path.join(feat_path, key + ".npy"), feat[key])
dump_feat(os.path.join(path, "node_feat"), self.node_feat)
dump_feat(os.path.join(path, "edge_feat"), self.edge_feat)
@property
def adj_src_index(self):
"""Return an EdgeIndex object for src.
"""
if self._adj_src_index is None:
if len(self._edges) == 0:
u = np.array([], dtype="int64")
v = np.array([], dtype="int64")
else:
u = self._edges[:, 0]
v = self._edges[:, 1]
self._adj_src_index = EdgeIndex(
u=self._edges[:, 0],
v=self._edges[:, 1],
num_nodes=self._num_nodes)
u=u, v=v, num_nodes=self._num_nodes)
return self._adj_src_index
@property
......@@ -144,10 +193,15 @@ class Graph(object):
"""Return an EdgeIndex object for dst.
"""
if self._adj_dst_index is None:
if len(self._edges) == 0:
v = np.array([], dtype="int64")
u = np.array([], dtype="int64")
else:
v = self._edges[:, 0]
u = self._edges[:, 1]
self._adj_dst_index = EdgeIndex(
u=self._edges[:, 1],
v=self._edges[:, 0],
num_nodes=self._num_nodes)
u=u, v=v, num_nodes=self._num_nodes)
return self._adj_dst_index
@property
......@@ -287,17 +341,11 @@ class Graph(object):
[]]
"""
if nodes is None:
if return_eids:
return self.adj_src_index.v, self.adj_src_index.eid
else:
return self.adj_src_index.v
if return_eids:
return self.adj_src_index.view_v(
nodes), self.adj_src_index.view_eid(nodes)
else:
if return_eids:
return self.adj_src_index.v[nodes], self.adj_src_index.eid[
nodes]
else:
return self.adj_src_index.v[nodes]
return self.adj_src_index.view_v(nodes)
def sample_successor(self,
nodes,
......@@ -385,17 +433,11 @@ class Graph(object):
[2]]
"""
if nodes is None:
if return_eids:
return self.adj_dst_index.v, self.adj_dst_index.eid
else:
return self.adj_dst_index.v
if return_eids:
return self.adj_dst_index.view_v(
nodes), self.adj_dst_index.view_eid(nodes)
else:
if return_eids:
return self.adj_dst_index.v[nodes], self.adj_dst_index.eid[
nodes]
else:
return self.adj_dst_index.v[nodes]
return self.adj_dst_index.view_v(nodes)
def sample_predecessor(self,
nodes,
......@@ -510,7 +552,13 @@ class Graph(object):
(key, _hide_num_nodes(value.shape), value.dtype))
return edge_feat_info
def subgraph(self, nodes, eid=None, edges=None):
def subgraph(self,
nodes,
eid=None,
edges=None,
edge_feats=None,
with_node_feat=True,
with_edge_feat=True):
"""Generate subgraph with nodes and edge ids.
This function will generate a :code:`pgl.graph.Subgraph` object and
......@@ -525,6 +573,10 @@ class Graph(object):
eid (optional): Edge ids which will be included in the subgraph.
edges (optional): Edge(src, dst) list which will be included in the subgraph.
with_node_feat: Whether to inherit node features from parent graph.
with_edge_feat: Whether to inherit edge features from parent graph.
Return:
A :code:`pgl.graph.Subgraph` object.
......@@ -547,14 +599,20 @@ class Graph(object):
len(edges), dtype="int64"), edges, reindex)
sub_edge_feat = {}
for key, value in self._edge_feat.items():
if eid is None:
raise ValueError("Eid can not be None with edge features.")
sub_edge_feat[key] = value[eid]
if edges is None:
if with_edge_feat:
for key, value in self._edge_feat.items():
if eid is None:
raise ValueError(
"Eid can not be None with edge features.")
sub_edge_feat[key] = value[eid]
else:
sub_edge_feat = edge_feats
sub_node_feat = {}
for key, value in self._node_feat.items():
sub_node_feat[key] = value[nodes]
if with_node_feat:
for key, value in self._node_feat.items():
sub_node_feat[key] = value[nodes]
subgraph = SubGraph(
num_nodes=len(nodes),
......@@ -730,6 +788,16 @@ class Graph(object):
cur_nodes = nxt_nodes
return walk
@property
def num_graph(self):
""" Return Number of Graphs"""
return self._num_graph
@property
def graph_lod(self):
""" Return Graph Lod Index for Paddle Computation"""
return self._graph_lod
class SubGraph(Graph):
"""Implementation of SubGraph in pgl.
......@@ -783,3 +851,120 @@ class SubGraph(Graph):
A list of node ids in parent graph.
"""
return graph_kernel.map_nodes(nodes, self._to_reindex)
class MultiGraph(Graph):
"""Implementation of multiple disjoint graph structure in pgl.
This is a simple implementation of graph structure in pgl.
Args:
graph_list : A list of Graph Instances
Examples:
.. code-block:: python
batch_graph = MultiGraph([graph1, graph2, graph3])
"""
def __init__(self, graph_list):
num_nodes = np.sum([g.num_nodes for g in graph_list])
node_feat = self._join_node_feature(graph_list)
edge_feat = self._join_edge_feature(graph_list)
edges = self._join_edges(graph_list)
super(MultiGraph, self).__init__(
num_nodes=num_nodes,
edges=edges,
node_feat=node_feat,
edge_feat=edge_feat)
self._num_graph = len(graph_list)
self._src_graph = graph_list
graph_lod = [g.num_nodes for g in graph_list]
graph_lod = np.cumsum(graph_lod, dtype="int32")
graph_lod = np.insert(graph_lod, 0, 0)
self._graph_lod = graph_lod
def __getitem__(self, index):
return self._src_graph[index]
def _join_node_feature(self, graph_list):
"""join node features for multiple graph"""
node_feat = defaultdict(lambda: [])
for graph in graph_list:
for key in graph.node_feat:
node_feat[key].append(graph.node_feat[key])
ret_node_feat = {}
for key in node_feat:
ret_node_feat[key] = np.vstack(node_feat[key])
return ret_node_feat
def _join_edge_feature(self, graph_list):
"""join edge features for multiple graph"""
edge_feat = defaultdict(lambda: [])
for graph in graph_list:
for key in graph.edge_feat:
efeat = graph.edge_feat[key]
if len(efeat) > 0:
edge_feat[key].append(efeat)
ret_edge_feat = {}
for key in edge_feat:
ret_edge_feat[key] = np.vstack(edge_feat[key])
return ret_edge_feat
def _join_edges(self, graph_list):
"""join edges for multiple graph"""
list_edges = []
start_offset = 0
for graph in graph_list:
edges = graph.edges
if len(edges) > 0:
edges = edges + start_offset
list_edges.append(edges)
start_offset += graph.num_nodes
edges = np.vstack(list_edges)
return edges
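# Hedged illustration (toy sizes): batching three graphs with 3, 2 and 4 nodes
# gives num_graph == 3 and graph_lod == [0, 3, 5, 9]; the nodes of graph i are
# the ids in the half-open range [graph_lod[i], graph_lod[i + 1]).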
class MemmapEdgeIndex(EdgeIndex):
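"""EdgeIndex backed by the memory-mapped .npy files written by EdgeIndex.dump."""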
def __init__(self, path):
self._degree = np.load(os.path.join(path, 'degree.npy'), mmap_mode="r")
self._sorted_u = np.load(
os.path.join(path, 'sorted_u.npy'), mmap_mode="r")
self._sorted_v = np.load(
os.path.join(path, 'sorted_v.npy'), mmap_mode="r")
self._sorted_eid = np.load(
os.path.join(path, 'sorted_eid.npy'), mmap_mode="r")
self._indptr = np.load(os.path.join(path, 'indptr.npy'), mmap_mode="r")
class MemmapGraph(Graph):
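"""Graph reloaded from the .npy files written by Graph.dump, with arrays opened in memory-mapped mode so the whole graph is not loaded into RAM."""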
def __init__(self, path):
self._num_nodes = np.load(os.path.join(path, 'num_nodes.npy'))
self._edges = np.load(os.path.join(path, 'edges.npy'), mmap_mode="r")
if os.path.isdir(os.path.join(path, 'adj_src')):
self._adj_src_index = MemmapEdgeIndex(
os.path.join(path, 'adj_src'))
else:
self._adj_src_index = None
if os.path.isdir(os.path.join(path, 'adj_dst')):
self._adj_dst_index = MemmapEdgeIndex(
os.path.join(path, 'adj_dst'))
else:
self._adj_dst_index = None
def load_feat(feat_path):
"""Load features from .npy file.
"""
feat = {}
if os.path.isdir(feat_path):
for feat_name in os.listdir(feat_path):
feat[os.path.splitext(feat_name)[0]] = np.load(
os.path.join(feat_path, feat_name), mmap_mode="r")
return feat
self._node_feat = load_feat(os.path.join(path, 'node_feat'))
self._edge_feat = load_feat(os.path.join(path, 'edge_feat'))
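# Hedged round-trip sketch (hypothetical path and toy data): dump a graph and
# reopen it with memory-mapped arrays.
#     edges = np.array([(0, 1), (1, 2), (2, 3)], dtype="int64")
#     g = Graph(num_nodes=4, edges=edges)
#     g.adj_src_index  # touch the property so the index is built and dumped
#     g.dump("/tmp/toy_graph")
#     mm_g = MemmapGraph("/tmp/toy_graph")
#     print(mm_g.num_nodes, mm_g.successor([0, 1]))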
......@@ -53,14 +53,21 @@ def build_index(np.ndarray[np.int64_t, ndim=1] u,
_tmp_eid[indptr[u[i]] + count[u[i]]] = i
_tmp_u[indptr[u[i]] + count[u[i]]] = u[i]
count[u[i]] += 1
return degree, _tmp_v, _tmp_u, _tmp_eid, indptr
cdef list output_eid = []
cdef list output_v = []
for i in xrange(n_size):
output_eid.append(_tmp_eid[indptr[i]:indptr[i+1]])
output_v.append(_tmp_v[indptr[i]:indptr[i+1]])
return np.array(output_v), np.array(output_eid), degree, _tmp_u, _tmp_v, _tmp_eid
@cython.boundscheck(False)
@cython.wraparound(False)
def slice_by_index(np.ndarray[np.int64_t, ndim=1] u,
np.ndarray[np.int64_t, ndim=1] indptr,
np.ndarray[np.int64_t, ndim=1] index):
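"""Return, for each i in ``index``, the slice u[indptr[i]:indptr[i + 1]]; used to gather per-node neighbor or edge-id lists from the CSR-style arrays."""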
cdef list output = []
cdef long long i
cdef long long h = len(index)
cdef long long j
for i in xrange(h):
j = index[i]
output.append(u[indptr[j]:indptr[j+1]])
return np.array(output)
@cython.boundscheck(False)
@cython.wraparound(False)
......@@ -212,7 +219,11 @@ def sample_subset(list nids, long long maxdegree, shuffle=False):
output.append(nids[inc])
else:
sample_size = buff_size if buff_size <= maxdegree else maxdegree
subset_choose_index(sample_size, nids[inc], rnd, buff_nid, offset)
if isinstance(nids[inc], list):
tmp = np.array(nids[inc], dtype=np.int64)
else:
tmp = nids[inc]
subset_choose_index(sample_size, tmp, rnd, buff_nid, offset)
output.append(buff_nid[offset:offset+sample_size])
offset += sample_size
return output
......@@ -245,7 +256,14 @@ def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=Fa
output_eid.append(eids[inc])
else:
sample_size = buff_size if buff_size <= maxdegree else maxdegree
subset_choose_index_eid(sample_size, nids[inc], eids[inc], rnd, buff_nid, buff_eid, offset)
if isinstance(nids[inc], list):
tmp = np.array(nids[inc], dtype=np.int64)
tmp_eids = np.array(eids[inc], dtype=np.int64)
else:
tmp = nids[inc]
tmp_eids = eids[inc]
subset_choose_index_eid(sample_size, tmp, tmp_eids, rnd, buff_nid, buff_eid, offset)
output.append(buff_nid[offset:offset+sample_size])
output_eid.append(buff_eid[offset:offset+sample_size])
offset += sample_size
......@@ -253,22 +271,10 @@ def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=Fa
@cython.boundscheck(False)
@cython.wraparound(False)
def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5):
"""Return node paris generated by skip-gram algorithm.
This function will auto remove the pair which src node is the same
as dst node.
Args:
walk_path: List of nodes as a walk path.
win_size: the window size used in skip-gram.
Return:
A tuple of (src node list, dst node list).
"""
def skip_gram_gen_pair(vector[long long] walk, long win_size=5):
cdef vector[long long] src
cdef vector[long long] dst
cdef long long l = len(walk_path)
cdef long long l = len(walk)
cdef long long real_win_size, left, right, i
cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(1, win_size+1,
dtype=np.int64, size=l)
......@@ -282,23 +288,15 @@ def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5):
if right >= l:
right = l - 1
for j in xrange(left, right+1):
if walk_path[i] == walk_path[j]:
if walk[i] == walk[j]:
continue
src.push_back(walk_path[i])
dst.push_back(walk_path[j])
src.push_back(walk[i])
dst.push_back(walk[j])
return src, dst
@cython.boundscheck(False)
@cython.wraparound(False)
def alias_sample_build_table(np.ndarray[np.float64_t, ndim=1] probs):
"""Return the alias table and event table for alias sampling.
Args:
probs: A list of float numbers as the probabilities.
Return:
A tuple of (alias table, event table).
"""
cdef long long l = len(probs)
cdef np.ndarray[np.float64_t, ndim=1] alias = probs * l
cdef np.ndarray[np.int64_t, ndim=1] events = np.zeros(l, dtype=np.int64)
......
......@@ -64,8 +64,8 @@ class HeterGraphWrapper(object):
import paddle.fluid as fluid
import numpy as np
from pgl.contrib import heter_graph
from pgl.contrib import heter_graph_wrapper
from pgl import heter_graph
from pgl import heter_graph_wrapper
num_nodes = 4
node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')]
edges = {
......
......@@ -18,7 +18,10 @@ from pgl.layers import conv
from pgl.layers.conv import *
from pgl.layers import set2set
from pgl.layers.set2set import *
from pgl.layers import graph_pool
from pgl.layers.graph_pool import *
__all__ = []
__all__ += conv.__all__
__all__ += set2set.__all__
__all__ += graph_pool.__all__