diff --git a/examples/GATNE/Dataset.py b/examples/GATNE/Dataset.py index 854e1743fc144b04ee5cf2ef62267a8bef65641c..9518e96d2d669be86d5b0d6c0db1c30f43389ef3 100644 --- a/examples/GATNE/Dataset.py +++ b/examples/GATNE/Dataset.py @@ -21,7 +21,7 @@ import tqdm import numpy as np import logging import random -from pgl.contrib import heter_graph +from pgl import heter_graph import pickle as pkl diff --git a/examples/GATNE/model.py b/examples/GATNE/model.py index 73775e802fd76f5f68a6d15a24c8decd3fe3654d..b193849e14d31eec7822fc65379a74c7cb2be38d 100644 --- a/examples/GATNE/model.py +++ b/examples/GATNE/model.py @@ -21,7 +21,7 @@ import logging import paddle.fluid as fluid import paddle.fluid.layers as fl -from pgl.contrib import heter_graph_wrapper +from pgl import heter_graph_wrapper class GATNE(object): diff --git a/examples/distribute_metapath2vec/README.md b/examples/distribute_metapath2vec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..de31aa398d281c4cec9acf351320ea5a84c6b3e6 --- /dev/null +++ b/examples/distribute_metapath2vec/README.md @@ -0,0 +1,43 @@ +# Distributed metapath2vec in PGL +[metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is an algorithmic framework for representation learning in heterogeneous networks, which contain multiple types of nodes and links. Given a heterogeneous graph, the metapath2vec algorithm first generates meta-path-based random walks and then trains a skip-gram model on these walks. Based on PGL, we reproduce the metapath2vec algorithm in distributed mode. + + +## Datasets +DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T). There are 37791 nodes in this dataset. +You can download the datasets from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding). + +We use the ```DBLP``` dataset as an example. After downloading the dataset, put it in, say, ```./data/DBLP/```. + +## Dependencies +- paddlepaddle>=1.6 +- pgl>=1.0.0 + +## How to run +Before training, run the command below to preprocess the data. +```sh +python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed +``` + +We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. ```config.yaml``` is the configuration file for metapath2vec hyperparameters and ```local_config``` is the configuration file for the PaddlePaddle parameter servers. By default, we have 2 pservers and 2 trainers. One can use ```cloud_run.sh``` to help start up the parameter servers and model trainers. + +For example, to train metapath2vec in distributed mode on the DBLP dataset: +```sh +# train metapath2vec in distributed mode. +sh cloud_run.sh + +# multiclass task example +python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_path ./checkpoints/2000 --num_nodes 37791 + +``` + + +## Hyperparameters +All the hyperparameters are saved in the ```config.yaml``` file, so before training you can open config.yaml and modify them as you like. + +Some important hyperparameters in config.yaml: +- **edge_path**: the directory of graph data that you want to load +- **lr**: learning rate +- **neg_num**: number of negative samples.
+- **num_walks**: number of walks started from each node +- **walk_len**: walk length +- **meta_path**: meta path scheme diff --git a/examples/distribute_metapath2vec/cloud_run.sh b/examples/distribute_metapath2vec/cloud_run.sh new file mode 100755 index 0000000000000000000000000000000000000000..c5b5e45fe8990396da9e68cc68f7ebd5217dcbe7 --- /dev/null +++ b/examples/distribute_metapath2vec/cloud_run.sh @@ -0,0 +1,25 @@ +#!/bin/bash +set -x +mode=${1} + +source ./utils.sh +unset http_proxy https_proxy + +source ./local_config +if [ ! -d ${log_dir} ]; then + mkdir ${log_dir} +fi + +for((i=0;i<${PADDLE_PSERVERS_NUM};i++)) +do + echo "start ps server: ${i}" + echo $log_dir + TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh &> $log_dir/pserver.$i.log & +done +sleep 10s +for((j=0;j<${PADDLE_TRAINERS_NUM};j++)) +do + echo "start ps work: ${j}" + TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh &> $log_dir/worker.$j.log & +done +tail -f $log_dir/worker.0.log diff --git a/examples/distribute_metapath2vec/cluster_train.py b/examples/distribute_metapath2vec/cluster_train.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd9716a08d173eb8d40265c7d09eccd05ce6d33 --- /dev/null +++ b/examples/distribute_metapath2vec/cluster_train.py @@ -0,0 +1,197 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import time +import os +import math +import numpy as np + +import paddle.fluid as F +import paddle.fluid.layers as L +from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig +import paddle.fluid.incubate.fleet.base.role_maker as role_maker +from pgl.utils.logger import log + +from model import Metapath2vecModel +from graph import m2vGraph +from utils import load_config +from walker import multiprocess_data_generator + + +def init_role(): + # reset the place according to role of parameter server + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + paddle_role = role_maker.Role.WORKER + place = F.CPUPlace() + if training_role == "PSERVER": + paddle_role = role_maker.Role.SERVER + + # set the fleet runtime environment according to configure + ports = os.getenv("PADDLE_PORT", "6174").split(",") + pserver_ips = os.getenv("PADDLE_PSERVERS").split(",") # ip,ip... + eplist = [] + if len(ports) > 1: + # local debug mode, multi port + for port in ports: + eplist.append(':'.join([pserver_ips[0], port])) + else: + # distributed mode, multi ip + for ip in pserver_ips: + eplist.append(':'.join([ip, ports[0]])) + + pserver_endpoints = eplist # ip:port,ip:port... 
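+    # the trainer id and pserver/worker counts read below come from env vars exported by local_config and cloud_run.sh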
+ worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0")) + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + role = role_maker.UserDefinedRoleMaker( + current_id=trainer_id, + role=paddle_role, + worker_num=worker_num, + server_endpoints=pserver_endpoints) + fleet.init(role) + + +def optimization(base_lr, loss, train_steps, optimizer='sgd'): + decayed_lr = L.learning_rate_scheduler.polynomial_decay( + learning_rate=base_lr, + decay_steps=train_steps, + end_learning_rate=0.0001 * base_lr, + power=1.0, + cycle=False) + if optimizer == 'sgd': + optimizer = F.optimizer.SGD(decayed_lr) + elif optimizer == 'adam': + optimizer = F.optimizer.Adam(decayed_lr, lazy_mode=True) + else: + raise ValueError + + log.info('learning rate:%f' % (base_lr)) + #create the DistributeTranspiler configure + config = DistributeTranspilerConfig() + config.sync_mode = False + #config.runtime_split_send_recv = False + + config.slice_var_up = False + #create the distributed optimizer + optimizer = fleet.distributed_optimizer(optimizer, config) + optimizer.minimize(loss) + + +def build_complied_prog(train_program, model_loss): + num_threads = int(os.getenv("CPU_NUM", 10)) + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) + exec_strategy = F.ExecutionStrategy() + exec_strategy.num_threads = num_threads + #exec_strategy.use_experimental_executor = True + build_strategy = F.BuildStrategy() + build_strategy.enable_inplace = True + #build_strategy.memory_optimize = True + build_strategy.memory_optimize = False + build_strategy.remove_unnecessary_lock = False + if num_threads > 1: + build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce + + compiled_prog = F.compiler.CompiledProgram( + train_program).with_data_parallel(loss_name=model_loss.name) + return compiled_prog + + +def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps): + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + step = 0 + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + while True: + try: + begin_time = time.time() + loss_val, = exe.run(program, fetch_list=[loss]) + log.info("step %s: loss %.5f speed: %.5f s/step" % + (step, np.mean(loss_val), time.time() - begin_time)) + step += 1 + except F.core.EOFException: + node2vec_pyreader.reset() + + if step % args.steps_per_save == 0 or step == train_steps: + save_path = args.save_path + if trainer_id == 0: + model_path = os.path.join(save_path, "%s" % step) + fleet.save_persistables(exe, model_path) + + if step == train_steps: + break + + +def main(args): + log.info("start") + + worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0")) + num_devices = int(os.getenv("CPU_NUM", 10)) + + model = Metapath2vecModel(config=args) + pyreader = model.pyreader + loss = model.forward() + + # init fleet + init_role() + + train_steps = math.ceil(args.num_nodes * args.epochs / args.batch_size / + num_devices / worker_num) + log.info("Train step: %s" % train_steps) + + real_batch_size = args.batch_size * args.walk_len * args.win_size + if args.optimizer == "sgd": + args.lr *= real_batch_size + optimization(args.lr, loss, train_steps, args.optimizer) + + # init and run server or worker + if fleet.is_server(): + fleet.init_server(args.warm_start_from_dir) + fleet.run_server() + + if fleet.is_worker(): + log.info("start init worker done") + fleet.init_worker() + #just the worker, load the sample + log.info("init worker done") + + exe = F.Executor(F.CPUPlace()) + exe.run(fleet.startup_program) + log.info("Startup done") + + dataset = m2vGraph(args) + 
log.info("Build graph done.") + + data_generator = multiprocess_data_generator(args, dataset) + + cur_time = time.time() + for idx, _ in enumerate(data_generator()): + log.info("iter %s: %s s" % (idx, time.time() - cur_time)) + cur_time = time.time() + if idx == 100: + break + + pyreader.decorate_tensor_provider(data_generator) + pyreader.start() + + compiled_prog = build_complied_prog(fleet.main_program, loss) + train_prog(exe, compiled_prog, loss, pyreader, args, train_steps) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='metapath2vec') + parser.add_argument("-c", "--config", type=str, default="./config.yaml") + args = parser.parse_args() + config = load_config(args.config) + log.info(config) + main(config) diff --git a/examples/distribute_metapath2vec/config.yaml b/examples/distribute_metapath2vec/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30506df33cf16dc8f3864ca23026f464a8dff0ae --- /dev/null +++ b/examples/distribute_metapath2vec/config.yaml @@ -0,0 +1,41 @@ +# graph data config +edge_path: "./data/data_processed" +edge_files: "p2a:paper_author.txt,p2c:paper_conference.txt,p2t:paper_type.txt" +node_types_file: "node_types.txt" +num_nodes: 37791 +symmetry: True + +# skipgram pair data config +win_size: 5 +neg_num: 5 +# average; m2v_plus +neg_sample_type: "average" + +# random walk config +# m2v; multi_m2v; +walk_mode: "m2v" +meta_path: "c2p-p2a-a2p-p2c" +first_node_type: "c" +walk_len: 24 +batch_size: 4 +node_shuffle: True +node_files: null +num_sample_workers: 2 + +# model config +embed_dim: 64 +is_sparse: True +# only use when num_nodes > 100,000,000, slower than noraml embedding +is_distributed: False + +# trainging config +epochs: 10 +optimizer: "sgd" +lr: 1.0 +warm_start_from_dir: null +walkpath_files: "None" +train_files: "None" +steps_per_save: 1000 +save_path: "./checkpoints" +log_dir: "./logs" +CPU_NUM: 16 diff --git a/examples/distribute_metapath2vec/data_process.py b/examples/distribute_metapath2vec/data_process.py new file mode 100644 index 0000000000000000000000000000000000000000..2426cd571c4395bd8c26454a5fd17754debd3cfc --- /dev/null +++ b/examples/distribute_metapath2vec/data_process.py @@ -0,0 +1,114 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Data preprocessing for DBLP dataset""" +import sys +import os +import argparse +import numpy as np +from collections import OrderedDict + +AUTHOR = 14475 +PAPER = 14376 +CONF = 20 +TYPE = 8920 +LABEL = 4 + + +def build_node_types(meta_node, outfile): + """build_node_types""" + nt_ori2new = {} + with open(outfile, 'w') as writer: + offset = 0 + for node_type, num_nodes in meta_node.items(): + ori_id2new_id = {} + for i in range(num_nodes): + writer.write("%d\t%s\n" % (offset + i, node_type)) + ori_id2new_id[i + 1] = offset + i + nt_ori2new[node_type] = ori_id2new_id + offset += num_nodes + return nt_ori2new + + +def remapping_index(args, src_dict, dst_dict, ori_file, new_file): + """remapping_index""" + ori_file = os.path.join(args.data_path, ori_file) + new_file = os.path.join(args.output_path, new_file) + with open(ori_file, 'r') as reader, open(new_file, 'w') as writer: + for line in reader: + slots = line.strip().split() + s = int(slots[0]) + d = int(slots[1]) + new_s = src_dict[s] + new_d = dst_dict[d] + writer.write("%d\t%d\n" % (new_s, new_d)) + + +def author_label(args, ori_id2pgl_id, ori_file, real_file, new_file): + """author_label""" + ori_file = os.path.join(args.data_path, ori_file) + real_file = os.path.join(args.data_path, real_file) + new_file = os.path.join(args.output_path, new_file) + real_id2pgl_id = {} + with open(ori_file, 'r') as reader: + for line in reader: + slots = line.strip().split() + ori_id = int(slots[0]) + real_id = int(slots[1]) + pgl_id = ori_id2pgl_id[ori_id] + real_id2pgl_id[real_id] = pgl_id + + with open(real_file, 'r') as reader, open(new_file, 'w') as writer: + for line in reader: + slots = line.strip().split() + real_id = int(slots[0]) + label = int(slots[1]) + pgl_id = real_id2pgl_id[real_id] + writer.write("%d\t%d\n" % (pgl_id, label)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='DBLP data preprocessing') + parser.add_argument( + '--data_path', + default=None, + type=str, + help='original data path(default: None)') + parser.add_argument( + '--output_path', + default=None, + type=str, + help='output path(default: None)') + args = parser.parse_args() + + meta_node = OrderedDict() + meta_node['a'] = AUTHOR + meta_node['p'] = PAPER + meta_node['c'] = CONF + meta_node['t'] = TYPE + + if not os.path.exists(args.output_path): + os.makedirs(args.output_path) + + node_types_file = os.path.join(args.output_path, "node_types.txt") + nt_ori2new = build_node_types(meta_node, node_types_file) + + remapping_index(args, nt_ori2new['p'], nt_ori2new['a'], 'paper_author.dat', + 'paper_author.txt') + remapping_index(args, nt_ori2new['p'], nt_ori2new['c'], + 'paper_conference.dat', 'paper_conference.txt') + remapping_index(args, nt_ori2new['p'], nt_ori2new['t'], 'paper_type.dat', + 'paper_type.txt') + + author_label(args, nt_ori2new['a'], 'author_map_id.dat', + 'author_label.dat', 'author_label.txt') diff --git a/examples/distribute_metapath2vec/graph.py b/examples/distribute_metapath2vec/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..5bfc06e94b4ff426ae548ae537257bc226ca67c3 --- /dev/null +++ b/examples/distribute_metapath2vec/graph.py @@ -0,0 +1,121 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import sys +import os +import numpy as np +import pickle as pkl +import tqdm +import time +import random +from pgl.utils.logger import log +from pgl import heter_graph + + +class m2vGraph(object): + """Implemetation of graph in order to sample metapath random walk. + """ + + def __init__(self, config): + self.edge_path = config.edge_path + self.num_nodes = config.num_nodes + self.symmetry = config.symmetry + edge_files = config.edge_files + node_types_file = config.node_types_file + + self.edge_file_list = [] + for pair in edge_files.split(','): + e_type, filename = pair.split(':') + filename = os.path.join(self.edge_path, filename) + self.edge_file_list.append((e_type, filename)) + + self.node_types_file = os.path.join(self.edge_path, node_types_file) + + self.build_graph() + + def build_graph(self): + """Build pgl heterogeneous graph. + """ + edges_by_types = {} + npy = self.edge_file_list[0][1] + ".npy" + if os.path.exists(npy): + log.info("load data from numpy file") + + for pair in self.edge_file_list: + edges_by_types[pair[0]] = np.load(pair[1] + ".npy") + + else: + log.info("load data from txt file") + for pair in self.edge_file_list: + edges_by_types[pair[0]] = self.load_edges(pair[1]) + # np.save(pair[1] + ".npy", edges_by_types[pair[0]]) + + for e_type, edges in edges_by_types.items(): + log.info(["number of %s edges: " % e_type, len(edges)]) + + if self.symmetry: + tmp = {} + for key, edges in edges_by_types.items(): + n_list = key.split('2') + re_key = n_list[1] + '2' + n_list[0] + tmp[re_key] = edges_by_types[key][:, [1, 0]] + edges_by_types.update(tmp) + + log.info(["finished loadding symmetry edges."]) + + node_types = self.load_node_types(self.node_types_file) + + assert len(node_types) == self.num_nodes, \ + "num_nodes should be equal to the length of node_types" + log.info(["number of nodes: ", len(node_types)]) + + node_features = { + 'index': np.array([i for i in range(self.num_nodes)]).reshape( + -1, 1).astype(np.int64) + } + + self.graph = heter_graph.HeterGraph( + num_nodes=self.num_nodes, + edges=edges_by_types, + node_types=node_types, + node_feat=node_features) + + def load_edges(self, file_, symmetry=False): + """Load edges from file. 
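+        Each line holds one edge as "src_id dst_id"; when symmetry is True, the reversed edge (dst, src) is added as well.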
+ """ + edges = [] + with open(file_, 'r') as reader: + for line in reader: + items = line.strip().split() + src, dst = int(items[0]), int(items[1]) + edges.append((src, dst)) + if symmetry: + edges.append((dst, src)) + edges = np.array(list(set(edges)), dtype=np.int64) + # edges = list(set(edges)) + return edges + + def load_node_types(self, file_): + """Load node types + """ + node_types = [] + log.info("node_types_file name: %s" % file_) + with open(file_, 'r') as reader: + for line in reader: + items = line.strip().split() + node_id = int(items[0]) + n_type = items[1] + node_types.append((node_id, n_type)) + + return node_types diff --git a/examples/distribute_metapath2vec/job.sh b/examples/distribute_metapath2vec/job.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b9ee4d1b5d981d9c4dfa920cffbb31723030dcc --- /dev/null +++ b/examples/distribute_metapath2vec/job.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -x +source ./utils.sh + +export CPU_NUM=$CPU_NUM +export FLAGS_rpc_deadline=3000000 + +export FLAGS_communicator_send_queue_size=1 +export FLAGS_communicator_min_send_grad_num_before_recv=0 +export FLAGS_communicator_max_merge_var_num=1 +export FLAGS_communicator_merge_sparse_grad=0 + +python -u cluster_train.py -c config.yaml diff --git a/examples/distribute_metapath2vec/local_config b/examples/distribute_metapath2vec/local_config new file mode 100644 index 0000000000000000000000000000000000000000..0e8ca14c66f40cfdc7beea1a0f0cd2f61b8a51ee --- /dev/null +++ b/examples/distribute_metapath2vec/local_config @@ -0,0 +1,6 @@ +#!/bin/bash +export PADDLE_TRAINERS_NUM=2 +export PADDLE_PSERVERS_NUM=2 +export PADDLE_PORT=6184,6185 +export PADDLE_PSERVERS="127.0.0.1" + diff --git a/examples/distribute_metapath2vec/model.py b/examples/distribute_metapath2vec/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8c455eef5d3b6244808656f47628212a576a47b8 --- /dev/null +++ b/examples/distribute_metapath2vec/model.py @@ -0,0 +1,158 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + metapath2vec model. 
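+    A skip-gram style model trained on (src, dst) node pairs with negative sampling.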
+""" +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals +import math + +import paddle.fluid.layers as L +import paddle.fluid as F + + +def distributed_embedding(input, + dict_size, + hidden_size, + initializer, + name, + num_part=16, + is_sparse=False, + learning_rate=1.0): + _part_size = hidden_size // num_part + if hidden_size % num_part != 0: + _part_size += 1 + output_embedding = [] + p_num = 0 + while hidden_size > 0: + _part_size = min(_part_size, hidden_size) + hidden_size -= _part_size + print("part", p_num, "size=", (dict_size, _part_size)) + part_embedding = L.embedding( + input=input, + size=(dict_size, int(_part_size)), + is_sparse=is_sparse, + is_distributed=False, + param_attr=F.ParamAttr( + name=name + '_part%s' % p_num, + initializer=initializer, + learning_rate=learning_rate)) + p_num += 1 + output_embedding.append(part_embedding) + return L.concat(output_embedding, -1) + + +class Metapath2vecModel(object): + def __init__(self, config, embedding_lr=1.0): + self.config = config + self.neg_num = self.config.neg_num + self.num_nodes = self.config.num_nodes + self.embed_dim = self.config.embed_dim + self.is_sparse = self.config.is_sparse + self.is_distributed = self.config.is_distributed + self.embedding_lr = embedding_lr + + self.pyreader = L.py_reader( + capacity=70, + shapes=[[-1, 1, 1], [-1, self.neg_num + 1, 1]], + dtypes=['int64', 'int64'], + lod_levels=[0, 0], + name='train', + use_double_buffer=True) + + bound = 1. / math.sqrt(self.embed_dim) + self.embed_init = F.initializer.Uniform(low=-bound, high=bound) + self.loss = None + max_hidden_size = int(math.pow(2, 31) / 4 / self.num_nodes) + self.num_part = int(math.ceil(1. * self.embed_dim / max_hidden_size)) + + def forward(self): + src, dsts = L.read_file(self.pyreader) + + if self.is_sparse: + src = L.reshape(src, [-1, 1]) + dsts = L.reshape(dsts, [-1, 1]) + + if self.num_part is not None and self.num_part != 1 and not self.is_distributed: + src_embed = distributed_embedding( + src, + self.num_nodes, + self.embed_dim, + self.embed_init, + "weight", + self.num_part, + self.is_sparse, + learning_rate=self.embedding_lr) + + dsts_embed = distributed_embedding( + dsts, + self.num_nodes, + self.embed_dim, + self.embed_init, + "weight", + self.num_part, + self.is_sparse, + learning_rate=self.embedding_lr) + else: + src_embed = L.embedding( + src, (self.num_nodes, self.embed_dim), + self.is_sparse, + self.is_distributed, + param_attr=F.ParamAttr( + name="weight", + learning_rate=self.embedding_lr, + initializer=self.embed_init)) + + dsts_embed = L.embedding( + dsts, (self.num_nodes, self.embed_dim), + self.is_sparse, + self.is_distributed, + param_attr=F.ParamAttr( + name="weight", + learning_rate=self.embedding_lr, + initializer=self.embed_init)) + + if self.is_sparse: + src_embed = L.reshape(src_embed, [-1, 1, self.embed_dim]) + dsts_embed = L.reshape(dsts_embed, + [-1, self.neg_num + 1, self.embed_dim]) + + logits = L.matmul( + src_embed, dsts_embed, + transpose_y=True) # [batch_size, 1, neg_num+1] + + pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1], + "float32", 1) + neg_label = L.fill_constant_batch_size_like( + logits, [-1, 1, self.neg_num], "float32", 0) + label = L.concat([pos_label, neg_label], -1) + + pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1], + "float32", self.neg_num) + neg_weight = L.fill_constant_batch_size_like( + logits, [-1, 1, self.neg_num], "float32", 1) + weight = 
L.concat([pos_weight, neg_weight], -1) + + weight.stop_gradient = True + label.stop_gradient = True + + loss = L.sigmoid_cross_entropy_with_logits(logits, label) + loss = loss * weight + loss = L.reduce_mean(loss) + loss = loss * ((self.neg_num + 1) / 2 / self.neg_num) + loss.persistable = True + self.loss = loss + return loss diff --git a/examples/distribute_metapath2vec/mp_reader.py b/examples/distribute_metapath2vec/mp_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..d4df8998cc4ad672627dc7ba9f846c91ccca0bba --- /dev/null +++ b/examples/distribute_metapath2vec/mp_reader.py @@ -0,0 +1,145 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Optimized Multiprocessing Reader for PaddlePaddle +""" +import multiprocessing +import numpy as np +import time + +import paddle.fluid as fluid +import pyarrow + + +def _serialize_serializable(obj): + """Serialize Feed Dict + """ + return {"type": type(obj), "data": obj.__dict__} + + +def _deserialize_serializable(obj): + """Deserialize Feed Dict + """ + + val = obj["type"].__new__(obj["type"]) + val.__dict__.update(obj["data"]) + return val + + +context = pyarrow.default_serialization_context() + +context.register_type( + object, + "object", + custom_serializer=_serialize_serializable, + custom_deserializer=_deserialize_serializable) + + +def serialize_data(data): + """serialize_data""" + return pyarrow.serialize(data, context=context).to_buffer().to_pybytes() + + +def deserialize_data(data): + """deserialize_data""" + return pyarrow.deserialize(data, context=context) + + +def multiprocess_reader(readers, use_pipe=True, queue_size=1000): + """ + multiprocess_reader use python multi process to read data from readers + and then use multiprocess.Queue or multiprocess.Pipe to merge all + data. The process number is equal to the number of input readers, each + process call one reader. + Multiprocess.Queue require the rw access right to /dev/shm, some + platform does not support. + you need to create multiple readers first, these readers should be independent + to each other so that each process can work independently. + An example: + .. 
code-block:: python + reader0 = reader(["file01", "file02"]) + reader1 = reader(["file11", "file12"]) + reader1 = reader(["file21", "file22"]) + reader = multiprocess_reader([reader0, reader1, reader2], + queue_size=100, use_pipe=False) + """ + + assert type(readers) is list and len(readers) > 0 + + def _read_into_queue(reader, queue): + """read_into_queue""" + for sample in reader(): + if sample is None: + raise ValueError("sample has None") + queue.put(serialize_data(sample)) + queue.put(serialize_data(None)) + + def queue_reader(): + """queue_reader""" + queue = multiprocessing.Queue(queue_size) + for reader in readers: + p = multiprocessing.Process( + target=_read_into_queue, args=(reader, queue)) + p.start() + + reader_num = len(readers) + finish_num = 0 + while finish_num < reader_num: + sample = deserialize_data(queue.get()) + if sample is None: + finish_num += 1 + else: + yield sample + + def _read_into_pipe(reader, conn): + """read_into_pipe""" + for sample in reader(): + if sample is None: + raise ValueError("sample has None!") + conn.send(serialize_data(sample)) + conn.send(serialize_data(None)) + conn.close() + + def pipe_reader(): + """pipe_reader""" + conns = [] + for reader in readers: + parent_conn, child_conn = multiprocessing.Pipe() + conns.append(parent_conn) + p = multiprocessing.Process( + target=_read_into_pipe, args=(reader, child_conn)) + p.start() + + reader_num = len(readers) + finish_num = 0 + conn_to_remove = [] + finish_flag = np.zeros(len(conns), dtype="int32") + while finish_num < reader_num: + for conn_id, conn in enumerate(conns): + if finish_flag[conn_id] > 0: + continue + buff = conn.recv() + now = time.time() + sample = deserialize_data(buff) + out = time.time() - now + if sample is None: + finish_num += 1 + conn.close() + finish_flag[conn_id] = 1 + else: + yield sample + + if use_pipe: + return pipe_reader + else: + return queue_reader diff --git a/examples/distribute_metapath2vec/multi_class.py b/examples/distribute_metapath2vec/multi_class.py new file mode 100644 index 0000000000000000000000000000000000000000..1b2c5499c4e57911862518702fb539bee681be9a --- /dev/null +++ b/examples/distribute_metapath2vec/multi_class.py @@ -0,0 +1,223 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file provides the multi class task for testing the embedding learned by metapath2vec model. +""" +import argparse +import sys +import os +import tqdm +import time +import math +import logging +import random +import pickle as pkl + +import numpy as np +import sklearn.metrics +from sklearn.metrics import f1_score + +import pgl +import paddle.fluid as fluid +import paddle.fluid.layers as fl + + +def load_data(file_): + """Load data for node classification. 
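+    Each line of the input file is "node_id label"; labels are shifted to start from 0.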
+ """ + words_label = [] + line_count = 0 + with open(file_, 'r') as reader: + for line in reader: + line_count += 1 + tokens = line.strip().split() + word, label = int(tokens[0]), int(tokens[1]) - 1 + words_label.append((word, label)) + + words_label = np.array(words_label, dtype=np.int64) + np.random.shuffle(words_label) + + logging.info('%d/%d word_label pairs have been loaded' % + (len(words_label), line_count)) + return words_label + + +def node_classify_model(config): + """Build node classify model. + """ + nodes = fl.data('nodes', shape=[None, 1], dtype='int64') + labels = fl.data('labels', shape=[None, 1], dtype='int64') + + embed_nodes = fl.embedding( + input=nodes, + size=[config.num_nodes, config.embed_dim], + param_attr=fluid.ParamAttr(name='weight')) + + embed_nodes.stop_gradient = True + probs = fl.fc(input=embed_nodes, size=config.num_labels, act='softmax') + predict = fl.argmax(probs, axis=-1) + loss = fl.cross_entropy(input=probs, label=labels) + loss = fl.reduce_mean(loss) + + return { + 'loss': loss, + 'probs': probs, + 'predict': predict, + 'labels': labels, + } + + +def run_epoch(exe, prog, model, feed_dict, lr): + """Run training process of every epoch. + """ + if lr is None: + loss, predict = exe.run(prog, + feed=feed_dict, + fetch_list=[model['loss'], model['predict']], + return_numpy=True) + lr_ = 0 + else: + loss, predict, lr_ = exe.run( + prog, + feed=feed_dict, + fetch_list=[model['loss'], model['predict'], lr], + return_numpy=True) + + macro_f1 = f1_score(feed_dict['labels'], predict, average="macro") + micro_f1 = f1_score(feed_dict['labels'], predict, average="micro") + + return { + 'loss': loss, + 'pred': predict, + 'lr': lr_, + 'macro_f1': macro_f1, + 'micro_f1': micro_f1 + } + + +def main(args): + """main function for training node classification task. 
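+    Loads the node embedding learned by metapath2vec from ckpt_path, freezes it, and trains a softmax classifier on top of it.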
+ """ + words_label = load_data(args.dataset) + + # split data for training and testing + split_position = int(words_label.shape[0] * args.train_percent) + train_words_label = words_label[0:split_position, :] + test_words_label = words_label[split_position:, :] + + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + train_prog = fluid.Program() + test_prog = fluid.Program() + startup_prog = fluid.Program() + + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + model = node_classify_model(args) + + test_prog = train_prog.clone(for_test=True) + + with fluid.program_guard(train_prog, startup_prog): + lr = fl.polynomial_decay(args.lr, 1000, 0.001) + adam = fluid.optimizer.Adam(lr) + adam.minimize(model['loss']) + + exe = fluid.Executor(place) + exe.run(startup_prog) + + def existed_params(var): + if not isinstance(var, fluid.framework.Parameter): + return False + return os.path.exists(os.path.join(args.ckpt_path, var.name)) + + fluid.io.load_vars( + exe, args.ckpt_path, main_program=train_prog, predicate=existed_params) + # load_param(args.ckpt_path, ['content']) + + feed_dict = {} + X = train_words_label[:, 0].reshape(-1, 1) + labels = train_words_label[:, 1].reshape(-1, 1) + logging.info('%d/%d data to train' % + (labels.shape[0], words_label.shape[0])) + + test_feed_dict = {} + test_X = test_words_label[:, 0].reshape(-1, 1) + test_labels = test_words_label[:, 1].reshape(-1, 1) + logging.info('%d/%d data to test' % + (test_labels.shape[0], words_label.shape[0])) + + for epoch in range(args.epochs): + feed_dict['nodes'] = X + feed_dict['labels'] = labels + train_result = run_epoch(exe, train_prog, model, feed_dict, lr) + + test_feed_dict['nodes'] = test_X + test_feed_dict['labels'] = test_labels + + test_result = run_epoch(exe, test_prog, model, test_feed_dict, lr=None) + + logging.info( + 'epoch %d | lr %.4f | train_loss %.5f | train_macro_F1 %.4f | train_micro_F1 %.4f | test_loss %.5f | test_macro_F1 %.4f | test_micro_F1 %.4f' + % (epoch, train_result['lr'], train_result['loss'], + train_result['macro_f1'], train_result['micro_f1'], + test_result['loss'], test_result['macro_f1'], + test_result['micro_f1'])) + + logging.info( + 'final_test_macro_f1 score: %.4f | final_test_micro_f1 score: %.4f' % + (test_result['macro_f1'], test_result['micro_f1'])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='multi_class') + parser.add_argument( + '--dataset', + default=None, + type=str, + help='training and testing data file(default: None)') + parser.add_argument( + '--ckpt_path', default=None, type=str, help='task name(default: None)') + parser.add_argument("--use_cuda", action='store_true', help="use_cuda") + parser.add_argument( + '--train_percent', + default=0.5, + type=float, + help='train_percent(default: 0.5)') + parser.add_argument( + '--num_labels', + default=4, + type=int, + help='number of labels(default: 4)') + parser.add_argument( + '--epochs', + default=100, + type=int, + help='number of epochs for training(default: 100)') + parser.add_argument( + '--lr', + default=0.025, + type=float, + help='learning rate(default: 0.025)') + parser.add_argument( + '--num_nodes', default=0, type=int, help='number of nodes') + parser.add_argument( + '--embed_dim', + default=64, + type=int, + help='dimension of embedding(default: 64)') + args = parser.parse_args() + + log_format = '%(asctime)s-%(levelname)s-%(name)s: %(message)s' + logging.basicConfig(level='INFO', format=log_format) + + main(args) diff --git 
a/examples/distribute_metapath2vec/utils.py b/examples/distribute_metapath2vec/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f5810f7fdd7a99b034505feaddf51a962ca34ac1 --- /dev/null +++ b/examples/distribute_metapath2vec/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implementation of some helper functions""" + +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import os +import time +import yaml +import numpy as np + +from pgl.utils.logger import log + + +class AttrDict(dict): + """Attr dict + """ + + def __init__(self, d): + self.dict = d + + def __getattr__(self, attr): + value = self.dict[attr] + if isinstance(value, dict): + return AttrDict(value) + else: + return value + + def __str__(self): + return str(self.dict) + + +def load_config(config_file): + """Load config file""" + with open(config_file) as f: + if hasattr(yaml, 'FullLoader'): + config = yaml.load(f, Loader=yaml.FullLoader) + else: + config = yaml.load(f) + + return AttrDict(config) diff --git a/examples/distribute_metapath2vec/utils.sh b/examples/distribute_metapath2vec/utils.sh new file mode 100644 index 0000000000000000000000000000000000000000..6f6daa846d600e0bcecd4ce64e04946dba0fdd51 --- /dev/null +++ b/examples/distribute_metapath2vec/utils.sh @@ -0,0 +1,20 @@ + +# parse yaml file +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i 1, "In [multi_m2v] walk_mode, the number of metapath\ + should be greater than 1" + + def __call__(self): + np.random.seed(os.getpid()) + if self.walk_mode == "m2v": + walk_generator = self.m2v_walk + log.info("walk mode is : %s" % (self.walk_mode)) + elif self.walk_mode == "multi_m2v": + walk_generator = self.multi_m2v_walk + log.info("walk mode is : %s" % (self.walk_mode)) + else: + raise ValueError("walk_mode [%s] is not matched" % self.walk_mode) + + for walks in walk_generator(): + yield walks + + def m2v_walk(self): + """Metapath2vec walker""" + for nodes in self.node_generator(): + walks = metapath_randomwalk( + self.graph, nodes, self.config.meta_path, self.config.walk_len) + + yield walks + + def multi_m2v_walk(self): + """Multi metapath2vec walker""" + meta_paths = self.config.meta_path.split(';') + for nodes, idx in self.node_generator(): + walks = metapath_randomwalk(self.graph, nodes, meta_paths[idx], + self.config.walk_len) + yield walks + + +class DataGenerator(object): + def __init__(self, config, dataset): + self.config = config + self.dataset = dataset + self.graph = self.dataset.graph + 
self.walk_generator = WalkGenerator(self.config, self.dataset) + + def __call__(self): + generator = self.pair_generate + + for src, pos, negs in generator(): + dst = np.concatenate([pos, negs], 1) + yield src, dst + + def pair_generate(self): + for walks in self.walk_generator(): + try: + src_list, pos_list = [], [] + for walk in walks: + s, p = skip_gram_gen_pair(walk, self.config.win_size) + src_list.append(s), pos_list.append(p) + src = [s for x in src_list for s in x] + pos = [s for x in pos_list for s in x] + + if len(src) == 0: + continue + + negs = self.negative_sample( + src, + pos, + neg_num=self.config.neg_num, + neg_sample_type=self.config.neg_sample_type) + + src = np.array(src, dtype=np.int64).reshape(-1, 1, 1) + pos = np.array(pos, dtype=np.int64).reshape(-1, 1, 1) + + yield src, pos, negs + + except Exception as e: + log.exception(e) + + def negative_sample(self, src, pos, neg_num, neg_sample_type): + if neg_sample_type == "average": + neg_sample_size = [len(pos), neg_num, 1] + negs = np.random.randint( + low=0, high=self.graph.num_nodes, size=neg_sample_size) + + elif neg_sample_type == "m2v_plus": + negs = [] + for s in src: + neg = self.graph.sample_nodes( + sample_num=neg_num, n_type=self.graph.node_types[s]) + negs.append(neg) + negs = np.vstack(negs).reshape(-1, neg_num, 1) + + else: # equal to "average" + neg_sample_size = [len(pos), neg_num, 1] + negs = np.random.randint( + low=0, high=self.graph.num_nodes, size=neg_sample_size) + + negs = negs.astype(np.int64) + + return negs + + +def multiprocess_data_generator(config, dataset): + """Multiprocess data generator. + """ + if config.num_sample_workers == 1: + data_generator = DataGenerator(config, dataset) + else: + pool = [ + DataGenerator(config, dataset) + for i in range(config.num_sample_workers) + ] + data_generator = mp_reader.multiprocess_reader( + pool, use_pipe=True, queue_size=100) + + return data_generator + + +if __name__ == "__main__": + config_file = "./config.yaml" + config = load_config(config_file) + dataset = m2vGraph(config) + data_generator = multiprocess_data_generator(config, dataset) + start = time.time() + cc = 0 + for src, dst in data_generator(): + log.info(src.shape) + + log.info("time: %.6f" % (time.time() - start)) + start = time.time() + cc += 1 + if cc == 100: + break diff --git a/examples/graphsage/reader.py b/examples/graphsage/reader.py index abc3fe8cbe8bdd380261ffde8933871b1a6094c9..8839b9355ce7b19065b9bdf6689c2480ccee112b 100644 --- a/examples/graphsage/reader.py +++ b/examples/graphsage/reader.py @@ -19,8 +19,8 @@ import pgl import time from pgl.utils import mp_reader from pgl.utils.logger import log -import train import time +import copy def node_batch_iter(nodes, node_label, batch_size): @@ -46,12 +46,11 @@ def traverse(item): yield item -def flat_node_and_edge(nodes, eids): +def flat_node_and_edge(nodes): """flat_node_and_edge """ nodes = list(set(traverse(nodes))) - eids = list(set(traverse(eids))) - return nodes, eids + return nodes def worker(batch_info, graph, graph_wrapper, samples): @@ -61,31 +60,42 @@ def worker(batch_info, graph, graph_wrapper, samples): def work(): """work """ - first = True + _graph_wrapper = copy.copy(graph_wrapper) + _graph_wrapper.node_feat_tensor_dict = {} for batch_train_samples, batch_train_labels in batch_info: start_nodes = batch_train_samples nodes = start_nodes - eids = [] + edges = [] for max_deg in samples: - pred, pred_eid = graph.sample_predecessor( - start_nodes, max_degree=max_deg, return_eids=True) + pred_nodes = 
graph.sample_predecessor( + start_nodes, max_degree=max_deg) + + for dst_node, src_nodes in zip(start_nodes, pred_nodes): + for src_node in src_nodes: + edges.append((src_node, dst_node)) + last_nodes = nodes - nodes = [nodes, pred] - eids = [eids, pred_eid] - nodes, eids = flat_node_and_edge(nodes, eids) + nodes = [nodes, pred_nodes] + nodes = flat_node_and_edge(nodes) # Find new nodes start_nodes = list(set(nodes) - set(last_nodes)) if len(start_nodes) == 0: break - subgraph = graph.subgraph(nodes=nodes, eid=eids) + subgraph = graph.subgraph( + nodes=nodes, + edges=edges, + with_node_feat=False, + with_edge_feat=False) + sub_node_index = subgraph.reindex_from_parrent_nodes( batch_train_samples) - feed_dict = graph_wrapper.to_feed(subgraph) + feed_dict = _graph_wrapper.to_feed(subgraph) feed_dict["node_label"] = np.expand_dims( np.array( batch_train_labels, dtype="int64"), -1) feed_dict["node_index"] = sub_node_index + feed_dict["parent_node_index"] = np.array(nodes, dtype="int64") yield feed_dict return work @@ -97,23 +107,25 @@ def multiprocess_graph_reader(graph, node_index, batch_size, node_label, + with_parent_node_index=False, num_workers=4): """multiprocess_graph_reader """ - def parse_to_subgraph(rd): + def parse_to_subgraph(rd, prefix, node_feat, _with_parent_node_index): """parse_to_subgraph """ def work(): """work """ - last = time.time() for data in rd(): - this = time.time() feed_dict = data - now = time.time() - last = now + for key in node_feat: + feed_dict[prefix + '/node_feat/' + key] = node_feat[key][ + feed_dict["parent_node_index"]] + if not _with_parent_node_index: + del feed_dict["parent_node_index"] yield feed_dict return work @@ -129,46 +141,17 @@ def multiprocess_graph_reader(graph, reader_pool.append( worker(batch_info[block_size * i:block_size * (i + 1)], graph, graph_wrapper, samples)) - multi_process_sample = mp_reader.multiprocess_reader( - reader_pool, use_pipe=True, queue_size=1000) - r = parse_to_subgraph(multi_process_sample) - return paddle.reader.buffered(r, 1000) - - return reader() - - -def graph_reader(graph, graph_wrapper, samples, node_index, batch_size, - node_label): - """graph_reader""" - - def reader(): - """reader""" - for batch_train_samples, batch_train_labels in node_batch_iter( - node_index, node_label, batch_size=batch_size): - start_nodes = batch_train_samples - nodes = start_nodes - eids = [] - for max_deg in samples: - pred, pred_eid = graph.sample_predecessor( - start_nodes, max_degree=max_deg, return_eids=True) - last_nodes = nodes - nodes = [nodes, pred] - eids = [eids, pred_eid] - nodes, eids = flat_node_and_edge(nodes, eids) - # Find new nodes - start_nodes = list(set(nodes) - set(last_nodes)) - if len(start_nodes) == 0: - break - subgraph = graph.subgraph(nodes=nodes, eid=eids) - feed_dict = graph_wrapper.to_feed(subgraph) - sub_node_index = subgraph.reindex_from_parrent_nodes( - batch_train_samples) + if len(reader_pool) == 1: + r = parse_to_subgraph(reader_pool[0], + repr(graph_wrapper), graph.node_feat, + with_parent_node_index) + else: + multi_process_sample = mp_reader.multiprocess_reader( + reader_pool, use_pipe=True, queue_size=1000) + r = parse_to_subgraph(multi_process_sample, + repr(graph_wrapper), graph.node_feat, + with_parent_node_index) + return paddle.reader.buffered(r, num_workers) - feed_dict["node_label"] = np.expand_dims( - np.array( - batch_train_labels, dtype="int64"), -1) - feed_dict["node_index"] = np.array(sub_node_index, dtype="int32") - yield feed_dict - - return paddle.reader.buffered(reader, 1000) + 
return reader() diff --git a/examples/graphsage/train.py b/examples/graphsage/train.py index fa19463a5222a306b72686ee47d3d18772808a93..da20f6e9643b25b800b87d3935feedaf3ec2a62b 100644 --- a/examples/graphsage/train.py +++ b/examples/graphsage/train.py @@ -63,10 +63,7 @@ def load_data(normalize=True, symmetry=True): log.info("Feature shape %s" % (repr(feature.shape))) graph = pgl.graph.Graph( - num_nodes=feature.shape[0], - edges=list(zip(src, dst)), - node_feat={"index": np.arange( - 0, len(feature), dtype="int64")}) + num_nodes=feature.shape[0], edges=list(zip(src, dst))) return { "graph": graph, @@ -89,7 +86,13 @@ def build_graph_model(graph_wrapper, num_class, k_hop, graphsage_type, node_label = fluid.layers.data( "node_label", shape=[None, 1], dtype="int64", append_batch_size=False) - feature = fluid.layers.gather(feature, graph_wrapper.node_feat['index']) + parent_node_index = fluid.layers.data( + "parent_node_index", + shape=[None], + dtype="int64", + append_batch_size=False) + + feature = fluid.layers.gather(feature, parent_node_index) feature.stop_gradient = True for i in range(k_hop): @@ -221,59 +224,35 @@ def main(args): exe.run(startup_program) feature_init(place) - if args.sample_workers > 1: - train_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - else: - train_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - - if args.sample_workers > 1: - val_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - else: - val_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - - if args.sample_workers > 1: - test_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) - else: - test_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) + train_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + with_parent_node_index=True, + node_index=data['train_index'], + node_label=data["train_label"]) + + val_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + with_parent_node_index=True, + node_index=data['val_index'], + node_label=data["val_label"]) + + test_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + with_parent_node_index=True, + node_index=data['test_index'], + node_label=data["test_label"]) for epoch in range(args.epoch): run_epoch( diff --git a/examples/graphsage/train_multi.py b/examples/graphsage/train_multi.py index 
8a1e799312e25ff2314de326ea3880ca0970b708..eda3a341c99e4f4b7123dc3eebf0f7f2f79617ad 100644 --- a/examples/graphsage/train_multi.py +++ b/examples/graphsage/train_multi.py @@ -195,7 +195,7 @@ def run_epoch(batch_iter, if num_trainer > 1: num_samples = sum( - [len(batch["node_index"]) for batch in batch_feed_dict]) + [len(_batch["node_index"]) for _batch in batch_feed_dict]) else: num_samples = len(batch_feed_dict["node_index"]) total_loss += batch_loss * num_samples @@ -262,59 +262,32 @@ def main(args): else: train_exe = exe - if args.sample_workers > 1: - train_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - else: - train_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - - if args.sample_workers > 1: - val_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - else: - val_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - - if args.sample_workers > 1: - test_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) - else: - test_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) + train_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['train_index'], + node_label=data["train_label"]) + + val_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['val_index'], + node_label=data["val_label"]) + + test_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['test_index'], + node_label=data["test_label"]) for epoch in range(args.epoch): run_epoch( diff --git a/examples/graphsage/train_scale.py b/examples/graphsage/train_scale.py index 31a615ff5515d54af7589389ebf7bf5a925fa6cb..f0625d0202b37ca4153b3077451b5032edaf0fbf 100644 --- a/examples/graphsage/train_scale.py +++ b/examples/graphsage/train_scale.py @@ -97,11 +97,7 @@ def load_data(normalize=True, symmetry=True, scale=1): graph = pgl.graph.Graph( num_nodes=feature.shape[0], edges=edges, - node_feat={ - "index": np.arange( - 0, len(feature), dtype="int64"), - "feature": feature - }) + node_feat={"feature": feature}) return { "graph": graph, @@ -244,59 +240,32 @@ def main(args): test_program = train_program.clone(for_test=True) - if args.sample_workers > 1: - train_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - else: 
- train_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['train_index'], - node_label=data["train_label"]) - - if args.sample_workers > 1: - val_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - else: - val_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['val_index'], - node_label=data["val_label"]) - - if args.sample_workers > 1: - test_iter = reader.multiprocess_graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - num_workers=args.sample_workers, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) - else: - test_iter = reader.graph_reader( - data['graph'], - graph_wrapper, - samples=samples, - batch_size=args.batch_size, - node_index=data['test_index'], - node_label=data["test_label"]) + train_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['train_index'], + node_label=data["train_label"]) + + val_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['val_index'], + node_label=data["val_label"]) + + test_iter = reader.multiprocess_graph_reader( + data['graph'], + graph_wrapper, + samples=samples, + num_workers=args.sample_workers, + batch_size=args.batch_size, + node_index=data['test_index'], + node_label=data["test_label"]) with fluid.program_guard(train_program, startup_program): adam = fluid.optimizer.Adam(learning_rate=args.lr) diff --git a/examples/metapath2vec/Dataset.py b/examples/metapath2vec/Dataset.py index 5bf97b548f0979adf61545d89b26aeb0b1d48b75..2c2f7e28b86ccd884b5ff177ce87009cf1292065 100644 --- a/examples/metapath2vec/Dataset.py +++ b/examples/metapath2vec/Dataset.py @@ -23,7 +23,7 @@ import tqdm import time import logging import random -from pgl.contrib import heter_graph +from pgl import heter_graph import pickle as pkl @@ -40,8 +40,10 @@ class Dataset(object): def __init__(self, config): self.config = config - self.walk_files = config['input_path'] + config['walk_path'] - self.word2id_file = config['input_path'] + config['word2id_file'] + self.walk_files = os.path.join(config['input_path'], + config['walk_path']) + self.word2id_file = os.path.join(config['input_path'], + config['word2id_file']) self.word2freq = {} self.word2id = {} @@ -65,12 +67,16 @@ class Dataset(object): for walk_file in glob.glob(self.walk_files): with open(walk_file, 'r') as reader: for walk in reader: - walk = walk.strip().split(' ') + walk = walk.strip().split() if len(walk) > 1: self.sentences_count += 1 for word in walk: - self.token_count += 1 - word_freq[word] = word_freq.get(word, 0) + 1 + if int(word) >= self.config[ + 'paper_start_index']: # remove paper + continue + else: + self.token_count += 1 + word_freq[word] = word_freq.get(word, 0) + 1 wid = 0 logging.info('Read %d sentences.' 
% self.sentences_count) @@ -123,7 +129,11 @@ class Dataset(object): for filename in walkpath_files: with open(filename) as reader: for line in reader: - words = line.strip().split(' ') + words = line.strip().split() + words = [ + w for w in words + if int(w) < self.config['paper_start_index'] + ] if len(words) > 1: word_ids = [ self.word2id[w] for w in words if w in self.word2id diff --git a/examples/metapath2vec/config.yaml b/examples/metapath2vec/config.yaml index aa91e72455e55e185f083a785335817dc26e66a6..b7629273aedb467a20b69569b51983f608d2a963 100644 --- a/examples/metapath2vec/config.yaml +++ b/examples/metapath2vec/config.yaml @@ -13,9 +13,12 @@ sampler: new_author_label_file: author_label.txt new_venue_label_file: venue_label.txt walk_saved_path: walks/ + walk_batch_size: 1000 num_walks: 1000 walk_length: 100 - metapath: conf-paper-author-paper-conf + num_sample_workers: 16 + first_node_type: conf + metapath: c2p-p2a-a2p-p2c #conf-paper-author-paper-conf optimizer: type: Adam @@ -39,9 +42,10 @@ data_loader: walk_path: walks/* word2id_file: word2id.pkl batch_size: 32 - win_size: 7 # default: 7 + win_size: 5 # default: 7 neg_num: 5 min_count: 10 + paper_start_index: 1697414 model: type: SkipgramModel diff --git a/examples/metapath2vec/model.py b/examples/metapath2vec/model.py index be199f8f3f19e15a09f350040ef5f8ab5b735fb8..5937de947f4685f450906a3d7c262a9da9096c97 100644 --- a/examples/metapath2vec/model.py +++ b/examples/metapath2vec/model.py @@ -101,7 +101,7 @@ class SkipgramModel(object): pos_score = fl.squeeze(pos_logits, axes=[1]) pos_score = fl.clip(pos_score, min=-10, max=10) - pos_score = -1.0 * fl.logsigmoid(pos_score) + pos_score = -self.neg_num * fl.logsigmoid(pos_score) neg_logits = fl.matmul( embed_src, weight_negs, @@ -111,4 +111,4 @@ class SkipgramModel(object): neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score) neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True) - self.loss = fl.reduce_mean(pos_score + neg_score) + self.loss = fl.reduce_mean(pos_score + neg_score) / self.neg_num / 2 diff --git a/examples/metapath2vec/sample.py b/examples/metapath2vec/sample.py index cbd7505d5923f807e7a8e7f6f8e3dc630f4d02c1..824f7bf89e8a2edcd614af2b2a78b713261afafd 100644 --- a/examples/metapath2vec/sample.py +++ b/examples/metapath2vec/sample.py @@ -18,6 +18,7 @@ training metapath2vec model. import multiprocessing from multiprocessing import Pool +from multiprocessing import Process import argparse import sys import os @@ -27,7 +28,7 @@ import tqdm import time import logging import random -from pgl.contrib import heter_graph +from pgl import heter_graph from pgl.sample import metapath_randomwalk from utils import * @@ -77,9 +78,14 @@ class Sampler(object): self.config['data_path'] + 'paper_conf.txt', self.paper_id2index, self.conf_id2index) - edges_by_types['edge'] = paper_author_edges + paper_conf_edges - logging.info('%d edges have been loaded.' % - (len(edges_by_types['edge']))) + # edges_by_types['edge'] = paper_author_edges + paper_conf_edges + edges_by_types['p2c'] = paper_conf_edges + edges_by_types['c2p'] = [(dst, src) for src, dst in paper_conf_edges] + edges_by_types['p2a'] = paper_author_edges + edges_by_types['a2p'] = [(dst, src) for src, dst in paper_author_edges] + + # logging.info('%d edges have been loaded.' 
% + # (len(edges_by_types['edge']))) node_features = { 'index': np.array([i for i in range(num_nodes)]).reshape( @@ -110,7 +116,7 @@ class Sampler(object): return id2index, name2index, node_types - def load_edges(self, file_, src2index, dst2index, symmetry=True): + def load_edges(self, file_, src2index, dst2index, symmetry=False): """Load edges from file. """ edges = [] @@ -143,41 +149,65 @@ class Sampler(object): return index_label_list -def generate_walks(args): - """Generate metapath random walk and save to file. +def walk_generator(graph, batch_size, metapath, n_type, walk_length): + """Generate metapath random walk. """ - g, meta_path, filename, walk_length = args - walks = [] - node_types = g._node_types - first_type = meta_path.split('-')[0] - nodes = np.where(node_types == first_type)[0] - if len(nodes) > 4000: - nodes = np.random.choice(nodes, 4000, replace=False) - - logging.info('%d number of start nodes' % (len(nodes))) - logging.info('save walks in file: %s' % (filename)) - + np.random.seed(os.getpid()) + while True: + for start_nodes in graph.node_batch_iter( + batch_size=batch_size, n_type=n_type): + walks = metapath_randomwalk( + graph=graph, + start_nodes=start_nodes, + metapath=metapath, + walk_length=walk_length) + yield walks + + +def walk_to_files(g, batch_size, metapath, n_type, walk_length, max_num, + filename): + """Generate metapath randomwalk and save in files""" + # g, batch_size, metapath, n_type, walk_length, max_num, filename = args with open(filename, 'w') as writer: - for start_node in nodes: - walk = metapath_randomwalk(g, start_node, meta_path, walk_length) - walk = [str(walk[i]) for i in range(0, len(walk), 2)] # skip paper - writer.write(' '.join(walk) + '\n') + cc = 0 + for walks in walk_generator(g, batch_size, metapath, n_type, + walk_length): + for walk in walks: + writer.write("%s\n" % "\t".join([str(i) for i in walk])) + cc += 1 + if cc == max_num: + return + return + + +def multiprocess_generate_walks_to_files(graph, n_type, meta_path, num_walks, + walk_length, batch_size, + num_sample_workers, saved_path): + """Use multiprocess to generate metapath random walk to files. + """ + num_nodes_by_type = graph.num_nodes_by_type(n_type) + logging.info("num_nodes_by_type: %s" % num_nodes_by_type) + max_num = (num_walks * num_nodes_by_type // num_sample_workers) + 1 + logging.info("max sample number of every worker: %s" % max_num) -def multiprocess_generate_walks(sampler, edge_type, meta_path, num_walks, - walk_length, saved_path): - """Use multiprocess to generate metapath random walk. 
- """ args = [] - for i in range(num_walks): - filename = saved_path + '%04d' % (i) - args.append( - (sampler.graph[edge_type], meta_path, filename, walk_length)) - - pool = Pool(16) - pool.map(generate_walks, args) - pool.close() - pool.join() + for i in range(num_sample_workers): + filename = os.path.join(saved_path, 'part-%05d' % (i)) + args.append((graph, batch_size, meta_path, n_type, walk_length, + max_num, filename)) + + ps = [] + for i in range(num_sample_workers): + p = Process(target=walk_to_files, args=args[i]) + p.start() + ps.append(p) + for i in range(num_sample_workers): + ps[i].join() + # pool = Pool(num_sample_workers) + # pool.map(walk_to_files, args) + # pool.close() + # pool.join() if __name__ == "__main__": @@ -220,13 +250,15 @@ if __name__ == "__main__": begin = time.time() logging.info('multi process sampling') - multiprocess_generate_walks( - sampler=sampler, - edge_type='edge', + multiprocess_generate_walks_to_files( + graph=sampler.graph, + n_type=config['first_node_type'], meta_path=config['metapath'], num_walks=config['num_walks'], walk_length=config['walk_length'], - saved_path=config['walk_saved_path']) + batch_size=config['walk_batch_size'], + num_sample_workers=config['num_sample_workers'], + saved_path=config['walk_saved_path'], ) logging.info('total time: %.4f' % (time.time() - begin)) logging.info('generating multi class data') diff --git a/examples/stgcn/README.md b/examples/stgcn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a185210a1fff51dfceee12ec010eac164b2ed99f --- /dev/null +++ b/examples/stgcn/README.md @@ -0,0 +1,36 @@ +# STGCN: Spatio-Temporal Graph Convolutional Network + +[Spatio-Temporal Graph Convolutional Network \(STGCN\)](https://arxiv.org/pdf/1709.04875.pdf) is a novel deep learning framework to tackle time series prediction problem. Based on PGL, we reproduce STGCN algorithms to predict new confirmed patients in some cities with the historical immigration records. + +### Datasets + +You can make your customized dataset by the following format: + +* input.csv: Historical immigration records with shape of [num\_time\_steps * num\_cities]. + +* output.csv: New confirmed patients records with shape of [num\_time\_steps * num\_cities]. + +* W.csv: Weighted Adjacency Matrix with shape of [num\_cities * num\_cities]. + +* city.csv: Each line is a number and the corresponding city name. + +### Dependencies + +- paddlepaddle 1.6 +- pgl 1.0.0 + +### How to run + +For examples, use gpu to train STGCN on your dataset. +``` +python main.py --use_cuda --input_file dataset/input_csv --label_file dataset/output.csv --adj_mat_file dataset/W.csv --city_file dataset/city.csv +``` + +#### Hyperparameters + +- n\_route: Number of city. +- n\_his: "n\_his" time steps of previous observations of historical immigration records. +- n\_pred: Next "n\_pred" time steps of New confirmed patients records. +- Ks: Number of GCN layers. +- Kt: Kernel size of temporal convolution. +- use\_cuda: Use gpu if assign use\_cuda. diff --git a/examples/stgcn/data_loader/__init__.py b/examples/stgcn/data_loader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28650aa680d220074886694622d70c2203a944e8 --- /dev/null +++ b/examples/stgcn/data_loader/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""__init__""" diff --git a/examples/stgcn/data_loader/data_utils.py b/examples/stgcn/data_loader/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c73c774d6b9b239fe8509024a52ee37a4890f9 --- /dev/null +++ b/examples/stgcn/data_loader/data_utils.py @@ -0,0 +1,251 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""data processing +""" +import numpy as np +import pandas as pd + +from utils.math_utils import z_score + + +class Dataset(object): + """Dataset + """ + + def __init__(self, data, stats): + self.__data = data + self.mean = stats['mean'] + self.std = stats['std'] + + def get_data(self, type): # type: train, val or test + return self.__data[type] + + def get_stats(self): + return {'mean': self.mean, 'std': self.std} + + def get_len(self, type): + return len(self.__data[type]) + + def z_inverse(self, type): + return self.__data[type] * self.std + self.mean + + +def seq_gen(len_seq, data_seq, offset, n_frame, n_route, day_slot, C_0=1): + """Generate data in the form of standard sequence unit.""" + n_slot = day_slot - n_frame + 1 + + tmp_seq = np.zeros((len_seq * n_slot, n_frame, n_route, C_0)) + for i in range(len_seq): + for j in range(n_slot): + sta = (i + offset) * day_slot + j + end = sta + n_frame + tmp_seq[i * n_slot + j, :, :, :] = np.reshape( + data_seq[sta:end, :], [n_frame, n_route, C_0]) + return tmp_seq + + +def adj_matrx_gen_custom(input_file, city_file): + """Generate the adjacency matrix from the raw migration file. + """ + print("generate adj_matrix data (this may take a long time)...") + # data + df = pd.read_csv( + input_file, + sep='\t', + names=['date', '迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数']) + # keep only the records from 2020 + df['date'] = pd.to_datetime(df['date'], format="%Y%m%d") + df = df.set_index('date') + df = df['2020'] + city_df = pd.read_csv(city_file) + # drop Wuhan (the first row) + city_df = city_df.drop(0) + num = len(city_df) + matrix = np.zeros([num, num]) + for i in city_df['city']: + for j in city_df['city']: + if (i == j): + continue + # select the daily number of people moving from city i to city j + cut = df[df['迁出城市'].str.contains(i)] + cut = cut[cut['迁入城市'].str.contains(j)] + # use the mean as the edge weight + average = cut['人数'].mean() + # write the weight into the matrix + i_index = int(city_df[city_df['city'] == i]['num']) - 1 + j_index = int(city_df[city_df['city'] == j]['num']) - 1 + matrix[i_index, j_index] = average + + np.savetxt("dataset/W_74.csv", matrix, delimiter=",") + + +def data_gen_custom(input_file, output_file, city_file, n, n_his, n_pred, + n_config): + """data_gen_custom""" + print("generate training data...") + # data + df = pd.read_csv( + input_file, + sep='\t', + names=['date', 
'迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数']) + # keep only the records from 2020 + df['date'] = pd.to_datetime(df['date'], format="%Y%m%d") + df = df.set_index('date') + df = df['2020'] + city_df = pd.read_csv(city_file) + input_df = pd.DataFrame() + + out_df_wuhan = df[df['迁出城市'].str.contains('武汉')] + for i in city_df['city']: + # select records whose destination city is i + in_df_i = out_df_wuhan[out_df_wuhan['迁入城市'].str.contains(i)] + # make sure the records are in ascending time order + # in_df_i.sort_values("date",inplace=True) + # insert the daily counts in time order + in_df_i.reset_index(drop=True, inplace=True) + input_df[i] = in_df_i['人数'] + + # replace NaN values with 0 + input_df = input_df.replace(np.nan, 0) + + x = input_df + y = pd.read_csv(output_file) + # drop the first (unnamed) column + x.drop( + x.columns[x.columns.str.contains( + 'unnamed', case=False)], + axis=1, + inplace=True) + y = y.drop(columns=['date']) + + # drop the column for people moving into Wuhan + x = x.drop(columns=['武汉']) + y = y.drop(columns=['武汉']) + + # param + n_val, n_test = n_config + n_train = len(y) - n_val - n_test - 2 + + # (?,26,74,1) + df = pd.DataFrame(columns=x.columns) + for i in range(len(y) - n_pred + 1): + df = df.append(x[i:i + n_his]) + df = df.append(y[i:i + n_pred]) + data = df.values.reshape(-1, n_his + n_pred, n, + 1) # n == num_nodes == number of cities + + x_stats = {'mean': np.mean(data), 'std': np.std(data)} + + x_train = data[:n_train] + x_val = data[n_train:n_train + n_val] + x_test = data[n_train + n_val:] + + x_data = {'train': x_train, 'val': x_val, 'test': x_test} + dataset = Dataset(x_data, x_stats) + print("generated successfully!") + + return dataset + + +def data_gen_mydata(input_file, label_file, n, n_his, n_pred, n_config): + """data processing + """ + # data + x = pd.read_csv(input_file) + y = pd.read_csv(label_file) + x = x.drop(columns=['date']) + y = y.drop(columns=['date']) + + x = x.drop(columns=['武汉']) + y = y.drop(columns=['武汉']) + + # param + n_val, n_test = n_config + n_train = len(y) - n_val - n_test - 2 + + # (?,26,74,1) + df = pd.DataFrame(columns=x.columns) + for i in range(len(y) - n_pred + 1): + df = df.append(x[i:i + n_his]) + df = df.append(y[i:i + n_pred]) + + data = df.values.reshape(-1, n_his + n_pred, n, 1) + + x_stats = {'mean': np.mean(data), 'std': np.std(data)} + + x_train = data[:n_train] + x_val = data[n_train:n_train + n_val] + x_test = data[n_train + n_val:] + + x_data = {'train': x_train, 'val': x_val, 'test': x_test} + dataset = Dataset(x_data, x_stats) + return dataset + + +def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288): + """Source file load and dataset generation.""" + n_train, n_val, n_test = data_config + # generate training, validation and test data + try: + data_seq = pd.read_csv(file_path, header=None).values + except FileNotFoundError: + print(f'ERROR: input file was not found in {file_path}.') + + seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot) + seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot) + seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route, + day_slot) + + # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation. + x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)} + + # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size]. 
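+    # note: the validation and test splits are standardized with the training-set mean/std computed above, so no statistics leak from val/test into training.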
+ x_train = z_score(seq_train, x_stats['mean'], x_stats['std']) + x_val = z_score(seq_val, x_stats['mean'], x_stats['std']) + x_test = z_score(seq_test, x_stats['mean'], x_stats['std']) + + x_data = {'train': x_train, 'val': x_val, 'test': x_test} + dataset = Dataset(x_data, x_stats) + return dataset + + +def gen_batch(inputs, batch_size, dynamic_batch=False, shuffle=False): + """Data iterator in batch. + + Args: + inputs: np.ndarray, [len_seq, n_frame, n_route, C_0], standard sequence units. + batch_size: int, size of batch. + dynamic_batch: bool, whether changes the batch size in the last batch + if its length is less than the default. + shuffle: bool, whether shuffle the batches. + """ + len_inputs = len(inputs) + + if shuffle: + idx = np.arange(len_inputs) + np.random.shuffle(idx) + + for start_idx in range(0, len_inputs, batch_size): + end_idx = start_idx + batch_size + if end_idx > len_inputs: + if dynamic_batch: + end_idx = len_inputs + else: + break + if shuffle: + slide = idx[start_idx:end_idx] + else: + slide = slice(start_idx, end_idx) + + yield inputs[slide] diff --git a/examples/stgcn/data_loader/graph.py b/examples/stgcn/data_loader/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..fc04eecd1779ab82cf83fd42be8e7a5c8a530770 --- /dev/null +++ b/examples/stgcn/data_loader/graph.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PGL Graph +""" +import sys +import os +import numpy as np +import pandas as pd + +from pgl.graph import Graph + + +def weight_matrix(file_path, sigma2=0.1, epsilon=0.5, scaling=True): + """Load weight matrix function.""" + try: + W = pd.read_csv(file_path, header=None).values + except FileNotFoundError: + print(f'ERROR: input file was not found in {file_path}.') + + # check whether W is a 0/1 matrix. + if set(np.unique(W)) == {0, 1}: + print('The input graph is a 0/1 matrix; set "scaling" to False.') + scaling = False + + if scaling: + n = W.shape[0] + W = W / 10000. 
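+        # build a thresholded Gaussian kernel from the scaled counts: entries with exp(-W^2 / sigma2) below epsilon are zeroed, and self-loops are removed by W_mask.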
+ W2, W_mask = W * W, np.ones([n, n]) - np.identity(n) + # refer to Eq.10 + return np.exp(-W2 / sigma2) * ( + np.exp(-W2 / sigma2) >= epsilon) * W_mask + else: + return W + + +class GraphFactory(object): + """GraphFactory""" + + def __init__(self, args): + self.args = args + self.adj_matrix = weight_matrix(self.args.adj_mat_file) + + L = np.eye(self.adj_matrix.shape[0]) + self.adj_matrix + D = np.sum(self.adj_matrix, axis=1) + # L = D - self.adj_matrix + # import ipdb; ipdb.set_trace() + + edges = [] + weights = [] + for i in range(self.adj_matrix.shape[0]): + for j in range(self.adj_matrix.shape[1]): + edges.append([i, j]) + weights.append(L[i][j]) + + self.edges = np.array(edges, dtype=np.int64) + self.weights = np.array(weights, dtype=np.float32).reshape(-1, 1) + + self.norm = np.zeros_like(D, dtype=np.float32) + self.norm[D > 0] = np.power(D[D > 0], -0.5) + self.norm = self.norm.reshape(-1, 1) + + def build_graph(self, x_batch): + """build graph""" + B, T, n, _ = x_batch.shape + batch = B * T + + batch_edges = [] + for i in range(batch): + batch_edges.append(self.edges + (i * n)) + batch_edges = np.vstack(batch_edges) + + num_nodes = B * T * n + node_feat = {'norm': np.tile(self.norm, [batch, 1])} + edge_feat = {'weights': np.tile(self.weights, [batch, 1])} + graph = Graph( + num_nodes=num_nodes, + edges=batch_edges, + node_feat=node_feat, + edge_feat=edge_feat) + + return graph diff --git a/examples/stgcn/main.py b/examples/stgcn/main.py new file mode 100644 index 0000000000000000000000000000000000000000..6be8df9991177daf2ba9fed1f39eebbb8e8ad83f --- /dev/null +++ b/examples/stgcn/main.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file implement the training process of STGCN model. 
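+It builds the city graph with GraphFactory, constructs the STGCN network, and periodically runs inference on the validation and test sets during training.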
+""" + +import os +import sys +import time +import argparse +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as fl +import pgl +from pgl.utils.logger import log + +from data_loader.data_utils import data_gen_mydata, gen_batch +from data_loader.graph import GraphFactory +from models.model import STGCNModel +from models.tester import model_inference, model_test + + +def main(args): + """main""" + PeMS = data_gen_mydata(args.input_file, args.label_file, args.n_route, + args.n_his, args.n_pred, (args.n_val, args.n_test)) + + log.info(PeMS.get_stats()) + log.info(PeMS.get_len('train')) + + gf = GraphFactory(args) + + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + train_program = fluid.Program() + startup_program = fluid.Program() + + with fluid.program_guard(train_program, startup_program): + gw = pgl.graph_wrapper.GraphWrapper( + "gw", + place, + node_feat=[('norm', [None, 1], "float32")], + edge_feat=[('weights', [None, 1], "float32")]) + + model = STGCNModel(args, gw) + train_loss, y_pred = model.forward() + + infer_program = train_program.clone(for_test=True) + + with fluid.program_guard(train_program, startup_program): + epoch_step = int(PeMS.get_len('train') / args.batch_size) + 1 + lr = fl.exponential_decay( + learning_rate=args.lr, + decay_steps=5 * epoch_step, + decay_rate=0.7, + staircase=True) + if args.opt == 'RMSProp': + train_op = fluid.optimizer.RMSPropOptimizer(lr).minimize( + train_loss) + elif args.opt == 'ADAM': + train_op = fluid.optimizer.Adam(lr).minimize(train_loss) + + exe = fluid.Executor(place) + exe.run(startup_program) + + if args.inf_mode == 'sep': + # for inference mode 'sep', the type of step index is int. + step_idx = args.n_pred - 1 + tmp_idx = [step_idx] + min_val = min_va_val = np.array([4e1, 1e5, 1e5]) + elif args.inf_mode == 'merge': + # for inference mode 'merge', the type of step index is np.ndarray. 
+ step_idx = tmp_idx = np.arange(3, args.n_pred + 1, 3) - 1 + min_val = min_va_val = np.array([4e1, 1e5, 1e5]) * len(step_idx) + else: + raise ValueError(f'ERROR: test mode "{args.inf_mode}" is not defined.') + + step = 0 + for epoch in range(1, args.epochs + 1): + for idx, x_batch in enumerate( + gen_batch( + PeMS.get_data('train'), + args.batch_size, + dynamic_batch=True, + shuffle=True)): + + x = np.array(x_batch[:, 0:args.n_his, :, :], dtype=np.float32) + graph = gf.build_graph(x) + feed = gw.to_feed(graph) + feed['input'] = np.array( + x_batch[:, 0:args.n_his + 1, :, :], dtype=np.float32) + b_loss, b_lr = exe.run(train_program, + feed=feed, + fetch_list=[train_loss, lr]) + + if idx % 5 == 0: + log.info("epoch %d | step %d | lr %.6f | loss %.6f" % + (epoch, idx, b_lr[0], b_loss[0])) + + min_va_val, min_val = \ + model_inference(exe, gw, gf, infer_program, y_pred, PeMS, args, \ + step_idx, min_va_val, min_val) + + for ix in tmp_idx: + va, te = min_va_val[ix - 2:ix + 1], min_val[ix - 2:ix + 1] + print(f'Time Step {ix + 1}: ' + f'MAPE {va[0]:7.3%}, {te[0]:7.3%}; ' + f'MAE {va[1]:4.3f}, {te[1]:4.3f}; ' + f'RMSE {va[2]:6.3f}, {te[2]:6.3f}.') + + if epoch % 5 == 0: + model_test(exe, gw, gf, infer_program, y_pred, PeMS, args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--n_route', type=int, default=74) + parser.add_argument('--n_his', type=int, default=23) + parser.add_argument('--n_pred', type=int, default=3) + parser.add_argument('--batch_size', type=int, default=10) + parser.add_argument('--epochs', type=int, default=100) + parser.add_argument('--save', type=int, default=10) + parser.add_argument('--Ks', type=int, default=3) #equal to num_layers + parser.add_argument('--Kt', type=int, default=3) + parser.add_argument('--lr', type=float, default=1e-2) + parser.add_argument('--keep_prob', type=float, default=1.0) + parser.add_argument('--opt', type=str, default='RMSProp') + parser.add_argument('--inf_mode', type=str, default='sep') + parser.add_argument('--input_file', type=str, default='dataset/input.csv') + parser.add_argument('--label_file', type=str, default='dataset/output.csv') + parser.add_argument( + '--city_file', type=str, default='dataset/crawl_list.csv') + parser.add_argument('--adj_mat_file', type=str, default='dataset/W_74.csv') + parser.add_argument('--output_path', type=str, default='./outputs/') + parser.add_argument('--n_val', type=str, default=1) + parser.add_argument('--n_test', type=str, default=1) + parser.add_argument('--use_cuda', action='store_true') + args = parser.parse_args() + + blocks = [[1, 32, 64], [64, 32, 128]] + args.blocks = blocks + log.info(args) + if not os.path.exists(args.output_path): + os.makedirs(args.output_path) + + main(args) diff --git a/examples/stgcn/models/model.py b/examples/stgcn/models/model.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8f4963995550d97e5420134558c63dac3a723f --- /dev/null +++ b/examples/stgcn/models/model.py @@ -0,0 +1,271 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""This file implement the STGCN model. +""" +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as fl +import pgl + + +class STGCNModel(object): + """Implementation of Spatio-Temporal Graph Convolutional Networks""" + + def __init__(self, args, gw): + self.args = args + self.gw = gw + + self.input = fl.data( + name="input", + shape=[None, args.n_his + 1, args.n_route, 1], + dtype="float32") + + def forward(self): + """forward""" + x = self.input[:, 0:self.args.n_his, :, :] + # Ko>0: kernel size of temporal convolution in the output layer. + Ko = self.args.n_his + # ST-Block + for i, channels in enumerate(self.args.blocks): + x = self.st_conv_block( + x, + self.args.Ks, + self.args.Kt, + channels, + "st_conv_%d" % i, + self.args.keep_prob, + act_func='GLU') + + # output layer + if Ko > 1: + y = self.output_layer(x, Ko, 'output_layer') + else: + raise ValueError(f'ERROR: kernel size Ko must be greater than 1, \ + but received "{Ko}".') + + label = self.input[:, self.args.n_his:self.args.n_his + 1, :, :] + train_loss = fl.reduce_sum((y - label) * (y - label)) + single_pred = y[:, 0, :, :] # shape: [batch, n, 1] + + return train_loss, single_pred + + def st_conv_block(self, + x, + Ks, + Kt, + channels, + name, + keep_prob, + act_func='GLU'): + """Spatio-Temporal convolution block""" + c_si, c_t, c_oo = channels + + x_s = self.temporal_conv_layer( + x, Kt, c_si, c_t, "%s_tconv_in" % name, act_func=act_func) + x_t = self.spatio_conv_layer(x_s, Ks, c_t, c_t, "%s_sonv" % name) + x_o = self.temporal_conv_layer(x_t, Kt, c_t, c_oo, + "%s_tconv_out" % name) + + x_ln = fl.layer_norm(x_o) + return fl.dropout(x_ln, dropout_prob=(1.0 - keep_prob)) + + def temporal_conv_layer(self, x, Kt, c_in, c_out, name, act_func='relu'): + """Temporal convolution layer""" + _, T, n, _ = x.shape + if c_in > c_out: + x_input = fl.conv2d( + input=x, + num_filters=c_out, + filter_size=[1, 1], + stride=[1, 1], + padding="SAME", + data_format="NHWC", + param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name)) + elif c_in < c_out: + # if the size of input channel is less than the output, + # padding x to the same size of output channel. 
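+            # (zero-valued channels are appended so the residual input x_input matches c_out and can be added to the convolution output below)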
+ pad = fl.fill_constant_batch_size_like( + input=x, + shape=[-1, T, n, c_out - c_in], + dtype="float32", + value=0.0) + x_input = fl.concat([x, pad], axis=3) + else: + x_input = x + + # x_input = x_input[:, Kt - 1:T, :, :] + if act_func == 'GLU': + # gated liner unit + bt_init = fluid.initializer.ConstantInitializer(value=0.0) + bt = fl.create_parameter( + shape=[2 * c_out], + dtype="float32", + attr=fluid.ParamAttr( + name="%s_bt" % name, trainable=True, initializer=bt_init), + ) + x_conv = fl.conv2d( + input=x, + num_filters=2 * c_out, + filter_size=[Kt, 1], + stride=[1, 1], + padding="SAME", + data_format="NHWC", + param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name)) + x_conv = x_conv + bt + return (x_conv[:, :, :, 0:c_out] + x_input + ) * fl.sigmoid(x_conv[:, :, :, -c_out:]) + else: + bt_init = fluid.initializer.ConstantInitializer(value=0.0) + bt = fl.create_parameter( + shape=[c_out], + dtype="float32", + attr=fluid.ParamAttr( + name="%s_bt" % name, trainable=True, initializer=bt_init), + ) + x_conv = fl.conv2d( + input=x, + num_filters=c_out, + filter_size=[Kt, 1], + stride=[1, 1], + padding="SAME", + data_format="NHWC", + param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name)) + x_conv = x_conv + bt + if act_func == "linear": + return x_conv + elif act_func == "sigmoid": + return fl.sigmoid(x_conv) + elif act_func == "relu": + return fl.relu(x_conv + x_input) + else: + raise ValueError( + f'ERROR: activation function "{act_func}" is not defined.') + + def spatio_conv_layer(self, x, Ks, c_in, c_out, name): + """Spatio convolution layer""" + _, T, n, _ = x.shape + if c_in > c_out: + x_input = fl.conv2d( + input=x, + num_filters=c_out, + filter_size=[1, 1], + stride=[1, 1], + padding="SAME", + data_format="NHWC", + param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name)) + elif c_in < c_out: + # if the size of input channel is less than the output, + # padding x to the same size of output channel. + pad = fl.fill_constant_batch_size_like( + input=x, + shape=[-1, T, n, c_out - c_in], + dtype="float32", + value=0.0) + x_input = fl.concat([x, pad], axis=3) + else: + x_input = x + + for i in range(Ks): + # x_input shape: [B,T, num_nodes, c_out] + x_input = fl.reshape(x_input, [-1, c_out]) + + x_input = self.message_passing( + self.gw, + x_input, + name="%s_mp_%d" % (name, i), + norm=self.gw.node_feat["norm"]) + + x_input = fl.fc(x_input, + size=c_out, + bias_attr=False, + param_attr=fluid.ParamAttr(name="%s_gcn_fc_%d" % + (name, i))) + + bias = fluid.layers.create_parameter( + shape=[c_out], + dtype='float32', + is_bias=True, + name='%s_gcn_bias_%d' % (name, i)) + x_input = fluid.layers.elementwise_add(x_input, bias, act="relu") + + x_input = fl.reshape(x_input, [-1, T, n, c_out]) + + return x_input + + def message_passing(self, gw, feature, name, norm=None): + """Message passing layer""" + + def send_src_copy(src_feat, dst_feat, edge_feat): + """send function""" + return src_feat["h"] * edge_feat['w'] + + if norm is not None: + feature = feature * norm + + msg = gw.send( + send_src_copy, + nfeat_list=[("h", feature)], + efeat_list=[('w', gw.edge_feat['weights'])]) + output = gw.recv(msg, "sum") + + if norm is not None: + output = output * norm + + return output + + def output_layer(self, x, T, name, act_func='GLU'): + """Output layer""" + _, _, n, channel = x.shape + + # maps multi-steps to one. 
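+        # a GLU temporal convolution with kernel size T, followed by layer norm and a sigmoid temporal convolution, produces the output that is then reduced to a single channel.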
+ x_i = self.temporal_conv_layer( + x=x, + Kt=T, + c_in=channel, + c_out=channel, + name="%s_in" % name, + act_func=act_func) + x_ln = fl.layer_norm(x_i) + x_o = self.temporal_conv_layer( + x=x_ln, + Kt=1, + c_in=channel, + c_out=channel, + name="%s_out" % name, + act_func='sigmoid') + + # maps multi-channels to one. + x_fc = self.fully_con_layer( + x=x_o, n=n, channel=channel, name="%s_fc" % name) + return x_fc + + def fully_con_layer(self, x, n, channel, name): + """Fully connected layer""" + bt_init = fluid.initializer.ConstantInitializer(value=0.0) + bt = fl.create_parameter( + shape=[n, 1], + dtype="float32", + attr=fluid.ParamAttr( + name="%s_bt" % name, trainable=True, initializer=bt_init), ) + x_conv = fl.conv2d( + input=x, + num_filters=1, + filter_size=[1, 1], + stride=[1, 1], + padding="SAME", + data_format="NHWC", + param_attr=fluid.ParamAttr(name="%s_conv2d" % name)) + x_conv = x_conv + bt + return x_conv diff --git a/examples/stgcn/models/tester.py b/examples/stgcn/models/tester.py new file mode 100644 index 0000000000000000000000000000000000000000..5c10caeb4a7a01930ecb544fef13c272a4dffd56 --- /dev/null +++ b/examples/stgcn/models/tester.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This file implement the testing process of STGCN model. +""" +import os +import sys +import time +import argparse +import numpy as np +import pandas as pd + +import paddle.fluid as fluid +import paddle.fluid.layers as fl +import pgl +from pgl.utils.logger import log + +from data_loader.data_utils import gen_batch +from utils.math_utils import evaluation + + +def multi_pred(exe, gw, gf, program, y_pred, seq, batch_size, \ + n_his, n_pred, step_idx, dynamic_batch=True): + """multi step prediction""" + pred_list = [] + for i in gen_batch( + seq, min(batch_size, len(seq)), dynamic_batch=dynamic_batch): + + # Note: use np.copy() to avoid the modification of source data. + test_seq = np.copy(i[:, 0:n_his + 1, :, :]).astype(np.float32) + graph = gf.build_graph(i[:, 0:n_his, :, :]) + feed = gw.to_feed(graph) + step_list = [] + for j in range(n_pred): + feed['input'] = test_seq + pred = exe.run(program, feed=feed, fetch_list=[y_pred]) + if isinstance(pred, list): + pred = np.array(pred[0]) + test_seq[:, 0:n_his - 1, :, :] = test_seq[:, 1:n_his, :, :] + test_seq[:, n_his - 1, :, :] = pred + step_list.append(pred) + pred_list.append(step_list) + # pred_array -> [n_pred, len(seq), n_route, C_0) + pred_array = np.concatenate(pred_list, axis=1) + return pred_array, pred_array.shape[1] + + +def model_inference(exe, gw, gf, program, pred, inputs, args, step_idx, + min_va_val, min_val): + """inference model""" + x_val, x_test, x_stats = inputs.get_data('val'), inputs.get_data( + 'test'), inputs.get_stats() + + if args.n_his + args.n_pred > x_val.shape[1]: + raise ValueError( + f'ERROR: the value of n_pred "{args.n_pred}" exceeds the length limit.' 
+ ) + + # y_val shape: [n_pred, len(x_val), n_route, C_0) + y_val, len_val = multi_pred(exe, gw, gf, program, pred, \ + x_val, args.batch_size, args.n_his, args.n_pred, step_idx) + + evl_val = evaluation(x_val[0:len_val, step_idx + args.n_his, :, :], + y_val[step_idx], x_stats) + + # chks: indicator that reflects the relationship of values between evl_val and min_va_val. + chks = evl_val < min_va_val + # update the metric on test set, if model's performance got improved on the validation. + if sum(chks): + min_va_val[chks] = evl_val[chks] + y_pred, len_pred = multi_pred(exe, gw, gf, program, pred, \ + x_test, args.batch_size, args.n_his, args.n_pred, step_idx) + + evl_pred = evaluation(x_test[0:len_pred, step_idx + args.n_his, :, :], + y_pred[step_idx], x_stats) + min_val = evl_pred + + return min_va_val, min_val + + +def model_test(exe, gw, gf, program, pred, inputs, args): + """test model""" + if args.inf_mode == 'sep': + # for inference mode 'sep', the type of step index is int. + step_idx = args.n_pred - 1 + tmp_idx = [step_idx] + elif args.inf_mode == 'merge': + # for inference mode 'merge', the type of step index is np.ndarray. + step_idx = tmp_idx = np.arange(3, args.n_pred + 1, 3) - 1 + print(step_idx) + else: + raise ValueError(f'ERROR: test mode "{args.inf_mode}" is not defined.') + + x_test, x_stats = inputs.get_data('test'), inputs.get_stats() + y_test, len_test = multi_pred(exe, gw, gf, program, pred, \ + x_test, args.batch_size, args.n_his, args.n_pred, step_idx) + + # save result + gt = x_test[0:len_test, args.n_his:, :, :].reshape(-1, args.n_route) + y_pred = y_test.reshape(-1, args.n_route) + city_df = pd.read_csv(args.city_file) + city_df = city_df.drop(0) + + np.savetxt( + os.path.join(args.output_path, "groundtruth.csv"), + gt.astype(np.int32), + fmt='%d', + delimiter=',', + header=",".join(city_df['city'])) + np.savetxt( + os.path.join(args.output_path, "prediction.csv"), + y_pred.astype(np.int32), + fmt='%d', + delimiter=",", + header=",".join(city_df['city'])) + + for i in range(step_idx + 1): + evl = evaluation(x_test[0:len_test, step_idx + args.n_his, :, :], + y_test[i], x_stats) + for ix in tmp_idx: + te = evl[ix - 2:ix + 1] + print( + f'Time Step {i + 1}: MAPE {te[0]:7.3%}; MAE {te[1]:4.3f}; RMSE {te[2]:6.3f}.' + ) diff --git a/examples/stgcn/utils/math_utils.py b/examples/stgcn/utils/math_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc0b3f77bc975da91be82442f434837ab0fee89 --- /dev/null +++ b/examples/stgcn/utils/math_utils.py @@ -0,0 +1,65 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation""" +import os +import sys +import time +import argparse +import numpy as np + + +def z_score(x, mean, std): + """z_score""" + return (x - mean) / std + + +def z_inverse(x, mean, std): + """The inverse of function z_score""" + return x * std + mean + + +def MAPE(v, v_): + """Mean absolute percentage error.""" + return np.mean(np.abs(v_ - v) / (v + 1e-5)) + + +def RMSE(v, v_): + """Mean squared error.""" + return np.sqrt(np.mean((v_ - v)**2)) + + +def MAE(v, v_): + """Mean absolute error.""" + return np.mean(np.abs(v_ - v)) + + +def evaluation(y, y_, x_stats): + """Calculate MAPE, MAE and RMSE between ground truth and prediction.""" + dim = len(y_.shape) + + if dim == 3: + # single_step case + v = z_inverse(y, x_stats['mean'], x_stats['std']) + v_ = z_inverse(y_, x_stats['mean'], x_stats['std']) + return np.array([MAPE(v, v_), MAE(v, v_), RMSE(v, v_)]) + else: + # multi_step case + tmp_list = [] + # y -> [time_step, batch_size, n_route, 1] + y = np.swapaxes(y, 0, 1) + # recursively call + for i in range(y_.shape[0]): + tmp_res = evaluation(y[i], y_[i], x_stats) + tmp_list.append(tmp_res) + return np.concatenate(tmp_list, axis=-1) diff --git a/ogb_examples/graphproppred/main_pgl.py b/ogb_examples/graphproppred/main_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7c112e5a364db8d26d297d5b4f297e2b6ef7ad --- /dev/null +++ b/ogb_examples/graphproppred/main_pgl.py @@ -0,0 +1,189 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""test ogb +""" +import argparse + +import pgl +import numpy as np +import paddle.fluid as fluid +from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset +from pgl.utils import paddle_helper +from ogb.graphproppred import Evaluator +from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder + + +def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset, + evaluator, fetch_loss, fetch_pred): + """Train""" + graphs, labels = dataset[splitted_idx["train"]] + perm = np.arange(0, len(graphs)) + np.random.shuffle(perm) + start_batch = 0 + batch_no = 0 + pred_output = np.zeros_like(labels, dtype="float32") + while start_batch < len(perm): + batch_index = perm[start_batch:start_batch + batch_size] + start_batch += batch_size + batch_graph = pgl.graph.MultiGraph(graphs[batch_index]) + batch_label = labels[batch_index] + batch_valid = (batch_label == batch_label).astype("float32") + batch_label = np.nan_to_num(batch_label).astype("float32") + feed_dict = graph_wrapper.to_feed(batch_graph) + feed_dict["label"] = batch_label + feed_dict["weight"] = batch_valid + loss, pred = exe.run(train_program, + feed=feed_dict, + fetch_list=[fetch_loss, fetch_pred]) + pred_output[batch_index] = pred + batch_no += 1 + print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output})) + + +def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx, + dataset, mode, evaluator, fetch_pred): + """Eval""" + graphs, labels = dataset[splitted_idx[mode]] + perm = np.arange(0, len(graphs)) + start_batch = 0 + batch_no = 0 + pred_output = np.zeros_like(labels, dtype="float32") + while start_batch < len(perm): + batch_index = perm[start_batch:start_batch + batch_size] + start_batch += batch_size + batch_graph = pgl.graph.MultiGraph(graphs[batch_index]) + feed_dict = graph_wrapper.to_feed(batch_graph) + pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred]) + pred_output[batch_index] = pred[0] + batch_no += 1 + print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output})) + + +def send_func(src_feat, dst_feat, edge_feat): + """Send""" + return src_feat["h"] + edge_feat["h"] + + +class GNNModel(object): + """GNNModel""" + + def __init__(self, name, emb_dim, num_task, num_layers): + self.num_task = num_task + self.emb_dim = emb_dim + self.num_layers = num_layers + self.name = name + self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim) + self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim) + + def forward(self, graph): + """foward""" + h_node = self.atom_encoder(graph.node_feat['feat']) + h_edge = self.bond_encoder(graph.edge_feat['feat']) + for layer in range(self.num_layers): + msg = graph.send( + send_func, + nfeat_list=[("h", h_node)], + efeat_list=[("h", h_edge)]) + h_node = graph.recv(msg, 'sum') + h_node + h_node = fluid.layers.fc(h_node, + size=self.emb_dim, + name=self.name + '_%s' % layer, + act="relu") + graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average") + graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final") + return graph_pred + + +def main(): + """main + """ + # Training settings + parser = argparse.ArgumentParser(description='Graph Dataset') + parser.add_argument( + '--epochs', + type=int, + default=100, + help='number of epochs to train (default: 100)') + parser.add_argument( + '--dataset', + type=str, + default="ogbg-mol-tox21", + help='dataset name (default: proteinfunc)') + args = parser.parse_args() + + place = fluid.CPUPlace() # Dataset too big to use GPU + + 
### automatic dataloading and splitting + dataset = PglGraphPropPredDataset(name=args.dataset) + splitted_idx = dataset.get_idx_split() + + ### automatic evaluator. takes dataset name as input + evaluator = Evaluator(args.dataset) + + graph_data, label = dataset[:2] + batch_graph = pgl.graph.MultiGraph(graph_data) + graph_data = batch_graph + + train_program = fluid.Program() + startup_program = fluid.Program() + test_program = fluid.Program() + # degree normalize + graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64") + graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64") + + model = GNNModel( + name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2) + + with fluid.program_guard(train_program, startup_program): + gw = pgl.graph_wrapper.GraphWrapper( + "graph", + place=place, + node_feat=graph_data.node_feat_info(), + edge_feat=graph_data.edge_feat_info()) + pred = model.forward(gw) + sigmoid_pred = fluid.layers.sigmoid(pred) + + val_program = train_program.clone(for_test=True) + + initializer = [] + with fluid.program_guard(train_program, startup_program): + train_label = fluid.layers.data( + name="label", dtype="float32", shape=[None, dataset.num_tasks]) + train_weight = fluid.layers.data( + name="weight", dtype="float32", shape=[None, dataset.num_tasks]) + train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits( + x=pred, label=train_label) * train_weight + train_loss_t = fluid.layers.reduce_sum(train_loss_t) + + adam = fluid.optimizer.Adam( + learning_rate=1e-2, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0005)) + adam.minimize(train_loss_t) + + exe = fluid.Executor(place) + exe.run(startup_program) + + for epoch in range(1, args.epochs + 1): + print("Epoch", epoch) + train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator, + train_loss_t, sigmoid_pred) + evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid", + evaluator, sigmoid_pred) + evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test", + evaluator, sigmoid_pred) + + +if __name__ == "__main__": + main() diff --git a/ogb_examples/linkproppred/main_pgl.py b/ogb_examples/linkproppred/main_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..0a0634b7a8ac6a86cc6e74b3bf7ce22fb3a0488a --- /dev/null +++ b/ogb_examples/linkproppred/main_pgl.py @@ -0,0 +1,272 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""test ogb +""" +import argparse +import time +import logging +import numpy as np + +import paddle.fluid as fluid + +import pgl +from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset +from pgl.utils import paddle_helper +from ogb.linkproppred import Evaluator + + +def send_func(src_feat, dst_feat, edge_feat): + """send_func""" + return src_feat["h"] + + +def recv_func(feat): + """recv_func""" + return fluid.layers.sequence_pool(feat, pool_type="sum") + + +class GNNModel(object): + """GNNModel""" + + def __init__(self, name, num_nodes, emb_dim, num_layers): + self.num_nodes = num_nodes + self.emb_dim = emb_dim + self.num_layers = num_layers + self.name = name + + self.src_nodes = fluid.layers.data( + name='src_nodes', + shape=[None], + dtype='int64', ) + + self.dst_nodes = fluid.layers.data( + name='dst_nodes', + shape=[None], + dtype='int64', ) + + self.edge_label = fluid.layers.data( + name='edge_label', + shape=[None, 1], + dtype='float32', ) + + def forward(self, graph): + """forward""" + h = fluid.layers.create_parameter( + shape=[self.num_nodes, self.emb_dim], + dtype="float32", + name=self.name + "_embedding") + + for layer in range(self.num_layers): + msg = graph.send( + send_func, + nfeat_list=[("h", h)], ) + h = graph.recv(msg, recv_func) + h = fluid.layers.fc( + h, + size=self.emb_dim, + bias_attr=False, + param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer)) + h = h * graph.node_feat["norm"] + bias = fluid.layers.create_parameter( + shape=[self.emb_dim], + dtype='float32', + is_bias=True, + name=self.name + '_bias_%s' % layer) + h = fluid.layers.elementwise_add(h, bias, act="relu") + + src = fluid.layers.gather(h, self.src_nodes, overwrite=False) + dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False) + edge_embed = src * dst + pred = fluid.layers.fc(input=edge_embed, + size=1, + name=self.name + "_pred_output") + + prob = fluid.layers.sigmoid(pred) + + loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, + self.edge_label) + loss = fluid.layers.reduce_mean(loss) + + return pred, prob, loss + + +def main(): + """main + """ + # Training settings + parser = argparse.ArgumentParser(description='Graph Dataset') + parser.add_argument( + '--epochs', + type=int, + default=4, + help='number of epochs to train (default: 100)') + parser.add_argument( + '--dataset', + type=str, + default="ogbl-ppa", + help='dataset name (default: protein protein associations)') + parser.add_argument('--use_cuda', action='store_true') + parser.add_argument('--batch_size', type=int, default=5120) + parser.add_argument('--embed_dim', type=int, default=64) + parser.add_argument('--num_layers', type=int, default=2) + parser.add_argument('--lr', type=float, default=0.001) + args = parser.parse_args() + print(args) + + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + + ### automatic dataloading and splitting + print("loadding dataset") + dataset = PglLinkPropPredDataset(name=args.dataset) + splitted_edge = dataset.get_edge_split() + print(splitted_edge['train_edge'].shape) + print(splitted_edge['train_edge_label'].shape) + + print("building evaluator") + ### automatic evaluator. 
takes dataset name as input + evaluator = Evaluator(args.dataset) + + graph_data = dataset[0] + print("num_nodes: %d" % graph_data.num_nodes) + + train_program = fluid.Program() + startup_program = fluid.Program() + + # degree normalize + indegree = graph_data.indegree() + norm = np.zeros_like(indegree, dtype="float32") + norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5) + graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32") + # graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1) + + with fluid.program_guard(train_program, startup_program): + model = GNNModel( + name="gnn", + num_nodes=graph_data.num_nodes, + emb_dim=args.embed_dim, + num_layers=args.num_layers) + gw = pgl.graph_wrapper.GraphWrapper( + "graph", + place, + node_feat=graph_data.node_feat_info(), + edge_feat=graph_data.edge_feat_info()) + pred, prob, loss = model.forward(gw) + + val_program = train_program.clone(for_test=True) + + with fluid.program_guard(train_program, startup_program): + global_steps = int(splitted_edge['train_edge'].shape[0] / + args.batch_size * 2) + learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps, + 0.00005) + + adam = fluid.optimizer.Adam( + learning_rate=learning_rate, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0005)) + adam.minimize(loss) + + exe = fluid.Executor(place) + exe.run(startup_program) + feed = gw.to_feed(graph_data) + + print("evaluate result before training: ") + result = test(exe, val_program, prob, evaluator, feed, splitted_edge) + print(result) + + print("training") + cc = 0 + for epoch in range(1, args.epochs + 1): + for batch_data, batch_label in data_generator( + graph_data, + splitted_edge["train_edge"], + splitted_edge["train_edge_label"], + batch_size=args.batch_size): + feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1) + feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1) + feed['edge_label'] = batch_label.astype("float32") + + res_loss, y_pred, b_lr = exe.run( + train_program, + feed=feed, + fetch_list=[loss, prob, learning_rate]) + if cc % 1 == 0: + print("epoch %d | step %d | lr %s | Loss %s" % + (epoch, cc, b_lr[0], res_loss[0])) + cc += 1 + + if cc % 20 == 0: + print("Evaluating...") + result = test(exe, val_program, prob, evaluator, feed, + splitted_edge) + print("epoch %d | step %d" % (epoch, cc)) + print(result) + + +def test(exe, val_program, prob, evaluator, feed, splitted_edge): + """Evaluation""" + result = {} + feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1) + feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1) + feed['edge_label'] = splitted_edge["valid_edge_label"].astype( + "float32").reshape(-1, 1) + y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] + input_dict = { + "y_true": splitted_edge["valid_edge_label"], + "y_pred": y_pred.reshape(-1, ), + } + result["valid"] = evaluator.eval(input_dict) + + feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1) + feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1) + feed['edge_label'] = splitted_edge["test_edge_label"].astype( + "float32").reshape(-1, 1) + y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] + input_dict = { + "y_true": splitted_edge["test_edge_label"], + "y_pred": y_pred.reshape(-1, ), + } + result["test"] = evaluator.eval(input_dict) + return result + + +def data_generator(graph, data, label_data, batch_size, shuffle=True): + """Data Generator""" + perm = 
np.arange(0, len(data)) + if shuffle: + np.random.shuffle(perm) + + offset = 0 + while offset < len(perm): + batch_index = perm[offset:(offset + batch_size)] + offset += batch_size + pos_data = data[batch_index] + pos_label = label_data[batch_index] + + neg_src_node = pos_data[:, 0] + neg_dst_node = np.random.choice( + pos_data.reshape(-1, ), size=len(neg_src_node)) + neg_data = np.hstack( + [neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)]) + exists = graph.has_edges_between(neg_src_node, neg_dst_node) + neg_data = neg_data[np.invert(exists)] + neg_label = np.zeros(shape=len(neg_data), dtype=np.int64) + + batch_data = np.vstack([pos_data, neg_data]) + label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)]) + yield batch_data, label + + +if __name__ == "__main__": + main() diff --git a/ogb_examples/nodeproppred/main_pgl.py b/ogb_examples/nodeproppred/main_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..8e787060b2c504f4eabd406d7d826281504ee047 --- /dev/null +++ b/ogb_examples/nodeproppred/main_pgl.py @@ -0,0 +1,176 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""test ogb +""" +import argparse + +import pgl +import numpy as np +import paddle.fluid as fluid +from pgl.contrib.ogb.nodeproppred.dataset_pgl import PglNodePropPredDataset +from pgl.utils import paddle_helper +from ogb.nodeproppred import Evaluator + + +def train(): + pass + + +def send_func(src_feat, dst_feat, edge_feat): + return (src_feat["h"] + edge_feat["h"]) * src_feat["norm"] + + +class GNNModel(object): + def __init__(self, name, emb_dim, num_task, num_layers): + self.num_task = num_task + self.emb_dim = emb_dim + self.num_layers = num_layers + self.name = name + + def forward(self, graph): + h = fluid.layers.embedding( + graph.node_feat["x"], + size=(2, self.emb_dim)) # name=self.name + "_embedding") + edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim) + for layer in range(self.num_layers): + msg = graph.send( + send_func, + nfeat_list=[("h", h), ("norm", graph.node_feat["norm"])], + efeat_list=[("h", edge_attr)]) + h = graph.recv(msg, "sum") + h = fluid.layers.fc( + h, + size=self.emb_dim, + bias_attr=False, + param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer)) + h = h * graph.node_feat["norm"] + bias = fluid.layers.create_parameter( + shape=[self.emb_dim], + dtype='float32', + is_bias=True, + name=self.name + '_bias_%s' % layer) + h = fluid.layers.elementwise_add(h, bias, act="relu") + pred = fluid.layers.fc(h, + self.num_task, + act=None, + name=self.name + "_pred_output") + return pred + + +def main(): + """main + """ + # Training settings + parser = argparse.ArgumentParser(description='Graph Dataset') + parser.add_argument( + '--epochs', + type=int, + default=100, + help='number of epochs to train (default: 100)') + parser.add_argument( + '--dataset', + type=str, + default="ogbn-proteins", + help='dataset name (default: proteinfunc)') + args = parser.parse_args() + + 
#device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu") + #place = fluid.CUDAPlace(0) + place = fluid.CPUPlace() # Dataset too big to use GPU + + ### automatic dataloading and splitting + dataset = PglNodePropPredDataset(name=args.dataset) + splitted_idx = dataset.get_idx_split() + + ### automatic evaluator. takes dataset name as input + evaluator = Evaluator(args.dataset) + + graph_data, label = dataset[0] + + train_program = fluid.Program() + startup_program = fluid.Program() + test_program = fluid.Program() + # degree normalize + indegree = graph_data.indegree() + norm = np.zeros_like(indegree, dtype="float32") + norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5) + graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32") + graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64") + graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype( + "float32") + model = GNNModel( + name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2) + + with fluid.program_guard(train_program, startup_program): + gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place) + pred = model.forward(gw) + sigmoid_pred = fluid.layers.sigmoid(pred) + + val_program = train_program.clone(for_test=True) + + initializer = [] + with fluid.program_guard(train_program, startup_program): + train_node_index, init = paddle_helper.constant( + "train_node_index", dtype="int64", value=splitted_idx["train"]) + initializer.append(init) + + train_node_label, init = paddle_helper.constant( + "train_node_label", + dtype="float32", + value=label[splitted_idx["train"]].astype("float32")) + initializer.append(init) + train_pred_t = fluid.layers.gather(pred, train_node_index) + train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits( + x=train_pred_t, label=train_node_label) + train_loss_t = fluid.layers.reduce_sum(train_loss_t) + train_pred_t = fluid.layers.sigmoid(train_pred_t) + + adam = fluid.optimizer.Adam( + learning_rate=1e-2, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0005)) + adam.minimize(train_loss_t) + + exe = fluid.Executor(place) + exe.run(startup_program) + gw.initialize(place) + for init in initializer: + init(place) + + for epoch in range(1, args.epochs + 1): + loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t]) + print("Loss %s" % loss[0]) + print("Evaluating...") + y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0] + result = {} + input_dict = { + "y_true": label[splitted_idx["train"]], + "y_pred": y_pred[splitted_idx["train"]] + } + result["train"] = evaluator.eval(input_dict) + input_dict = { + "y_true": label[splitted_idx["valid"]], + "y_pred": y_pred[splitted_idx["valid"]] + } + result["valid"] = evaluator.eval(input_dict) + input_dict = { + "y_true": label[splitted_idx["test"]], + "y_pred": y_pred[splitted_idx["test"]] + } + result["test"] = evaluator.eval(input_dict) + print(result) + + +if __name__ == "__main__": + main() diff --git a/pgl/__init__.py b/pgl/__init__.py index c7f323d9f9020b657b7d1b1c2c1251107448fff8..535632ea7afb3a157a4d92bbb08cb6445b21a125 100644 --- a/pgl/__init__.py +++ b/pgl/__init__.py @@ -13,9 +13,11 @@ # limitations under the License. 
"""Generate pgl apis """ -__version__ = "1.0.1" +__version__ = "1.0.2" from pgl import layers from pgl import graph_wrapper from pgl import graph from pgl import data_loader +from pgl import heter_graph +from pgl import heter_graph_wrapper from pgl import contrib diff --git a/pgl/contrib/__init__.py b/pgl/contrib/__init__.py index 5dcd918aebbc3ce1cc9ce77e3e742985d223c39f..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/pgl/contrib/__init__.py +++ b/pgl/contrib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,8 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Generate Contrib api -""" - -from pgl.contrib import heter_graph -from pgl.contrib import heter_graph_wrapper diff --git a/pgl/contrib/heter_graph.py b/pgl/contrib/heter_graph.py deleted file mode 100644 index 117baed86bd82caaf9a278e1b6c987e9064f8ec9..0000000000000000000000000000000000000000 --- a/pgl/contrib/heter_graph.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - This package implement Heterogeneous Graph structure for handling Heterogeneous graph data. -""" -import time -import numpy as np -import pickle as pkl -import time -import pgl.graph_kernel as graph_kernel -from pgl.graph import Graph - -__all__ = ['HeterGraph'] - - -def _hide_num_nodes(shape): - """Set the first dimension as unknown - """ - shape = list(shape) - shape[0] = None - return shape - - -class NodeGraph(Graph): - """Implementation of a graph that has multple node types. - - Args: - num_nodes: number of nodes in the graph - edges: list of (u, v) tuples - node_types (optional): list of (u, node_type) tuples to specify the node type of every node - node_feat (optional): a dict of numpy array as node features - edge_feat (optional): a dict of numpy array as edge features - """ - - def __init__(self, - num_nodes, - edges, - node_types=None, - node_feat=None, - edge_feat=None): - super(NodeGraph, self).__init__(num_nodes, edges, node_feat, edge_feat) - - if isinstance(node_types, list): - self._node_types = np.array(node_types, dtype=object)[:, 1] - else: - self._node_types = node_types - - -class HeterGraph(object): - """Implementation of heterogeneous graph structure in pgl - - This is a simple implementation of heterogeneous graph structure in pgl. - - Args: - num_nodes: number of nodes in a heterogeneous graph - edges: dict, every element in dict is a list of (u, v) tuples. 
- node_types (optional): list of (u, node_type) tuples to specify the node type of every node - node_feat (optional): a dict of numpy array as node features - edge_feat (optional): a dict of dict as edge features for every edge type - - Examples: - .. code-block:: python - - import numpy as np - num_nodes = 4 - node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')] - edges = { - 'edges_type1': [(0,1), (3,2)], - 'edges_type2': [(1,2), (3,1)], - } - node_feat = {'feature': np.random.randn(4, 16)} - edges_feat = { - 'edges_type1': {'h': np.random.randn(2, 16)}, - 'edges_type2': {'h': np.random.randn(2, 16)}, - } - - g = heter_graph.HeterGraph( - num_nodes=num_nodes, - edges=edges, - node_types=node_types, - node_feat=node_feat, - edge_feat=edges_feat) - """ - - def __init__(self, - num_nodes, - edges, - node_types=None, - node_feat=None, - edge_feat=None): - self._num_nodes = num_nodes - self._edges_dict = edges - - if node_feat is not None: - self._node_feat = node_feat - else: - self._node_feat = {} - - if edge_feat is not None: - self._edge_feat = edge_feat - else: - self._edge_feat = {} - - self._multi_graph = {} - for key, value in self._edges_dict.items(): - if not self._edge_feat: - edge_feat = None - else: - edge_feat = self._edge_feat[key] - - self._multi_graph[key] = NodeGraph( - num_nodes=self._num_nodes, - edges=value, - node_types=node_types, - node_feat=self._node_feat, - edge_feat=edge_feat) - - @property - def num_nodes(self): - """Return the number of nodes. - """ - return self._num_nodes - - def __getitem__(self, edge_type): - """__getitem__ - """ - return self._multi_graph[edge_type] - - def node_feat_info(self): - """Return the information of node feature for HeterGraphWrapper. - - This function return the information of node features of all node types. And this - function is used to help constructing HeterGraphWrapper - - Return: - A list of tuple (name, shape, dtype) for all given node feature. - - """ - node_feat_info = [] - for feat_name, feat in self._node_feat.items(): - node_feat_info.append( - (feat_name, _hide_num_nodes(feat.shape), feat.dtype)) - - return node_feat_info - - def edge_feat_info(self): - """Return the information of edge feature for HeterGraphWrapper. - - This function return the information of edge features of all edge types. And this - function is used to help constructing HeterGraphWrapper - - Return: - A dict of list of tuple (name, shape, dtype) for all given edge feature. - - """ - edge_feat_info = {} - for edge_type_name, feat_dict in self._edge_feat.items(): - tmp_edge_feat_info = [] - for feat_name, feat in feat_dict.items(): - full_name = feat_name - tmp_edge_feat_info.append( - (full_name, _hide_num_nodes(feat.shape), feat.dtype)) - edge_feat_info[edge_type_name] = tmp_edge_feat_info - return edge_feat_info - - def edge_types_info(self): - """Return the information of all edge types. - - Return: - A list of all edge types. - - """ - edge_types_info = [] - for key, _ in self._edges_dict.items(): - edge_types_info.append(key) - - return edge_types_info diff --git a/pgl/contrib/ogb/__init__.py b/pgl/contrib/ogb/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/pgl/contrib/ogb/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/pgl/contrib/ogb/graphproppred/__init__.py b/pgl/contrib/ogb/graphproppred/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f843d353364abdb7bb8068f3a07e86796e1cc98a --- /dev/null +++ b/pgl/contrib/ogb/graphproppred/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""__init__.py""" diff --git a/pgl/contrib/ogb/graphproppred/dataset_pgl.py b/pgl/contrib/ogb/graphproppred/dataset_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..bc5670adc1aaa8a4932fb63852c60ab882c04f7c --- /dev/null +++ b/pgl/contrib/ogb/graphproppred/dataset_pgl.py @@ -0,0 +1,152 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
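+# Illustrative usage sketch, similar to the __main__ block at the bottom of
+# this file (the dataset name must be a key of ogb's master table):
+#
+#     dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
+#     split_idx = dataset.get_idx_split()
+#     graph, label = dataset[0]  # a pgl.graph.Graph and its label row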
+"""PglGraphPropPredDataset +""" +import pandas as pd +import shutil, os +import os.path as osp +import numpy as np +from ogb.utils.url import decide_download, download_url, extract_zip +from ogb.graphproppred import make_master_file +from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl + + +def to_bool(value): + """to_bool""" + return np.array([value], dtype="bool")[0] + + +class PglGraphPropPredDataset(object): + """PglGraphPropPredDataset""" + + def __init__(self, name, root="dataset"): + self.name = name ## original name, e.g., ogbg-mol-tox21 + self.dir_name = "_".join( + name.split("-") + ) + "_pgl" ## replace hyphen with underline, e.g., ogbg_mol_tox21_dgl + + self.original_root = root + self.root = osp.join(root, self.dir_name) + + self.meta_info = make_master_file.df #pd.read_csv( + #os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0) + if not self.name in self.meta_info: + print(self.name) + error_mssg = "Invalid dataset name {}.\n".format(self.name) + error_mssg += "Available datasets are as follows:\n" + error_mssg += "\n".join(self.meta_info.keys()) + raise ValueError(error_mssg) + + self.download_name = self.meta_info[self.name][ + "download_name"] ## name of downloaded file, e.g., tox21 + + self.num_tasks = int(self.meta_info[self.name]["num tasks"]) + self.task_type = self.meta_info[self.name]["task type"] + + super(PglGraphPropPredDataset, self).__init__() + + self.pre_process() + + def pre_process(self): + """Pre-processing""" + processed_dir = osp.join(self.root, 'processed') + raw_dir = osp.join(self.root, 'raw') + pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed') + + if os.path.exists(pre_processed_file_path): + # TODO: Load Preprocessed + pass + else: + ### download + url = self.meta_info[self.name]["url"] + if decide_download(url): + path = download_url(url, self.original_root) + extract_zip(path, self.original_root) + os.unlink(path) + # delete folder if there exists + try: + shutil.rmtree(self.root) + except: + pass + shutil.move( + osp.join(self.original_root, self.download_name), + self.root) + else: + print("Stop download.") + exit(-1) + + ### preprocess + add_inverse_edge = to_bool(self.meta_info[self.name][ + "add_inverse_edge"]) + self.graphs = read_csv_graph_pgl( + raw_dir, add_inverse_edge=add_inverse_edge) + self.graphs = np.array(self.graphs) + self.labels = np.array( + pd.read_csv( + osp.join(raw_dir, "graph-label.csv.gz"), + compression="gzip", + header=None).values) + + # TODO: Load Graph + ### load preprocessed files + + def get_idx_split(self): + """Train/Valid/Test split""" + split_type = self.meta_info[self.name]["split"] + path = osp.join(self.root, "split", split_type) + + train_idx = pd.read_csv( + osp.join(path, "train.csv.gz"), compression="gzip", + header=None).values.T[0] + valid_idx = pd.read_csv( + osp.join(path, "valid.csv.gz"), compression="gzip", + header=None).values.T[0] + test_idx = pd.read_csv( + osp.join(path, "test.csv.gz"), compression="gzip", + header=None).values.T[0] + + return { + "train": np.array( + train_idx, dtype="int64"), + "valid": np.array( + valid_idx, dtype="int64"), + "test": np.array( + test_idx, dtype="int64") + } + + def __getitem__(self, idx): + """Get datapoint with index""" + return self.graphs[idx], self.labels[idx] + + def __len__(self): + """Length of the dataset + Returns + ------- + int + Length of Dataset + """ + return len(self.graphs) + + def __repr__(self): # pragma: no cover + return '{}({})'.format(self.__class__.__name__, len(self)) + + +if __name__ 
== "__main__": + pgl_dataset = PglGraphPropPredDataset(name="ogbg-mol-bace") + splitted_index = pgl_dataset.get_idx_split() + print(pgl_dataset) + print(pgl_dataset[3:20]) + #print(pgl_dataset[splitted_index["train"]]) + #print(pgl_dataset[splitted_index["valid"]]) + #print(pgl_dataset[splitted_index["test"]]) diff --git a/pgl/contrib/ogb/graphproppred/mol_encoder.py b/pgl/contrib/ogb/graphproppred/mol_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2662d141532dc58925f30e0973d5d85bb4953bd3 --- /dev/null +++ b/pgl/contrib/ogb/graphproppred/mol_encoder.py @@ -0,0 +1,71 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MolEncoder for ogb +""" +import paddle.fluid as fluid +from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims + + +class AtomEncoder(object): + """AtomEncoder for encoding node features""" + + def __init__(self, name, emb_dim): + self.emb_dim = emb_dim + self.name = name + + def __call__(self, x): + atom_feature = get_atom_feature_dims() + atom_input = fluid.layers.split( + x, num_or_sections=len(atom_feature), dim=-1) + outputs = None + count = 0 + for _x, _atom_input_dim in zip(atom_input, atom_feature): + count += 1 + emb = fluid.layers.embedding( + _x, + size=(_atom_input_dim, self.emb_dim), + param_attr=fluid.ParamAttr( + name=self.name + '_atom_feat_%s' % count)) + if outputs is None: + outputs = emb + else: + outputs = outputs + emb + return outputs + + +class BondEncoder(object): + """Bond for encoding edge features""" + + def __init__(self, name, emb_dim): + self.emb_dim = emb_dim + self.name = name + + def __call__(self, x): + bond_feature = get_bond_feature_dims() + bond_input = fluid.layers.split( + x, num_or_sections=len(bond_feature), dim=-1) + outputs = None + count = 0 + for _x, _bond_input_dim in zip(bond_input, bond_feature): + count += 1 + emb = fluid.layers.embedding( + _x, + size=(_bond_input_dim, self.emb_dim), + param_attr=fluid.ParamAttr( + name=self.name + '_bond_feat_%s' % count)) + if outputs is None: + outputs = emb + else: + outputs = outputs + emb + return outputs diff --git a/pgl/contrib/ogb/io/__init__.py b/pgl/contrib/ogb/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e466b247da1bf6da8ba06195f1c65bcc1789a17 --- /dev/null +++ b/pgl/contrib/ogb/io/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""__init__.py +""" diff --git a/pgl/contrib/ogb/io/read_graph_pgl.py b/pgl/contrib/ogb/io/read_graph_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..b60a091a7fd52842d1545c2f02fdad4aa0b50f3c --- /dev/null +++ b/pgl/contrib/ogb/io/read_graph_pgl.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""pgl read_csv_graph for ogb +""" + +import pandas as pd +import os.path as osp +import numpy as np +import pgl +from ogb.io.read_graph_raw import read_csv_graph_raw + + +def read_csv_graph_pgl(raw_dir, add_inverse_edge=False): + """Read CSV data and build PGL Graph + """ + graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge) + pgl_graph_list = [] + + for graph in graph_list: + edges = list(zip(graph["edge_index"][0], graph["edge_index"][1])) + g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges) + + if graph["edge_feat"] is not None: + g.edge_feat["feat"] = graph["edge_feat"] + + if graph["node_feat"] is not None: + g.node_feat["feat"] = graph["node_feat"] + + pgl_graph_list.append(g) + + return pgl_graph_list + + +if __name__ == "__main__": + # graph_list = read_csv_graph_dgl('dataset/proteinfunc_v2/raw', add_inverse_edge = True) + graph_list = read_csv_graph_pgl( + 'dataset/ogbn_proteins_pgl/raw', add_inverse_edge=True) + print(graph_list) diff --git a/pgl/contrib/ogb/linkproppred/__init__.py b/pgl/contrib/ogb/linkproppred/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e466b247da1bf6da8ba06195f1c65bcc1789a17 --- /dev/null +++ b/pgl/contrib/ogb/linkproppred/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""__init__.py +""" diff --git a/pgl/contrib/ogb/linkproppred/dataset_pgl.py b/pgl/contrib/ogb/linkproppred/dataset_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..928f4a4f95d6df045089a5bee7b30e28eac95713 --- /dev/null +++ b/pgl/contrib/ogb/linkproppred/dataset_pgl.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LinkPropPredDataset for pgl
+"""
+import pandas as pd
+import shutil, os
+import os.path as osp
+import numpy as np
+from ogb.utils.url import decide_download, download_url, extract_zip
+from ogb.linkproppred import make_master_file
+from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl
+
+
+def to_bool(value):
+    """to_bool"""
+    return np.array([value], dtype="bool")[0]
+
+
+class PglLinkPropPredDataset(object):
+    """PglLinkPropPredDataset
+    """
+
+    def __init__(self, name, root="dataset"):
+        self.name = name  ## original name, e.g., ogbl-ppa
+        self.dir_name = "_".join(name.split(
+            "-")) + "_pgl"  ## replace hyphen with underscore, e.g., ogbl_ppa_pgl
+
+        self.original_root = root
+        self.root = osp.join(root, self.dir_name)
+
+        self.meta_info = make_master_file.df  #pd.read_csv(os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
+        if not self.name in self.meta_info:
+            print(self.name)
+            error_mssg = "Invalid dataset name {}.\n".format(self.name)
+            error_mssg += "Available datasets are as follows:\n"
+            error_mssg += "\n".join(self.meta_info.keys())
+            raise ValueError(error_mssg)
+
+        self.download_name = self.meta_info[self.name][
+            "download_name"]  ## name of downloaded file, e.g., ppassoc
+
+        self.task_type = self.meta_info[self.name]["task type"]
+
+        super(PglLinkPropPredDataset, self).__init__()
+
+        self.pre_process()
+
+    def pre_process(self):
+        """Download the raw data if needed and pre-process it.
+        """
+        processed_dir = osp.join(self.root, 'processed')
+        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')
+
+        if osp.exists(pre_processed_file_path):
+            #TODO: Reload Preprocess files
+            pass
+        else:
+            ### check download
+            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
+                url = self.meta_info[self.name]["url"]
+                if decide_download(url):
+                    path = download_url(url, self.original_root)
+                    extract_zip(path, self.original_root)
+                    os.unlink(path)
+                    # delete folder if it exists
+                    try:
+                        shutil.rmtree(self.root)
+                    except:
+                        pass
+                    shutil.move(
+                        osp.join(self.original_root, self.download_name),
+                        self.root)
+                else:
+                    print("Stop download.")
+                    exit(-1)
+
+            raw_dir = osp.join(self.root, "raw")
+
+            ### pre-process and save
+            add_inverse_edge = to_bool(self.meta_info[self.name][
+                "add_inverse_edge"])
+            self.graph = read_csv_graph_pgl(
+                raw_dir, add_inverse_edge=add_inverse_edge)
+
+            #TODO: SAVE preprocess graph
+
+    def get_edge_split(self):
+        """Train/Validation/Test split
+        """
+        split_type = self.meta_info[self.name]["split"]
+        path = osp.join(self.root, "split", split_type)
+
+        train_idx = pd.read_csv(
+            osp.join(path, "train.csv.gz"), compression="gzip",
+            header=None).values
+        valid_idx = pd.read_csv(
+            osp.join(path, "valid.csv.gz"), compression="gzip",
+            header=None).values
+        test_idx = pd.read_csv(
+            osp.join(path, "test.csv.gz"), compression="gzip",
+            header=None).values
+
+        if self.task_type == "link prediction":
+            target_type = np.int64
+        else:
+            target_type = np.float32
+
+        return {
+            "train_edge": np.array(
+                train_idx[:, :2], dtype="int64"),
+            "train_edge_label": np.array(
+                train_idx[:, 2], dtype=target_type),
+            "valid_edge": np.array(
+ valid_idx[:, :2], dtype="int64"), + "valid_edge_label": np.array( + valid_idx[:, 2], dtype=target_type), + "test_edge": np.array( + test_idx[:, :2], dtype="int64"), + "test_edge_label": np.array( + test_idx[:, 2], dtype=target_type) + } + + def __getitem__(self, idx): + assert idx == 0, "This dataset has only one graph" + return self.graph[0] + + def __len__(self): + return 1 + + def __repr__(self): # pragma: no cover + return '{}({})'.format(self.__class__.__name__, len(self)) + + +if __name__ == "__main__": + pgl_dataset = PglLinkPropPredDataset(name="ogbl-ppa") + splitted_edge = pgl_dataset.get_edge_split() + print(pgl_dataset[0]) + print(splitted_edge) diff --git a/pgl/contrib/ogb/nodeproppred/__init__.py b/pgl/contrib/ogb/nodeproppred/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e466b247da1bf6da8ba06195f1c65bcc1789a17 --- /dev/null +++ b/pgl/contrib/ogb/nodeproppred/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""__init__.py +""" diff --git a/pgl/contrib/ogb/nodeproppred/dataset_pgl.py b/pgl/contrib/ogb/nodeproppred/dataset_pgl.py new file mode 100644 index 0000000000000000000000000000000000000000..3331b3b788e0e2426fc0b5d9dcd768fabeb40610 --- /dev/null +++ b/pgl/contrib/ogb/nodeproppred/dataset_pgl.py @@ -0,0 +1,153 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
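+# Illustrative usage sketch, matching how ogb_examples/nodeproppred/main_pgl.py
+# consumes this loader:
+#
+#     dataset = PglNodePropPredDataset(name="ogbn-proteins")
+#     split_idx = dataset.get_idx_split()
+#     graph, label = dataset[0]  # one pgl.graph.Graph plus the label matrix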
+"""NodePropPredDataset for pgl +""" +import pandas as pd +import shutil, os +import os.path as osp +import numpy as np +from ogb.utils.url import decide_download, download_url, extract_zip +from ogb.nodeproppred import make_master_file # create master.csv +from pgl.contrib.ogb.io.read_graph_pgl import read_csv_graph_pgl + + +def to_bool(value): + """to_bool""" + return np.array([value], dtype="bool")[0] + + +class PglNodePropPredDataset(object): + """PglNodePropPredDataset + """ + + def __init__(self, name, root="dataset"): + self.name = name ## original name, e.g., ogbn-proteins + self.dir_name = "_".join( + name.split("-") + ) + "_pgl" ## replace hyphen with underline, e.g., ogbn_proteins_pgl + + self.original_root = root + self.root = osp.join(root, self.dir_name) + + self.meta_info = make_master_file.df #pd.read_csv( + #os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0) + if not self.name in self.meta_info: + error_mssg = "Invalid dataset name {}.\n".format(self.name) + error_mssg += "Available datasets are as follows:\n" + error_mssg += "\n".join(self.meta_info.keys()) + raise ValueError(error_mssg) + + self.download_name = self.meta_info[self.name][ + "download_name"] ## name of downloaded file, e.g., tox21 + + self.num_tasks = int(self.meta_info[self.name]["num tasks"]) + self.task_type = self.meta_info[self.name]["task type"] + + super(PglNodePropPredDataset, self).__init__() + + self.pre_process() + + def pre_process(self): + """pre_process downlaoding data + """ + processed_dir = osp.join(self.root, 'processed') + pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed') + + if osp.exists(pre_processed_file_path): + # TODO: Reload Preprocess files + pass + else: + ### check download + if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")): + url = self.meta_info[self.name]["url"] + if decide_download(url): + path = download_url(url, self.original_root) + extract_zip(path, self.original_root) + os.unlink(path) + # delete folder if there exists + try: + shutil.rmtree(self.root) + except: + pass + shutil.move( + osp.join(self.original_root, self.download_name), + self.root) + else: + print("Stop download.") + exit(-1) + + raw_dir = osp.join(self.root, "raw") + + ### pre-process and save + add_inverse_edge = to_bool(self.meta_info[self.name][ + "add_inverse_edge"]) + self.graph = read_csv_graph_pgl( + raw_dir, add_inverse_edge=add_inverse_edge) + + ### adding prediction target + node_label = pd.read_csv( + osp.join(raw_dir, 'node-label.csv.gz'), + compression="gzip", + header=None).values + if "classification" in self.task_type: + node_label = np.array(node_label, dtype=np.int64) + else: + node_label = np.array(node_label, dtype=np.float32) + + label_dict = {"labels": node_label} + + # TODO: SAVE preprocess graph + self.labels = label_dict['labels'] + + def get_idx_split(self): + """Train/Validation/Test split + """ + split_type = self.meta_info[self.name]["split"] + path = osp.join(self.root, "split", split_type) + + train_idx = pd.read_csv( + osp.join(path, "train.csv.gz"), compression="gzip", + header=None).values.T[0] + valid_idx = pd.read_csv( + osp.join(path, "valid.csv.gz"), compression="gzip", + header=None).values.T[0] + test_idx = pd.read_csv( + osp.join(path, "test.csv.gz"), compression="gzip", + header=None).values.T[0] + + return { + "train": np.array( + train_idx, dtype="int64"), + "valid": np.array( + valid_idx, dtype="int64"), + "test": np.array( + test_idx, dtype="int64") + } + + def __getitem__(self, idx): + assert idx == 0, 
"This dataset has only one graph" + return self.graph[idx], self.labels + + def __len__(self): + return 1 + + def __repr__(self): # pragma: no cover + return '{}({})'.format(self.__class__.__name__, len(self)) + + +if __name__ == "__main__": + pgl_dataset = PglNodePropPredDataset(name="ogbn-proteins") + splitted_index = pgl_dataset.get_idx_split() + print(pgl_dataset[0]) + print(splitted_index) diff --git a/pgl/graph.py b/pgl/graph.py index 8af2038f6378bef688f3530757bef3bd2a11b0c2..30a147566821f3a2b76e2ae5976cfce5b0782e0c 100644 --- a/pgl/graph.py +++ b/pgl/graph.py @@ -15,12 +15,14 @@ This package implement Graph structure for handling graph data. """ +import os import numpy as np import pickle as pkl import time import pgl.graph_kernel as graph_kernel +from collections import defaultdict -__all__ = ['Graph', 'SubGraph'] +__all__ = ['Graph', 'SubGraph', 'MultiGraph'] def _hide_num_nodes(shape): @@ -43,8 +45,8 @@ class EdgeIndex(object): """ def __init__(self, u, v, num_nodes): - self._v, self._eid, self._degree, self._sorted_u,\ - self._sorted_v, self._sorted_eid = graph_kernel.build_index(u, v, num_nodes) + self._degree, self._sorted_v, self._sorted_u, \ + self._sorted_eid, self._indptr = graph_kernel.build_index(u, v, num_nodes) @property def degree(self): @@ -52,23 +54,40 @@ class EdgeIndex(object): """ return self._degree - @property - def v(self): - """Return the compressed v. + def view_v(self, u=None): + """Return the compressed v for given u. """ - return self._v + if u is None: + return np.split(self._sorted_v, self._indptr[1:]) + else: + u = np.array(u, dtype="int64") + return graph_kernel.slice_by_index( + self._sorted_v, self._indptr, index=u) - @property - def eid(self): - """Return the edge id. + def view_eid(self, u=None): + """Return the compressed edge id for given u. """ - return self._eid + if u is None: + return np.split(self._sorted_eid, self._indptr[1:]) + else: + u = np.array(u, dtype="int64") + return graph_kernel.slice_by_index( + self._sorted_eid, self._indptr, index=u) def triples(self): """Return the sorted (u, v, eid) tuples. """ return self._sorted_u, self._sorted_v, self._sorted_eid + def dump(self, path): + if not os.path.exists(path): + os.makedirs(path) + np.save(os.path.join(path, 'degree.npy'), self._degree) + np.save(os.path.join(path, 'sorted_u.npy'), self._sorted_u) + np.save(os.path.join(path, 'sorted_v.npy'), self._sorted_v) + np.save(os.path.join(path, 'sorted_eid.npy'), self._sorted_eid) + np.save(os.path.join(path, 'indptr.npy'), self._indptr) + class Graph(object): """Implementation of graph structure in pgl. @@ -122,21 +141,51 @@ class Graph(object): self._edges = edges self._num_nodes = num_nodes - if len(edges) == 0: - raise ValueError("The Graph have no edges.") - self._adj_src_index = None self._adj_dst_index = None + self.indegree() + self._num_graph = 1 + self._graph_lod = np.array([0, self.num_nodes], dtype="int32") + + def dump(self, path): + if not os.path.exists(path): + os.makedirs(path) + np.save(os.path.join(path, 'num_nodes.npy'), self._num_nodes) + np.save(os.path.join(path, 'edges.npy'), self._edges) + + if self._adj_src_index: + self._adj_src_index.dump(os.path.join(path, 'adj_src')) + + if self._adj_dst_index: + self._adj_dst_index.dump(os.path.join(path, 'adj_dst')) + + def dump_feat(feat_path, feat): + """Dump all features to .npy file. 
+ """ + if len(feat) == 0: + return + if not os.path.exists(feat_path): + os.makedirs(feat_path) + for key in feat: + np.save(os.path.join(feat_path, key + ".npy"), feat[key]) + + dump_feat(os.path.join(path, "node_feat"), self.node_feat) + dump_feat(os.path.join(path, "edge_feat"), self.edge_feat) @property def adj_src_index(self): """Return an EdgeIndex object for src. """ if self._adj_src_index is None: + if len(self._edges) == 0: + u = np.array([], dtype="int64") + v = np.array([], dtype="int64") + else: + u = self._edges[:, 0] + v = self._edges[:, 1] + self._adj_src_index = EdgeIndex( - u=self._edges[:, 0], - v=self._edges[:, 1], - num_nodes=self._num_nodes) + u=u, v=v, num_nodes=self._num_nodes) return self._adj_src_index @property @@ -144,10 +193,15 @@ class Graph(object): """Return an EdgeIndex object for dst. """ if self._adj_dst_index is None: + if len(self._edges) == 0: + v = np.array([], dtype="int64") + u = np.array([], dtype="int64") + else: + v = self._edges[:, 0] + u = self._edges[:, 1] + self._adj_dst_index = EdgeIndex( - u=self._edges[:, 1], - v=self._edges[:, 0], - num_nodes=self._num_nodes) + u=u, v=v, num_nodes=self._num_nodes) return self._adj_dst_index @property @@ -287,17 +341,11 @@ class Graph(object): []] """ - if nodes is None: - if return_eids: - return self.adj_src_index.v, self.adj_src_index.eid - else: - return self.adj_src_index.v + if return_eids: + return self.adj_src_index.view_v( + nodes), self.adj_src_index.view_eid(nodes) else: - if return_eids: - return self.adj_src_index.v[nodes], self.adj_src_index.eid[ - nodes] - else: - return self.adj_src_index.v[nodes] + return self.adj_src_index.view_v(nodes) def sample_successor(self, nodes, @@ -385,17 +433,11 @@ class Graph(object): [2]] """ - if nodes is None: - if return_eids: - return self.adj_dst_index.v, self.adj_dst_index.eid - else: - return self.adj_dst_index.v + if return_eids: + return self.adj_dst_index.view_v( + nodes), self.adj_dst_index.view_eid(nodes) else: - if return_eids: - return self.adj_dst_index.v[nodes], self.adj_dst_index.eid[ - nodes] - else: - return self.adj_dst_index.v[nodes] + return self.adj_dst_index.view_v(nodes) def sample_predecessor(self, nodes, @@ -510,7 +552,13 @@ class Graph(object): (key, _hide_num_nodes(value.shape), value.dtype)) return edge_feat_info - def subgraph(self, nodes, eid=None, edges=None): + def subgraph(self, + nodes, + eid=None, + edges=None, + edge_feats=None, + with_node_feat=True, + with_edge_feat=True): """Generate subgraph with nodes and edge ids. This function will generate a :code:`pgl.graph.Subgraph` object and @@ -525,6 +573,10 @@ class Graph(object): eid (optional): Edge ids which will be included in the subgraph. edges (optional): Edge(src, dst) list which will be included in the subgraph. + + with_node_feat: Whether to inherit node features from parent graph. + + with_edge_feat: Whether to inherit edge features from parent graph. Return: A :code:`pgl.graph.Subgraph` object. 
@@ -547,14 +599,20 @@ class Graph(object): len(edges), dtype="int64"), edges, reindex) sub_edge_feat = {} - for key, value in self._edge_feat.items(): - if eid is None: - raise ValueError("Eid can not be None with edge features.") - sub_edge_feat[key] = value[eid] + if edges is None: + if with_edge_feat: + for key, value in self._edge_feat.items(): + if eid is None: + raise ValueError( + "Eid can not be None with edge features.") + sub_edge_feat[key] = value[eid] + else: + sub_edge_feat = edge_feats sub_node_feat = {} - for key, value in self._node_feat.items(): - sub_node_feat[key] = value[nodes] + if with_node_feat: + for key, value in self._node_feat.items(): + sub_node_feat[key] = value[nodes] subgraph = SubGraph( num_nodes=len(nodes), @@ -730,6 +788,16 @@ class Graph(object): cur_nodes = nxt_nodes return walk + @property + def num_graph(self): + """ Return Number of Graphs""" + return self._num_graph + + @property + def graph_lod(self): + """ Return Graph Lod Index for Paddle Computation""" + return self._graph_lod + class SubGraph(Graph): """Implementation of SubGraph in pgl. @@ -783,3 +851,120 @@ class SubGraph(Graph): A list of node ids in parent graph. """ return graph_kernel.map_nodes(nodes, self._to_reindex) + + +class MultiGraph(Graph): + """Implementation of multiple disjoint graph structure in pgl. + + This is a simple implementation of graph structure in pgl. + + Args: + graph_list : A list of Graph Instances + + Examples: + + .. code-block:: python + + batch_graph = MultiGraph([graph1, graph2, graph3]) + + """ + + def __init__(self, graph_list): + num_nodes = np.sum([g.num_nodes for g in graph_list]) + node_feat = self._join_node_feature(graph_list) + edge_feat = self._join_edge_feature(graph_list) + edges = self._join_edges(graph_list) + super(MultiGraph, self).__init__( + num_nodes=num_nodes, + edges=edges, + node_feat=node_feat, + edge_feat=edge_feat) + self._num_graph = len(graph_list) + self._src_graph = graph_list + graph_lod = [g.num_nodes for g in graph_list] + graph_lod = np.cumsum(graph_lod, dtype="int32") + graph_lod = np.insert(graph_lod, 0, 0) + self._graph_lod = graph_lod + + def __getitem__(self, index): + return self._src_graph[index] + + def _join_node_feature(self, graph_list): + """join node features for multiple graph""" + node_feat = defaultdict(lambda: []) + for graph in graph_list: + for key in graph.node_feat: + node_feat[key].append(graph.node_feat[key]) + ret_node_feat = {} + for key in node_feat: + ret_node_feat[key] = np.vstack(node_feat[key]) + return ret_node_feat + + def _join_edge_feature(self, graph_list): + """join edge features for multiple graph""" + edge_feat = defaultdict(lambda: []) + for graph in graph_list: + for key in graph.edge_feat: + efeat = graph.edge_feat[key] + if len(efeat) > 0: + edge_feat[key].append(efeat) + + ret_edge_feat = {} + for key in edge_feat: + ret_edge_feat[key] = np.vstack(edge_feat[key]) + return ret_edge_feat + + def _join_edges(self, graph_list): + """join edges for multiple graph""" + list_edges = [] + start_offset = 0 + for graph in graph_list: + edges = graph.edges + if len(edges) > 0: + edges = edges + start_offset + list_edges.append(edges) + start_offset += graph.num_nodes + edges = np.vstack(list_edges) + return edges + + +class MemmapEdgeIndex(EdgeIndex): + def __init__(self, path): + self._degree = np.load(os.path.join(path, 'degree.npy'), mmap_mode="r") + self._sorted_u = np.load( + os.path.join(path, 'sorted_u.npy'), mmap_mode="r") + self._sorted_v = np.load( + os.path.join(path, 'sorted_v.npy'), 
mmap_mode="r") + self._sorted_eid = np.load( + os.path.join(path, 'sorted_eid.npy'), mmap_mode="r") + self._indptr = np.load(os.path.join(path, 'indptr.npy'), mmap_mode="r") + + +class MemmapGraph(Graph): + def __init__(self, path): + self._num_nodes = np.load(os.path.join(path, 'num_nodes.npy')) + self._edges = np.load(os.path.join(path, 'edges.npy'), mmap_mode="r") + if os.path.isdir(os.path.join(path, 'adj_src')): + self._adj_src_index = MemmapEdgeIndex( + os.path.join(path, 'adj_src')) + else: + self._adj_src_index = None + + if os.path.isdir(os.path.join(path, 'adj_dst')): + self._adj_dst_index = MemmapEdgeIndex( + os.path.join(path, 'adj_dst')) + else: + self._adj_dst_index = None + + def load_feat(feat_path): + """Load features from .npy file. + """ + feat = {} + if os.path.isdir(feat_path): + for feat_name in os.listdir(feat_path): + feat[os.path.splitext(feat_name)[0]] = np.load( + os.path.join(feat_path, feat_name), mmap_mode="r") + return feat + + self._node_feat = load_feat(os.path.join(path, 'node_feat')) + self._edge_feat = load_feat(os.path.join(path, 'edge_feat')) diff --git a/pgl/graph_kernel.pyx b/pgl/graph_kernel.pyx index c69d4e920b0ea54717996b4dc99e8a954b94d9bd..5e5f289725ae751c105d323207b38e1e7f612a3d 100644 --- a/pgl/graph_kernel.pyx +++ b/pgl/graph_kernel.pyx @@ -53,14 +53,21 @@ def build_index(np.ndarray[np.int64_t, ndim=1] u, _tmp_eid[indptr[u[i]] + count[u[i]]] = i _tmp_u[indptr[u[i]] + count[u[i]]] = u[i] count[u[i]] += 1 + return degree, _tmp_v, _tmp_u, _tmp_eid, indptr - cdef list output_eid = [] - cdef list output_v = [] - for i in xrange(n_size): - output_eid.append(_tmp_eid[indptr[i]:indptr[i+1]]) - output_v.append(_tmp_v[indptr[i]:indptr[i+1]]) - return np.array(output_v), np.array(output_eid), degree, _tmp_u, _tmp_v, _tmp_eid - +@cython.boundscheck(False) +@cython.wraparound(False) +def slice_by_index(np.ndarray[np.int64_t, ndim=1] u, + np.ndarray[np.int64_t, ndim=1] indptr, + np.ndarray[np.int64_t, ndim=1] index): + cdef list output = [] + cdef long long i + cdef long long h = len(index) + cdef long long j + for i in xrange(h): + j = index[i] + output.append(u[indptr[j]:indptr[j+1]]) + return np.array(output) @cython.boundscheck(False) @cython.wraparound(False) @@ -212,7 +219,11 @@ def sample_subset(list nids, long long maxdegree, shuffle=False): output.append(nids[inc]) else: sample_size = buff_size if buff_size <= maxdegree else maxdegree - subset_choose_index(sample_size, nids[inc], rnd, buff_nid, offset) + if isinstance(nids[inc], list): + tmp = np.array(nids[inc], dtype=np.int64) + else: + tmp = nids[inc] + subset_choose_index(sample_size, tmp, rnd, buff_nid, offset) output.append(buff_nid[offset:offset+sample_size]) offset += sample_size return output @@ -245,7 +256,14 @@ def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=Fa output_eid.append(eids[inc]) else: sample_size = buff_size if buff_size <= maxdegree else maxdegree - subset_choose_index_eid(sample_size, nids[inc], eids[inc], rnd, buff_nid, buff_eid, offset) + if isinstance(nids[inc], list): + tmp = np.array(nids[inc], dtype=np.int64) + tmp_eids = np.array(eids[inc], dtype=np.int64) + else: + tmp = nids[inc] + tmp_eids = eids[inc] + + subset_choose_index_eid(sample_size, tmp, tmp_eids, rnd, buff_nid, buff_eid, offset) output.append(buff_nid[offset:offset+sample_size]) output_eid.append(buff_eid[offset:offset+sample_size]) offset += sample_size @@ -253,22 +271,10 @@ def sample_subset_with_eid(list nids, list eids, long long maxdegree, shuffle=Fa 
@cython.boundscheck(False) @cython.wraparound(False) -def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5): - """Return node paris generated by skip-gram algorithm. - - This function will auto remove the pair which src node is the same - as dst node. - - Args: - walk_path: List of nodes as a walk path. - win_size: the windows size used in skip-gram. - - Return: - A tuple of (src node list, dst node list). - """ +def skip_gram_gen_pair(vector[long long] walk, long win_size=5): cdef vector[long long] src cdef vector[long long] dst - cdef long long l = len(walk_path) + cdef long long l = len(walk) cdef long long real_win_size, left, right, i cdef np.ndarray[np.int64_t, ndim=1] rnd = np.random.randint(1, win_size+1, dtype=np.int64, size=l) @@ -282,23 +288,15 @@ def skip_gram_gen_pair(vector[long long] walk_path, long win_size=5): if right >= l: right = l - 1 for j in xrange(left, right+1): - if walk_path[i] == walk_path[j]: + if walk[i] == walk[j]: continue - src.push_back(walk_path[i]) - dst.push_back(walk_path[j]) + src.push_back(walk[i]) + dst.push_back(walk[j]) return src, dst @cython.boundscheck(False) @cython.wraparound(False) def alias_sample_build_table(np.ndarray[np.float64_t, ndim=1] probs): - """Return the alias table and event table for alias sampling. - - Args: - porobs: A list of float numbers as the probability. - - Return: - A tuple of (alias table, event table). - """ cdef long long l = len(probs) cdef np.ndarray[np.float64_t, ndim=1] alias = probs * l cdef np.ndarray[np.int64_t, ndim=1] events = np.zeros(l, dtype=np.int64) diff --git a/pgl/graph_wrapper.py b/pgl/graph_wrapper.py index 797796bd3410d3288fc8134874edee2acdb14665..9545a8210567e55dbadf37251ddf475a5ac9f5f9 100644 --- a/pgl/graph_wrapper.py +++ b/pgl/graph_wrapper.py @@ -36,19 +36,22 @@ def send(src, dst, nfeat, efeat, message_func): return msg -def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, node_ids): +def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, num_nodes, + num_edges): """Recv message from given msg to dst nodes. 
""" + empty_msg_flag = fluid.layers.cast(num_edges > 0, dtype="float32") if reduce_function == "sum": if isinstance(msg, dict): raise TypeError("The message for build-in function" " should be Tensor not dict.") try: - out_dims = msg.shape[-1] - init_output = fluid.layers.fill_constant_batch_size_like( - node_ids, shape=[1, out_dims], value=0, dtype="float32") + out_dim = msg.shape[-1] + init_output = fluid.layers.fill_constant( + shape=[num_nodes, out_dim], value=0, dtype="float32") init_output.stop_gradient = False + msg = msg * empty_msg_flag output = paddle_helper.scatter_add(init_output, dst, msg) return output except TypeError as e: @@ -60,17 +63,16 @@ def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, node_ids): reduce_function = sum_func - # convert msg into lodtensor bucketed_msg = op.nested_lod_reset(msg, bucketing_index) - # Check dim for bucketed_msg equal to out_dims output = reduce_function(bucketed_msg) - out_dims = output.shape[-1] + output_dim = output.shape[-1] + output = output * empty_msg_flag - init_output = fluid.layers.fill_constant_batch_size_like( - node_ids, shape=[1, out_dims], value=0, dtype="float32") - init_output.stop_gradient = False - output = fluid.layers.scatter(init_output, uniq_dst, output) - return output + init_output = fluid.layers.fill_constant( + shape=[num_nodes, output_dim], value=0, dtype="float32") + init_output.stop_gradient = True + final_output = fluid.layers.scatter(init_output, uniq_dst, output) + return final_output class BaseGraphWrapper(object): @@ -89,8 +91,8 @@ class BaseGraphWrapper(object): """ def __init__(self): - self._node_feat_tensor_dict = {} - self._edge_feat_tensor_dict = {} + self.node_feat_tensor_dict = {} + self.edge_feat_tensor_dict = {} self._edges_src = None self._edges_dst = None self._num_nodes = None @@ -98,6 +100,12 @@ class BaseGraphWrapper(object): self._edge_uniq_dst = None self._edge_uniq_dst_count = None self._node_ids = None + self._graph_lod = None + self._num_graph = None + self._data_name_prefix = "" + + def __repr__(self): + return self._data_name_prefix def send(self, message_func, nfeat_list=None, efeat_list=None): """Send message from all src nodes to dst nodes. @@ -190,7 +198,8 @@ class BaseGraphWrapper(object): bucketing_index=self._edge_uniq_dst_count, msg=msg, reduce_function=reduce_function, - node_ids=self._node_ids) + num_edges=self._num_edges, + num_nodes=self._num_nodes) return output @property @@ -212,6 +221,24 @@ class BaseGraphWrapper(object): """ return self._num_nodes + @property + def graph_lod(self): + """Return graph index for graphs + + Return: + A variable with shape [None ] as the Lod information of multiple-graph. + """ + return self._graph_lod + + @property + def num_graph(self): + """Return a variable of number of graphs + + Return: + A variable with shape (1,) as the number of Graphs in int64. + """ + return self._num_graph + @property def edge_feat(self): """Return a dictionary of tensor representing edge features. @@ -220,7 +247,7 @@ class BaseGraphWrapper(object): A dictionary whose keys are the feature names and the values are feature tensor. """ - return self._edge_feat_tensor_dict + return self.edge_feat_tensor_dict @property def node_feat(self): @@ -230,7 +257,7 @@ class BaseGraphWrapper(object): A dictionary whose keys are the feature names and the values are feature tensor. """ - return self._node_feat_tensor_dict + return self.node_feat_tensor_dict def indegree(self): """Return the indegree tensor for all nodes. 
@@ -298,14 +325,13 @@ class StaticGraphWrapper(BaseGraphWrapper): def __init__(self, name, graph, place): super(StaticGraphWrapper, self).__init__() + self._data_name_prefix = name self._initializers = [] - self.__data_name_prefix = name self.__create_graph_attr(graph) def __create_graph_attr(self, graph): """Create graph attributes for paddlepaddle. """ - src, dst = list(zip(*graph.edges)) src, dst, eid = graph.sorted_edges(sort_by="dst") indegree = graph.indegree() nodes = graph.nodes @@ -313,6 +339,17 @@ class StaticGraphWrapper(BaseGraphWrapper): uniq_dst_count = indegree[indegree > 0] uniq_dst_count = np.cumsum(uniq_dst_count, dtype='int32') uniq_dst_count = np.insert(uniq_dst_count, 0, 0) + graph_lod = graph.graph_lod + num_graph = graph.num_graph + + num_edges = len(src) + if num_edges == 0: + # Fake Graph + src = np.array([0], dtype="int64") + dst = np.array([0], dtype="int64") + eid = np.array([0], dtype="int64") + uniq_dst_count = np.array([0, 1], dtype="int32") + uniq_dst = np.array([0], dtype="int64") edge_feat = {} @@ -323,46 +360,66 @@ class StaticGraphWrapper(BaseGraphWrapper): self.__create_graph_node_feat(node_feat, self._initializers) self.__create_graph_edge_feat(edge_feat, self._initializers) + self._num_edges, init = paddle_helper.constant( + dtype="int64", + value=np.array( + [num_edges], dtype="int64"), + name=self._data_name_prefix + '/num_edges') + self._initializers.append(init) + + self._num_graph, init = paddle_helper.constant( + dtype="int64", + value=np.array( + [num_graph], dtype="int64"), + name=self._data_name_prefix + '/num_graph') + self._initializers.append(init) + self._edges_src, init = paddle_helper.constant( dtype="int64", value=src, - name=self.__data_name_prefix + '/edges_src') + name=self._data_name_prefix + '/edges_src') self._initializers.append(init) self._edges_dst, init = paddle_helper.constant( dtype="int64", value=dst, - name=self.__data_name_prefix + '/edges_dst') + name=self._data_name_prefix + '/edges_dst') self._initializers.append(init) self._num_nodes, init = paddle_helper.constant( dtype="int64", hide_batch_size=False, value=np.array([graph.num_nodes]), - name=self.__data_name_prefix + '/num_nodes') + name=self._data_name_prefix + '/num_nodes') self._initializers.append(init) self._edge_uniq_dst, init = paddle_helper.constant( - name=self.__data_name_prefix + "/uniq_dst", + name=self._data_name_prefix + "/uniq_dst", dtype="int64", value=uniq_dst) self._initializers.append(init) self._edge_uniq_dst_count, init = paddle_helper.constant( - name=self.__data_name_prefix + "/uniq_dst_count", + name=self._data_name_prefix + "/uniq_dst_count", dtype="int32", value=uniq_dst_count) self._initializers.append(init) + self._graph_lod, init = paddle_helper.constant( + name=self._data_name_prefix + "/graph_lod", + dtype="int32", + value=graph_lod) + self._initializers.append(init) + node_ids_value = np.arange(0, graph.num_nodes, dtype="int64") self._node_ids, init = paddle_helper.constant( - name=self.__data_name_prefix + "/node_ids", + name=self._data_name_prefix + "/node_ids", dtype="int64", value=node_ids_value) self._initializers.append(init) self._indegree, init = paddle_helper.constant( - name=self.__data_name_prefix + "/indegree", + name=self._data_name_prefix + "/indegree", dtype="int64", value=indegree) self._initializers.append(init) @@ -373,9 +430,9 @@ class StaticGraphWrapper(BaseGraphWrapper): for node_feat_name, node_feat_value in node_feat.items(): node_feat_shape = node_feat_value.shape node_feat_dtype = node_feat_value.dtype - 
self._node_feat_tensor_dict[ + self.node_feat_tensor_dict[ node_feat_name], init = paddle_helper.constant( - name=self.__data_name_prefix + '/node_feat/' + + name=self._data_name_prefix + '/node_feat/' + node_feat_name, dtype=node_feat_dtype, value=node_feat_value) @@ -387,9 +444,9 @@ class StaticGraphWrapper(BaseGraphWrapper): for edge_feat_name, edge_feat_value in edge_feat.items(): edge_feat_shape = edge_feat_value.shape edge_feat_dtype = edge_feat_value.dtype - self._edge_feat_tensor_dict[ + self.edge_feat_tensor_dict[ edge_feat_name], init = paddle_helper.constant( - name=self.__data_name_prefix + '/edge_feat/' + + name=self._data_name_prefix + '/edge_feat/' + edge_feat_name, dtype=edge_feat_dtype, value=edge_feat_value) @@ -477,8 +534,8 @@ class GraphWrapper(BaseGraphWrapper): def __init__(self, name, place, node_feat=[], edge_feat=[]): super(GraphWrapper, self).__init__() # collect holders for PyReader + self._data_name_prefix = name self._holder_list = [] - self.__data_name_prefix = name self._place = place self.__create_graph_attr_holders() for node_feat_name, node_feat_shape, node_feat_dtype in node_feat: @@ -492,52 +549,81 @@ class GraphWrapper(BaseGraphWrapper): def __create_graph_attr_holders(self): """Create data holders for graph attributes. """ + self._num_edges = fluid.layers.data( + self._data_name_prefix + '/num_edges', + shape=[1], + append_batch_size=False, + dtype="int64", + stop_gradient=True) + self._num_graph = fluid.layers.data( + self._data_name_prefix + '/num_graph', + shape=[1], + append_batch_size=False, + dtype="int64", + stop_gradient=True) self._edges_src = fluid.layers.data( - self.__data_name_prefix + '/edges_src', + self._data_name_prefix + '/edges_src', shape=[None], append_batch_size=False, dtype="int64", stop_gradient=True) self._edges_dst = fluid.layers.data( - self.__data_name_prefix + '/edges_dst', + self._data_name_prefix + '/edges_dst', shape=[None], append_batch_size=False, dtype="int64", stop_gradient=True) self._num_nodes = fluid.layers.data( - self.__data_name_prefix + '/num_nodes', + self._data_name_prefix + '/num_nodes', shape=[1], append_batch_size=False, dtype='int64', stop_gradient=True) + self._edge_uniq_dst = fluid.layers.data( - self.__data_name_prefix + "/uniq_dst", + self._data_name_prefix + "/uniq_dst", shape=[None], append_batch_size=False, dtype="int64", stop_gradient=True) + + self._graph_lod = fluid.layers.data( + self._data_name_prefix + "/graph_lod", + shape=[None], + append_batch_size=False, + dtype="int32", + stop_gradient=True) + self._edge_uniq_dst_count = fluid.layers.data( - self.__data_name_prefix + "/uniq_dst_count", + self._data_name_prefix + "/uniq_dst_count", shape=[None], append_batch_size=False, dtype="int32", stop_gradient=True) + self._node_ids = fluid.layers.data( - self.__data_name_prefix + "/node_ids", + self._data_name_prefix + "/node_ids", shape=[None], append_batch_size=False, dtype="int64", stop_gradient=True) self._indegree = fluid.layers.data( - self.__data_name_prefix + "/indegree", + self._data_name_prefix + "/indegree", shape=[None], append_batch_size=False, dtype="int64", stop_gradient=True) self._holder_list.extend([ - self._edges_src, self._edges_dst, self._num_nodes, - self._edge_uniq_dst, self._edge_uniq_dst_count, self._node_ids, - self._indegree + self._edges_src, + self._edges_dst, + self._num_nodes, + self._edge_uniq_dst, + self._edge_uniq_dst_count, + self._node_ids, + self._indegree, + self._graph_lod, + self._num_graph, + self._num_edges, ]) def __create_graph_node_feat_holders(self, 
node_feat_name, node_feat_shape, @@ -545,12 +631,12 @@ class GraphWrapper(BaseGraphWrapper): """Create data holders for node features. """ feat_holder = fluid.layers.data( - self.__data_name_prefix + '/node_feat/' + node_feat_name, + self._data_name_prefix + '/node_feat/' + node_feat_name, shape=node_feat_shape, append_batch_size=False, dtype=node_feat_dtype, stop_gradient=True) - self._node_feat_tensor_dict[node_feat_name] = feat_holder + self.node_feat_tensor_dict[node_feat_name] = feat_holder self._holder_list.append(feat_holder) def __create_graph_edge_feat_holders(self, edge_feat_name, edge_feat_shape, @@ -558,12 +644,12 @@ class GraphWrapper(BaseGraphWrapper): """Create edge holders for edge features. """ feat_holder = fluid.layers.data( - self.__data_name_prefix + '/edge_feat/' + edge_feat_name, + self._data_name_prefix + '/edge_feat/' + edge_feat_name, shape=edge_feat_shape, append_batch_size=False, dtype=edge_feat_dtype, stop_gradient=True) - self._edge_feat_tensor_dict[edge_feat_name] = feat_holder + self.edge_feat_tensor_dict[edge_feat_name] = feat_holder self._holder_list.append(feat_holder) def to_feed(self, graph): @@ -583,10 +669,22 @@ class GraphWrapper(BaseGraphWrapper): src, dst, eid = graph.sorted_edges(sort_by="dst") indegree = graph.indegree() nodes = graph.nodes + num_edges = len(src) uniq_dst = nodes[indegree > 0] uniq_dst_count = indegree[indegree > 0] uniq_dst_count = np.cumsum(uniq_dst_count, dtype='int32') uniq_dst_count = np.insert(uniq_dst_count, 0, 0) + num_graph = graph.num_graph + graph_lod = graph.graph_lod + + if num_edges == 0: + # Fake Graph + src = np.array([0], dtype="int64") + dst = np.array([0], dtype="int64") + eid = np.array([0], dtype="int64") + + uniq_dst_count = np.array([0, 1], dtype="int32") + uniq_dst = np.array([0], dtype="int64") edge_feat = {} @@ -594,20 +692,27 @@ class GraphWrapper(BaseGraphWrapper): edge_feat[key] = value[eid] node_feat = graph.node_feat - feed_dict[self.__data_name_prefix + '/edges_src'] = src - feed_dict[self.__data_name_prefix + '/edges_dst'] = dst - feed_dict[self.__data_name_prefix + '/num_nodes'] = np.array(graph.num_nodes) - feed_dict[self.__data_name_prefix + '/uniq_dst'] = uniq_dst - feed_dict[self.__data_name_prefix + '/uniq_dst_count'] = uniq_dst_count - feed_dict[self.__data_name_prefix + '/node_ids'] = graph.nodes - feed_dict[self.__data_name_prefix + '/indegree'] = indegree - - for key in self._node_feat_tensor_dict: - feed_dict[self.__data_name_prefix + '/node_feat/' + + feed_dict[self._data_name_prefix + '/num_edges'] = np.array( + [num_edges], dtype="int64") + feed_dict[self._data_name_prefix + '/edges_src'] = src + feed_dict[self._data_name_prefix + '/edges_dst'] = dst + feed_dict[self._data_name_prefix + '/num_nodes'] = np.array( + [graph.num_nodes], dtype="int64") + feed_dict[self._data_name_prefix + '/uniq_dst'] = uniq_dst + feed_dict[self._data_name_prefix + '/uniq_dst_count'] = uniq_dst_count + feed_dict[self._data_name_prefix + '/node_ids'] = graph.nodes + feed_dict[self._data_name_prefix + '/indegree'] = indegree + feed_dict[self._data_name_prefix + '/graph_lod'] = graph_lod + feed_dict[self._data_name_prefix + '/num_graph'] = np.array( + [num_graph], dtype="int64") + feed_dict[self._data_name_prefix + '/indegree'] = indegree + + for key in self.node_feat_tensor_dict: + feed_dict[self._data_name_prefix + '/node_feat/' + key] = node_feat[key] - for key in self._edge_feat_tensor_dict: - feed_dict[self.__data_name_prefix + '/edge_feat/' + + for key in self.edge_feat_tensor_dict: + 
feed_dict[self._data_name_prefix + '/edge_feat/' + key] = edge_feat[key] return feed_dict diff --git a/pgl/heter_graph.py b/pgl/heter_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..b473003f51c9d62b9b7389fbba5c7cfbac84aa0a --- /dev/null +++ b/pgl/heter_graph.py @@ -0,0 +1,462 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + This package implement Heterogeneous Graph structure for handling Heterogeneous graph data. +""" +import time +import numpy as np +import pickle as pkl +import time +import pgl.graph_kernel as graph_kernel +from pgl.graph import Graph + +__all__ = ['HeterGraph', 'SubHeterGraph'] + + +def _hide_num_nodes(shape): + """Set the first dimension as unknown + """ + shape = list(shape) + shape[0] = None + return shape + + +class HeterGraph(object): + """Implementation of heterogeneous graph structure in pgl + + This is a simple implementation of heterogeneous graph structure in pgl. + + Args: + num_nodes: number of nodes in a heterogeneous graph + edges: dict, every element in dict is a list of (u, v) tuples. + node_types (optional): list of (u, node_type) tuples to specify the node type of every node + node_feat (optional): a dict of numpy array as node features + edge_feat (optional): a dict of dict as edge features for every edge type + + Examples: + .. code-block:: python + + import numpy as np + num_nodes = 4 + node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')] + edges = { + 'edges_type1': [(0,1), (3,2)], + 'edges_type2': [(1,2), (3,1)], + } + node_feat = {'feature': np.random.randn(4, 16)} + edges_feat = { + 'edges_type1': {'h': np.random.randn(2, 16)}, + 'edges_type2': {'h': np.random.randn(2, 16)}, + } + + g = heter_graph.HeterGraph( + num_nodes=num_nodes, + edges=edges, + node_types=node_types, + node_feat=node_feat, + edge_feat=edges_feat) + """ + + def __init__(self, + num_nodes, + edges, + node_types=None, + node_feat=None, + edge_feat=None): + self._num_nodes = num_nodes + self._edges_dict = edges + + if isinstance(node_types, list): + self._node_types = np.array(node_types, dtype=object)[:, 1] + else: + self._node_types = node_types + + self._nodes_type_dict = {} + for n_type in np.unique(self._node_types): + self._nodes_type_dict[n_type] = np.where( + self._node_types == n_type)[0] + + if node_feat is not None: + self._node_feat = node_feat + else: + self._node_feat = {} + + if edge_feat is not None: + self._edge_feat = edge_feat + else: + self._edge_feat = {} + + self._multi_graph = {} + + for key, value in self._edges_dict.items(): + if not self._edge_feat: + edge_feat = None + else: + edge_feat = self._edge_feat[key] + + self._multi_graph[key] = Graph( + num_nodes=self._num_nodes, + edges=value, + node_feat=self._node_feat, + edge_feat=edge_feat) + + self._edge_types = self.edge_types_info() + + @property + def edge_types(self): + """Return a list of edge types. 
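The wrapper changes above add three batch-level inputs, `num_edges`, `num_graph` and `graph_lod`, plus a one-node "fake graph" fallback so that an empty edge list still yields valid tensors. A minimal numpy sketch of the assumed `graph_lod` layout, i.e. cumulative node offsets of the graphs in a batch, which the `graph_pooling` layer later in this diff resets node features against; the node counts and feature size here are made up for illustration:

```python
import numpy as np

# Hypothetical batch of three graphs with 3, 2 and 4 nodes respectively.
num_nodes_per_graph = [3, 2, 4]

# graph_lod is assumed to hold the cumulative node offsets: [0, 3, 5, 9].
graph_lod = np.insert(np.cumsum(num_nodes_per_graph, dtype="int32"), 0, 0)

# Fake node features for the whole batch (9 nodes, feature size 4).
node_feat = np.random.randn(graph_lod[-1], 4).astype("float32")

# What sequence_pool(pool_type="sum") computes per graph, written out in numpy.
graph_feat = np.stack([
    node_feat[graph_lod[i]:graph_lod[i + 1]].sum(axis=0)
    for i in range(len(num_nodes_per_graph))
])
print(graph_lod)         # [0 3 5 9]
print(graph_feat.shape)  # (3, 4)
```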
+ """ + return self._edge_types + + @property + def num_nodes(self): + """Return the number of nodes. + """ + return self._num_nodes + + @property + def num_edges(self): + """Return edges number of all edge types. + """ + n_edges = {} + for e_type in self._edge_types: + n_edges[e_type] = self._multi_graph[e_type].num_edges + return n_edges + + @property + def node_types(self): + """Return the node types. + """ + return self._node_types + + @property + def edge_feat(self, edge_type=None): + """Return edge features of all edge types. + """ + return self._edge_feat + + @property + def node_feat(self): + """Return a dictionary of node features. + """ + return self._node_feat + + @property + def nodes(self): + """Return all nodes id from 0 to :code:`num_nodes - 1` + """ + return np.arange(self._num_nodes, dtype='int64') + + def __getitem__(self, edge_type): + """__getitem__ + """ + return self._multi_graph[edge_type] + + def num_nodes_by_type(self, n_type=None): + """Return the number of nodes with the specified node type. + """ + if n_type not in self._nodes_type_dict: + raise ("%s is not in valid node type" % n_type) + else: + return len(self._nodes_type_dict[n_type]) + + def indegree(self, nodes=None, edge_type=None): + """Return the indegree of the given nodes with the specified edge_type. + + Args: + nodes: Return the indegree of given nodes. + if nodes is None, return indegree for all nodes. + + edge_types: Return the indegree with specified edge_type. + if edge_type is None, return the total indegree of the given nodes. + + Return: + A numpy.ndarray as the given nodes' indegree. + """ + if edge_type is None: + indegrees = [] + for e_type in self._edge_types: + indegrees.append(self._multi_graph[e_type].indegree(nodes)) + indegrees = np.sum(np.vstack(indegrees), axis=0) + return indegrees + else: + return self._multi_graph[edge_type].indegree(nodes) + + def outdegree(self, nodes=None, edge_type=None): + """Return the outdegree of the given nodes with the specified edge_type. + + Args: + nodes: Return the outdegree of given nodes, + if nodes is None, return outdegree for all nodes + + edge_types: Return the outdegree with specified edge_type. + if edge_type is None, return the total outdegree of the given nodes. + + Return: + A numpy.array as the given nodes' outdegree. + """ + if edge_type is None: + outdegrees = [] + for e_type in self._edge_types: + outdegrees.append(self._multi_graph[e_type].outdegree(nodes)) + outdegrees = np.sum(np.vstack(outdegrees), axis=0) + return outdegrees + else: + return self._multi_graph[edge_type].outdegree(nodes) + + def successor(self, edge_type, nodes=None, return_eids=False): + """Find successor of given nodes with the specified edge_type. + + Args: + nodes: Return the successor of given nodes, + if nodes is None, return successor for all nodes + + edge_types: Return the successor with specified edge_type. + if edge_type is None, return the total successor of the given nodes + and eids are invalid in this way. + + return_eids: If True return nodes together with corresponding eid + """ + return self._multi_graph[edge_type].successor(nodes, return_eids) + + def sample_successor(self, + edge_type, + nodes, + max_degree, + return_eids=False, + shuffle=False): + """Sample successors of given nodes with the specified edge_type. + + Args: + edge_type: The specified edge_type. + + nodes: Given nodes whose successors will be sampled. + + max_degree: The max sampled successors for each nodes. + + return_eids: Whether to return the corresponding eids. 
+ + Return: + + Return a list of numpy.ndarray and each numpy.ndarray represent a list + of sampled successor ids for given nodes with specified edge type. + If :code:`return_eids=True`, there will be an additional list of + numpy.ndarray and each numpy.ndarray represent a list of eids that + connected nodes to their successors. + """ + return self._multi_graph[edge_type].sample_successor( + nodes=nodes, + max_degree=max_degree, + return_eids=return_eids, + shuffle=shuffle) + + def predecessor(self, edge_type, nodes=None, return_eids=False): + """Find predecessor of given nodes with the specified edge_type. + + Args: + nodes: Return the predecessor of given nodes, + if nodes is None, return predecessor for all nodes + + edge_types: Return the predecessor with specified edge_type. + + return_eids: If True return nodes together with corresponding eid + """ + return self._multi_graph[edge_type].predecessor(nodes, return_eids) + + def sample_predecessor(self, + edge_type, + nodes, + max_degree, + return_eids=False, + shuffle=False): + """Sample predecessors of given nodes with the specified edge_type. + + Args: + edge_type: The specified edge_type. + + nodes: Given nodes whose predecessors will be sampled. + + max_degree: The max sampled predecessors for each nodes. + + return_eids: Whether to return the corresponding eids. + + Return: + + Return a list of numpy.ndarray and each numpy.ndarray represent a list + of sampled predecessor ids for given nodes with specified edge type. + If :code:`return_eids=True`, there will be an additional list of + numpy.ndarray and each numpy.ndarray represent a list of eids that + connected nodes to their predecessors. + """ + return self._multi_graph[edge_type].sample_predecessor( + nodes=nodes, + max_degree=max_degree, + return_eids=return_eids, + shuffle=shuffle) + + def node_batch_iter(self, batch_size, shuffle=True, n_type=None): + """Node batch iterator + + Iterate all nodes by batch with the specified node type. + + Args: + batch_size: The batch size of each batch of nodes. + + shuffle: Whether shuffle the nodes. + + n_type: Iterate the nodes with the specified node type. If n_type is None, + iterate all nodes by batch. + + Return: + Batch iterator + """ + if n_type is None: + nodes = np.arange(self._num_nodes, dtype="int64") + else: + nodes = self._nodes_type_dict[n_type] + + if shuffle: + np.random.shuffle(nodes) + start = 0 + while start < len(nodes): + yield nodes[start:start + batch_size] + start += batch_size + + def sample_nodes(self, sample_num, n_type=None): + """Sample nodes with the specified n_type from the graph + + This function helps to sample nodes with the specified n_type from the graph. + If n_type is None, this function will sample nodes from all nodes. + Nodes might be duplicated. + + Args: + sample_num: The number of samples + n_type: The nodes of type to be sampled + + Return: + A list of nodes + """ + if n_type is not None: + return np.random.choice( + self._nodes_type_dict[n_type], size=sample_num) + else: + return np.random.randint( + low=0, high=self._num_nodes, size=sample_num) + + def node_feat_info(self): + """Return the information of node feature for HeterGraphWrapper. + + This function return the information of node features of all node types. And this + function is used to help constructing HeterGraphWrapper + + Return: + A list of tuple (name, shape, dtype) for all given node feature. 
+ + """ + node_feat_info = [] + for feat_name, feat in self._node_feat.items(): + node_feat_info.append( + (feat_name, _hide_num_nodes(feat.shape), feat.dtype)) + + return node_feat_info + + def edge_feat_info(self): + """Return the information of edge feature for HeterGraphWrapper. + + This function return the information of edge features of all edge types. And this + function is used to help constructing HeterGraphWrapper + + Return: + A dict of list of tuple (name, shape, dtype) for all given edge feature. + + """ + edge_feat_info = {} + for edge_type_name, feat_dict in self._edge_feat.items(): + tmp_edge_feat_info = [] + for feat_name, feat in feat_dict.items(): + full_name = feat_name + tmp_edge_feat_info.append( + (full_name, _hide_num_nodes(feat.shape), feat.dtype)) + edge_feat_info[edge_type_name] = tmp_edge_feat_info + return edge_feat_info + + def edge_types_info(self): + """Return the information of all edge types. + + Return: + A list of all edge types. + + """ + edge_types_info = [] + for key, _ in self._edges_dict.items(): + edge_types_info.append(key) + + return edge_types_info + + +class SubHeterGraph(HeterGraph): + """Implementation of SubHeterGraph in pgl. + + SubHeterGraph is inherit from :code:`HeterGraph`. + + Args: + num_nodes: number of nodes in a heterogeneous graph + edges: dict, every element in dict is a list of (u, v) tuples. + node_types (optional): list of (u, node_type) tuples to specify the node type of every node + node_feat (optional): a dict of numpy array as node features + edge_feat (optional): a dict of dict as edge features for every edge type + + reindex: A dictionary that maps parent hetergraph node id to subhetergraph node id. + """ + + def __init__(self, + num_nodes, + edges, + node_types=None, + node_feat=None, + edge_feat=None, + reindex=None): + super(SubHeterGraph, self).__init__( + num_nodes=num_nodes, + edges=edges, + node_types=node_types, + node_feat=node_feat, + edge_feat=edge_feat) + + if reindex is None: + reindex = {} + self._from_reindex = reindex + self._to_reindex = {u: v for v, u in reindex.items()} + + def reindex_from_parrent_nodes(self, nodes): + """Map the given parent graph node id to subgraph id. + + Args: + nodes: A list of nodes from parent graph. + + Return: + A list of subgraph ids. + """ + return graph_kernel.map_nodes(nodes, self._from_reindex) + + def reindex_to_parrent_nodes(self, nodes): + """Map the given subgraph node id to parent graph id. + + Args: + nodes: A list of nodes in this subgraph. + + Return: + A list of node ids in parent graph. 
+ """ + return graph_kernel.map_nodes(nodes, self._to_reindex) diff --git a/pgl/contrib/heter_graph_wrapper.py b/pgl/heter_graph_wrapper.py similarity index 98% rename from pgl/contrib/heter_graph_wrapper.py rename to pgl/heter_graph_wrapper.py index 5282bfa51c7deb5776400d557dbca917fb37f04f..3f16efb892dfdda94e96e3bfb38068cd4f9ca82f 100644 --- a/pgl/contrib/heter_graph_wrapper.py +++ b/pgl/heter_graph_wrapper.py @@ -64,8 +64,8 @@ class HeterGraphWrapper(object): import paddle.fluid as fluid import numpy as np - from pgl.contrib import heter_graph - from pgl.contrib import heter_graph_wrapper + from pgl import heter_graph + from pgl import heter_graph_wrapper num_nodes = 4 node_types = [(0, 'user'), (1, 'item'), (2, 'item'), (3, 'user')] edges = { diff --git a/pgl/layers/__init__.py b/pgl/layers/__init__.py index a88c9b69b33335e091d5713400395f7c003361b4..efc27aa5bda6316348c7c65d6d714de70584b1dc 100644 --- a/pgl/layers/__init__.py +++ b/pgl/layers/__init__.py @@ -18,7 +18,10 @@ from pgl.layers import conv from pgl.layers.conv import * from pgl.layers import set2set from pgl.layers.set2set import * +from pgl.layers import graph_pool +from pgl.layers.graph_pool import * __all__ = [] __all__ += conv.__all__ __all__ += set2set.__all__ +__all__ += graph_pool.__all__ diff --git a/pgl/layers/graph_pool.py b/pgl/layers/graph_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..a88468f7b6ef12131c4554db9339c35472a66f11 --- /dev/null +++ b/pgl/layers/graph_pool.py @@ -0,0 +1,42 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This package implements common layers to help building +graph neural networks. +""" +import paddle.fluid as fluid +from pgl import graph_wrapper +from pgl.utils import paddle_helper +from pgl.utils import op + +__all__ = ['graph_pooling'] + + +def graph_pooling(gw, node_feat, pool_type): + """Implementation of graph pooling + + This is an implementation of graph pooling + + Args: + gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`) + + node_feat: A tensor with shape (num_nodes, feature_size). + + pool_type: The type of pooling ("sum", "average" , "min") + + Return: + A tensor with shape (num_graph, hidden_size) + """ + graph_feat = op.nested_lod_reset(node_feat, gw.graph_lod) + graph_feat = fluid.layers.sequence_pool(graph_feat, pool_type) + return graph_feat diff --git a/pgl/redis_hetergraph.py b/pgl/redis_hetergraph.py new file mode 100644 index 0000000000000000000000000000000000000000..5bdd40f862e0fc11c79a8f251114aa144b27cc8c --- /dev/null +++ b/pgl/redis_hetergraph.py @@ -0,0 +1,272 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""redis_hetergraph""" + +import pgl +import redis +from redis import BlockingConnectionPool, StrictRedis +from redis._compat import b, unicode, bytes, long, basestring +from rediscluster.nodemanager import NodeManager +from rediscluster.crc import crc16 +from collections import OrderedDict +import threading +import numpy as np +import time +import json +import pgl.graph as pgraph +import pickle as pkl +from pgl.utils.logger import log +import pgl.graph_kernel as graph_kernel +from pgl import heter_graph +import pgl.redis_graph as rg + + +class RedisHeterGraph(rg.RedisGraph): + """Redis Heterogeneous Graph""" + + def __init__(self, name, edge_types, redis_config, num_parts): + super(RedisHeterGraph, self).__init__(name, redis_config, num_parts) + self._num_edges = {} + self.edge_types = edge_types + self.e_type = None + + self._edge_feat_info = {} + self._edge_feat_dtype = {} + self._edge_feat_shape = {} + + def num_edges_by_type(self, e_type): + """get edge number by specified edge type""" + if e_type not in self._num_edges: + self._num_edges[e_type] = int( + self._rs.get("%s:num_edges" % e_type)) + + return self._num_edges[e_type] + + def num_edges(self): + """num_edges""" + num_edges = {} + for e_type in self.edge_types: + num_edges[e_type] = self.num_edges_by_type(e_type) + + return num_edges + + def edge_feat_info_by_type(self, e_type): + """get edge features information by specified edge type""" + if e_type not in self._edge_feat_info: + buff = self._rs.get("%s:ef:infos" % e_type) + if buff is not None: + self._edge_feat_info[e_type] = json.loads(buff.decode()) + else: + self._edge_feat_info[e_type] = [] + return self._edge_feat_info[e_type] + + def edge_feat_info(self): + """edge_feat_info""" + edge_feat_info = {} + for e_type in self.edge_types: + edge_feat_info[e_type] = self.edge_feat_info_by_type(e_type) + return edge_feat_info + + def edge_feat_shape(self, e_type, key): + """edge_feat_shape""" + if e_type not in self._edge_feat_shape: + e_feat_shape = {} + for k, shape, _ in self.edge_feat_info()[e_type]: + e_feat_shape[k] = shape + self._edge_feat_shape[e_type] = e_feat_shape + return self._edge_feat_shape[e_type][key] + + def edge_feat_dtype(self, e_type, key): + """edge_feat_dtype""" + if e_type not in self._edge_feat_dtype: + e_feat_dtype = {} + for k, _, dtype in self.edge_feat_info()[e_type]: + e_feat_dtype[k] = dtype + self._edge_feat_dtype[e_type] = e_feat_dtype + return self._edge_feat_dtype[e_type][key] + + def sample_predecessor(self, e_type, nodes, max_degree, return_eids=False): + """sample predecessor with the specified edge type""" + query = ["%s:d:%s" % (e_type, n) for n in nodes] + rets = rg.hmget_sample_helper(self._rs, query, self.num_parts, + max_degree) + v = [] + eid = [] + for buff in rets: + if buff is None: + v.append(np.array([], dtype="int64")) + eid.append(np.array([], dtype="int64")) + else: + npret = np.frombuffer( + buff, dtype="int64").reshape([-1, 2]).astype("int64") + v.append(npret[:, 0]) + eid.append(npret[:, 1]) + if return_eids: + return np.array(v), np.array(eid) + else: + return np.array(v) + + def sample_successor(self, e_type, 
nodes, max_degree, return_eids=False): + """sample successor with the specified edge type""" + query = ["%s:s:%s" % (e_type, n) for n in nodes] + rets = rg.hmget_sample_helper(self._rs, query, self.num_parts, + max_degree) + v = [] + eid = [] + for buff in rets: + if buff is None: + v.append(np.array([], dtype="int64")) + eid.append(np.array([], dtype="int64")) + else: + npret = np.frombuffer( + buff, dtype="int64").reshape([-1, 2]).astype("int64") + v.append(npret[:, 0]) + eid.append(npret[:, 1]) + if return_eids: + return np.array(v), np.array(eid) + else: + return np.array(v) + + def predecessor(self, e_type, nodes, return_eids=False): + """predecessor with the specified edge type""" + query = ["%s:d:%s" % (e_type, n) for n in nodes] + ret = rg.hmget_helper(self._rs, query, self.num_parts) + v = [] + eid = [] + for buff in ret: + if buff is not None: + npret = np.frombuffer( + buff, dtype="int64").reshape([-1, 2]).astype("int64") + v.append(npret[:, 0]) + eid.append(npret[:, 1]) + else: + v.append(np.array([], dtype="int64")) + eid.append(np.array([], dtype="int64")) + if return_eids: + return np.array(v), np.array(eid) + else: + return np.array(v) + + def successor(self, e_type, nodes, return_eids=False): + """successor with the specified edge type""" + query = ["%s:s:%s" % (e_type, n) for n in nodes] + ret = rg.hmget_helper(self._rs, query, self.num_parts) + v = [] + eid = [] + for buff in ret: + if buff is not None: + npret = np.frombuffer( + buff, dtype="int64").reshape([-1, 2]).astype("int64") + v.append(npret[:, 0]) + eid.append(npret[:, 1]) + else: + v.append(np.array([], dtype="int64")) + eid.append(np.array([], dtype="int64")) + if return_eids: + return np.array(v), np.array(eid) + else: + return np.array(v) + + def get_edges_by_id(self, e_type, eids): + """get_edges_by_id""" + queries = ["%s:e:%s" % (e_type, e) for e in eids] + ret = rg.hmget_helper(self._rs, queries, self.num_parts) + o = np.asarray(ret, dtype="int64") + dst = o % self.num_nodes + src = o // self.num_nodes + data = np.hstack( + [src.reshape([-1, 1]), dst.reshape([-1, 1])]).astype("int64") + return data + + def get_edge_feat_by_id(self, e_type, key, eids): + """get_edge_feat_by_id""" + queries = ["%s:ef:%s:%i" % (e_type, key, e) for e in eids] + ret = rg.hmget_helper(self._rs, queries, self.num_parts) + if ret is None: + return None + else: + ret = b"".join(ret) + data = np.frombuffer(ret, dtype=self.edge_feat_dtype(e_type, key)) + data = data.reshape(self.edge_feat_shape(e_type, key)) + return data + + def get_node_types(self, nodes): + """get_node_types """ + queries = ["nt:%i" % n for n in nodes] + ret = rg.hmget_helper(self._rs, queries, self.num_parts) + node_types = [] + for buff in ret: + if buff: + node_types.append(buff.decode()) + else: + node_types = None + return node_types + + def subgraph(self, nodes, eid, edges=None): + """Generate heterogeneous subgraph with nodes and edge ids. + + WARNING: ALL NODES IN EID MUST BE INCLUDED BY NODES + + Args: + nodes: Node ids which will be included in the subgraph. + + eid: Edge ids which will be included in the subgraph. + + Return: + A :code:`pgl.heter_graph.Subgraph` object. 
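`RedisHeterGraph` reads every adjacency buffer with `np.frombuffer(...).reshape([-1, 2])`, i.e. it assumes each value stored under keys like `"<e_type>:s:<node>"` is a flat int64 buffer of (neighbor id, edge id) pairs. A small round-trip sketch of that assumed layout with made-up ids:

```python
import numpy as np

# Assumed storage layout: pairs of (neighbor_id, edge_id) packed as int64.
pairs = np.array([[7, 101], [9, 102], [12, 103]], dtype="int64")
buff = pairs.tobytes()  # what would be stored under e.g. "p2a:s:<node>"

# Decoding, as the successor/predecessor queries above do.
decoded = np.frombuffer(buff, dtype="int64").reshape([-1, 2])
neighbors, eids = decoded[:, 0], decoded[:, 1]
print(neighbors)  # [ 7  9 12]
print(eids)       # [101 102 103]
```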
+ """ + reindex = {} + + for ind, node in enumerate(nodes): + reindex[node] = ind + + _node_types = self.get_node_types(nodes) + if _node_types is None: + node_types = None + else: + node_types = [] + for idx, t in zip(nodes, _node_types): + node_types.append([reindex[idx], t]) + + if edges is None: + edges = {} + for e_type, eid_list in eid.items(): + edges[e_type] = self.get_edges_by_id(e_type, eid_list) + + sub_edges = {} + for e_type, edges_list in edges.items(): + sub_edges[e_type] = graph_kernel.map_edges( + np.arange( + len(edges_list), dtype="int64"), edges_list, reindex) + + sub_edge_feat = {} + for e_type, edge_feat_info in self.edge_feat_info().items(): + type_edge_feat = {} + for key, _, _ in edge_feat_info: + type_edge_feat[key] = self.get_edge_feat_by_id(e_type, key, + eid) + sub_edge_feat[e_type] = type_edge_feat + + sub_node_feat = {} + for key, _, _ in self.node_feat_info(): + sub_node_feat[key] = self.get_node_feat_by_id(key, nodes) + + subgraph = heter_graph.SubHeterGraph( + num_nodes=len(nodes), + edges=sub_edges, + node_types=node_types, + node_feat=sub_node_feat, + edge_feat=sub_edge_feat, + reindex=reindex) + return subgraph diff --git a/pgl/sample.py b/pgl/sample.py index b0b13bb3150276f6134fb8fdadd0fa9f62a48937..89c1d1ba532fa4123b6b088f8220a61cd0fbdb9e 100644 --- a/pgl/sample.py +++ b/pgl/sample.py @@ -24,10 +24,29 @@ from pgl import graph_kernel __all__ = [ 'graphsage_sample', 'node2vec_sample', 'deepwalk_sample', - 'metapath_randomwalk' + 'metapath_randomwalk', 'pinsage_sample' ] +def traverse(item): + """traverse the list or numpy""" + if isinstance(item, list) or isinstance(item, np.ndarray): + for i in iter(item): + for j in traverse(i): + yield j + else: + yield item + + +def flat_node_and_edge(nodes, eids, weights=None): + """flatten the sub-lists to one list""" + nodes = list(set(traverse(nodes))) + eids = list(traverse(eids)) + if weights is not None: + weights = list(traverse(weights)) + return nodes, eids, weights + + def edge_hash(src, dst): """edge_hash """ @@ -37,7 +56,7 @@ def edge_hash(src, dst): def graphsage_sample(graph, nodes, samples, ignore_edges=[]): """Implement of graphsage sample. - Reference paper: https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf. + Reference paper: https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf. 
Args: graph: A pgl graph instance @@ -65,7 +84,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]): continue batch_pred_nodes, batch_pred_eids = graph.sample_predecessor( start_nodes, samples[layer_idx], return_eids=True) - log.debug("sample_predecessor time: %s" % (time.time() - start)) start = time.time() last_nodes_set = nodes_set @@ -88,7 +106,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]): start_nodes = list(nodes_set - last_nodes_set) layer_nodes = [nodes] + layer_nodes layer_eids = [eids] + layer_eids - log.debug("flat time: %s" % (time.time() - start)) start = time.time() # Find new nodes @@ -102,7 +119,6 @@ def graphsage_sample(graph, nodes, samples, ignore_edges=[]): # only for this task subgraphs[i].node_feat["index"] = np.array( layer_nodes[0], dtype="int64") - log.debug("subgraph time: %s" % (time.time() - start)) return subgraphs @@ -256,43 +272,207 @@ def node2vec_sample(graph, nodes, max_depth, p=1.0, q=1.0): return walk -def metapath_randomwalk(graph, start_node, metapath, walk_length): +def metapath_randomwalk(graph, + start_nodes, + metapath, + walk_length, + alias_name=None, + events_name=None): """Implementation of metapath random walk in heterogeneous graph. Args: graph: instance of pgl heterogeneous graph - start_node: start node to generate walk + start_nodes: start nodes to generate walk metapath: meta path for sample nodes. - e.g: "user-item-user" + e.g: "c2p-p2a-a2p-p2c" walk_length: the walk length Return: - a list of metapath walk, each element is a node id. + a list of metapath walks. """ - np.random.seed() + + edge_types = metapath.split('-') + walk = [] - metapath = metapath.split('-') - assert metapath[0] == metapath[ - -1], "The last meta path item should be the same as the first one" - mp_len = len(metapath) - 1 - - walk.append(start_node) - for i in range(1, walk_length): - cur_node = walk[-1] - succs = graph.successor(cur_node) - if succs.size > 0: - succs_node_types = graph._node_types[succs] + for node in start_nodes: + walk.append([node]) + + cur_walk_ids = np.arange(0, len(start_nodes)) + cur_nodes = np.array(start_nodes) + mp_len = len(edge_types) + for i in range(0, walk_length - 1): + g = graph[edge_types[i % mp_len]] + + cur_succs = g.successor(cur_nodes) + mask = [len(succ) > 0 for succ in cur_succs] + + if np.any(mask): + cur_walk_ids = cur_walk_ids[mask] + cur_nodes = cur_nodes[mask] + cur_succs = cur_succs[mask] else: - # no successor of current node + # stop when all nodes have no successor break - succs_nodes = succs[np.where(succs_node_types == metapath[i % mp_len])[ - 0]] - if succs_nodes.size > 0: - walk.append(np.random.choice(succs_nodes)) + if alias_name is not None and events_name is not None: + sample_index = [ + alias_sample([1], g.node_feat[alias_name][node], + g.node_feat[events_name][node])[0] + for node in cur_nodes + ] else: - # no successor of such node type - break + outdegree = [len(cur_succ) for cur_succ in cur_succs] + sample_index = np.floor( + np.random.rand(cur_succs.shape[0]) * outdegree).astype("int64") + + nxt_cur_nodes = [] + for s, ind, walk_id in zip(cur_succs, sample_index, cur_walk_ids): + walk[walk_id].append(s[ind]) + nxt_cur_nodes.append(s[ind]) + cur_nodes = np.array(nxt_cur_nodes) + + return walk + + +def random_walk_with_start_prob(graph, nodes, max_depth, proba=0.5): + """Implement of random walk with the probability of returning the origin node. + + This function get random walks path for given nodes and depth. 
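The rewritten `metapath_randomwalk` advances all live walks in one vectorized step per hop: walks whose current node has no successor under the current edge type are dropped, then one successor is drawn per remaining walk, uniformly unless alias tables are supplied via `alias_name`/`events_name`. The uniform draw is just `floor(rand * outdegree)`; a numpy sketch of that single step with made-up successor lists:

```python
import numpy as np

# Made-up successor lists for the current frontier of three walks.
cur_succs = np.array([[4, 11, 12], [], [14, 10]], dtype=object)

# Walks whose node has no successor are dropped before sampling.
mask = np.array([len(succ) > 0 for succ in cur_succs])
live_succs = cur_succs[mask]

# Uniform pick: floor(rand * outdegree) indexes one successor per live walk.
outdegree = np.array([len(succ) for succ in live_succs])
sample_index = np.floor(
    np.random.rand(live_succs.shape[0]) * outdegree).astype("int64")
next_nodes = [succ[ind] for succ, ind in zip(live_succs, sample_index)]
print(next_nodes)  # e.g. [11, 14]
```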
+ + Args: + nodes: Walk starting from nodes + max_depth: Max walking depth + proba: the proba to return the origin node + + Return: + A list of walks. + """ + walk = [] + # init + for node in nodes: + walk.append([node]) + + walk_ids = np.arange(0, len(nodes)) + cur_nodes = np.array(nodes) + nodes = np.array(nodes) + for l in range(max_depth): + # select the walks not end + if l >= 1: + return_proba = np.random.rand(cur_nodes.shape[0]) + proba_mask = (return_proba < proba) + cur_nodes[proba_mask] = nodes[proba_mask] + outdegree = graph.outdegree(cur_nodes) + mask = (outdegree != 0) + if np.any(mask): + cur_walk_ids = walk_ids[mask] + outdegree = outdegree[mask] + else: + # stop when all nodes have no successor, wait start next loop to get precesssor + continue + succ = graph.successor(cur_nodes[mask]) + sample_index = np.floor( + np.random.rand(outdegree.shape[0]) * outdegree).astype("int64") + nxt_cur_nodes = cur_nodes + for s, ind, walk_id in zip(succ, sample_index, cur_walk_ids): + walk[walk_id].append(s[ind]) + nxt_cur_nodes[walk_id] = s[ind] + cur_nodes = np.array(nxt_cur_nodes) return walk + + +def pinsage_sample(graph, + nodes, + samples, + top_k=10, + proba=0.5, + norm_bais=1.0, + ignore_edges=set()): + """Implement of graphsage sample. + + Reference paper: . + + Args: + graph: A pgl graph instance + nodes: Sample starting from nodes + samples: A list, number of neighbors in each layer + top_k: select the top_k visit count nodes to construct the edges + proba: the probability to return the origin node + norm_bais: the normlization for the visit count + ignore_edges: list of edge(src, dst) will be ignored. + + Return: + A list of subgraphs + """ + start = time.time() + num_layers = len(samples) + start_nodes = nodes + edges, weights = [], [] + layer_nodes, layer_edges, layer_weights = [], [], [] + ignore_edge_set = set([edge_hash(src, dst) for src, dst in ignore_edges]) + + for layer_idx in reversed(range(num_layers)): + if len(start_nodes) == 0: + layer_nodes = [nodes] + layer_nodes + layer_edges = [edges] + layer_edges + layer_edges_weight = [weights] + layer_weights + continue + walks = random_walk_with_start_prob( + graph, start_nodes, samples[layer_idx], proba=proba) + walks = [walk[1:] for walk in walks] + pred_edges = [] + pred_weights = [] + pred_nodes = [] + for node, walk in zip(start_nodes, walks): + walk_nodes = [] + walk_weights = [] + count_sum = 0 + + for random_walk_node in walk: + if len(ignore_edge_set) > 0 and random_walk_node != node and \ + edge_hash(random_walk_node, node) in ignore_edge_set: + continue + walk_nodes.append(random_walk_node) + unique, counts = np.unique(walk_nodes, return_counts=True) + frequencies = np.asarray((unique, counts)).T + frequencies = frequencies[np.argsort(frequencies[:, 1])] + frequencies = frequencies[-1 * top_k:, :] + for random_walk_node, random_count in zip( + frequencies[:, 0].tolist(), frequencies[:, 1].tolist()): + pred_nodes.append(random_walk_node) + pred_edges.append((random_walk_node, node)) + walk_weights.append(random_count) + count_sum += random_count + count_sum += len(walk_weights) * norm_bais + walk_weights = (np.array(walk_weights) + norm_bais) / (count_sum) + pred_weights.extend(walk_weights.tolist()) + last_node_set = set(nodes) + nodes, edges, weights = flat_node_and_edge([nodes, pred_nodes], \ + [edges, pred_edges], [weights, pred_weights]) + + layer_edges = [edges] + layer_edges + layer_weights = [weights] + layer_weights + layer_nodes = [nodes] + layer_nodes + + start_nodes = list(set(nodes) - last_node_set) 
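In `pinsage_sample`, a node's neighborhood comes from random-walk visit counts rather than raw edges: the `top_k` most visited nodes become its neighbors, and each resulting edge gets a smoothed, normalized visit frequency as its weight. The counting and weighting step in isolation, with a made-up walk (`norm_bias` below corresponds to the `norm_bais` argument in the diff):

```python
import numpy as np

top_k = 3
norm_bias = 1.0  # spelled `norm_bais` in the diff

# Made-up visit record from random walks started at one node.
walk_nodes = [7, 9, 7, 12, 7, 9, 15]

unique, counts = np.unique(walk_nodes, return_counts=True)
freq = np.asarray((unique, counts)).T
freq = freq[np.argsort(freq[:, 1])][-top_k:]   # keep the top_k most visited

neighbors = freq[:, 0]
weights = (freq[:, 1] + norm_bias) / (freq[:, 1].sum() + top_k * norm_bias)
print(neighbors)  # [15  9  7] (or [12  9  7]; ties are broken arbitrarily)
print(weights)    # smoothed visit frequencies, summing to 1
```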
+ start = time.time() + + feed_dict = {} + + subgraphs = [] + + for i in range(num_layers): + edge_feat_dict = { + "weight": np.array( + layer_weights[i], dtype='float32') + } + subgraphs.append( + graph.subgraph( + nodes=layer_nodes[0], + edges=layer_edges[i], + edge_feats=edge_feat_dict)) + subgraphs[i].node_feat["index"] = np.array( + layer_nodes[0], dtype="int64") + + return subgraphs diff --git a/pgl/tests/test_hetergraph.py b/pgl/tests/test_hetergraph.py new file mode 100644 index 0000000000000000000000000000000000000000..7e824b1c53f26719df3645f4bd1e47c07659f33f --- /dev/null +++ b/pgl/tests/test_hetergraph.py @@ -0,0 +1,141 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""test_hetergraph""" + +import time +import unittest +import json +import os + +import numpy as np +from pgl.sample import metapath_randomwalk +from pgl.graph import Graph +from pgl import heter_graph + + +class HeterGraphTest(unittest.TestCase): + """HeterGraph test + """ + + @classmethod + def setUpClass(cls): + np.random.seed(1) + edges = {} + # for test no successor + edges['c2p'] = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6), + (3, 7), (3, 4), (3, 8)] + edges['p2c'] = [(v, u) for u, v in edges['c2p']] + edges['p2a'] = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12), + (6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)] + edges['a2p'] = [(v, u) for u, v in edges['p2a']] + + # for test speed + # edges['c2p'] = [(0, 4), (0, 5), (1, 9), (1,8), (2,8), (2,5), (3,6), (3,7), (3,4), (3,8)] + # edges['p2c'] = [(v,u) for u, v in edges['c2p']] + # edges['p2a'] = [(4,10), (4,11), (4,12), (4,14), (5,13), (6,13), (6,11), (6,14), (7,12), (7,11), (8,14), (9,13)] + # edges['a2p'] = [(v,u) for u, v in edges['p2a']] + + node_types = ['c' for _ in range(4)] + ['p' for _ in range(6) + ] + ['a' for _ in range(5)] + node_types = [(i, t) for i, t in enumerate(node_types)] + + cls.graph = heter_graph.HeterGraph( + num_nodes=len(node_types), edges=edges, node_types=node_types) + + def test_num_nodes_by_type(self): + print() + n_types = {'c': 4, 'p': 6, 'a': 5} + for nt in n_types: + num_nodes = self.graph.num_nodes_by_type(nt) + self.assertEqual(num_nodes, n_types[nt]) + + def test_node_batch_iter(self): + print() + batch_size = 2 + ground = [[4, 5], [6, 7], [8, 9]] + for idx, nodes in enumerate( + self.graph.node_batch_iter( + batch_size=batch_size, shuffle=False, n_type='p')): + self.assertEqual(len(nodes), batch_size) + self.assertListEqual(list(nodes), ground[idx]) + + def test_sample_successor(self): + print() + nodes = [4, 5, 8] + md = 2 + succes = self.graph.sample_successor( + edge_type='p2a', nodes=nodes, max_degree=md, return_eids=False) + self.assertIsInstance(succes, list) + ground = [[10, 11, 12, 14, 13], [], [14]] + for succ, g in zip(succes, ground): + self.assertIsInstance(succ, np.ndarray) + for i in succ: + self.assertIn(i, g) + + nodes = [4] + succes = self.graph.sample_successor( + edge_type='p2a', nodes=nodes, max_degree=md, 
return_eids=False) + self.assertIsInstance(succes, list) + ground = [[10, 11, 12, 14, 13]] + for succ, g in zip(succes, ground): + self.assertIsInstance(succ, np.ndarray) + for i in succ: + self.assertIn(i, g) + + def test_successor(self): + print() + nodes = [4, 5, 8] + e_type = 'p2a' + succes = self.graph.successor( + edge_type=e_type, + nodes=nodes, ) + + self.assertIsInstance(succes, np.ndarray) + ground = [[10, 11, 12, 14, 13], [], [14]] + for succ, g in zip(succes, ground): + self.assertIsInstance(succ, np.ndarray) + self.assertCountEqual(succ, g) + + nodes = [4] + e_type = 'p2a' + succes = self.graph.successor( + edge_type=e_type, + nodes=nodes, ) + + self.assertIsInstance(succes, np.ndarray) + ground = [[10, 11, 12, 14, 13]] + for succ, g in zip(succes, ground): + self.assertIsInstance(succ, np.ndarray) + self.assertCountEqual(succ, g) + + def test_sample_nodes(self): + print() + p_ground = [4, 5, 6, 7, 8, 9] + sample_num = 10 + nodes = self.graph.sample_nodes(sample_num=sample_num, n_type='p') + + self.assertEqual(len(nodes), sample_num) + for n in nodes: + self.assertIn(n, p_ground) + + # test n_type == None + ground = [i for i in range(15)] + nodes = self.graph.sample_nodes(sample_num=sample_num, n_type=None) + self.assertEqual(len(nodes), sample_num) + for n in nodes: + self.assertIn(n, ground) + + +if __name__ == "__main__": + unittest.main() diff --git a/pgl/tests/test_metapath_randomwalk.py b/pgl/tests/test_metapath_randomwalk.py new file mode 100644 index 0000000000000000000000000000000000000000..08fc832bde5afcb5b7c8222afb37cccb16de8439 --- /dev/null +++ b/pgl/tests/test_metapath_randomwalk.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""test_metapath_randomwalk""" +import time +import unittest +import json +import os + +import numpy as np +from pgl.sample import metapath_randomwalk +from pgl.graph import Graph +from pgl import heter_graph + +np.random.seed(1) + + +class MetapathRandomwalkTest(unittest.TestCase): + """metapath_randomwalk test + """ + + def setUp(self): + edges = {} + # for test no successor + edges['c2p'] = [(1, 4), (0, 5), (1, 9), (1, 8), (2, 8), (2, 5), (3, 6), + (3, 7), (3, 4), (3, 8)] + edges['p2c'] = [(v, u) for u, v in edges['c2p']] + edges['p2a'] = [(4, 10), (4, 11), (4, 12), (4, 14), (4, 13), (6, 12), + (6, 11), (6, 14), (7, 12), (7, 11), (8, 14), (9, 10)] + edges['a2p'] = [(v, u) for u, v in edges['p2a']] + + # for test speed + # edges['c2p'] = [(0, 4), (0, 5), (1, 9), (1,8), (2,8), (2,5), (3,6), (3,7), (3,4), (3,8)] + # edges['p2c'] = [(v,u) for u, v in edges['c2p']] + # edges['p2a'] = [(4,10), (4,11), (4,12), (4,14), (5,13), (6,13), (6,11), (6,14), (7,12), (7,11), (8,14), (9,13)] + # edges['a2p'] = [(v,u) for u, v in edges['p2a']] + + self.node_types = ['c' for _ in range(4)] + [ + 'p' for _ in range(6) + ] + ['a' for _ in range(5)] + node_types = [(i, t) for i, t in enumerate(self.node_types)] + + self.graph = heter_graph.HeterGraph( + num_nodes=len(node_types), edges=edges, node_types=node_types) + + def test_metapath_randomwalk(self): + meta_path = 'c2p-p2a-a2p-p2c' + path = ['c', 'p', 'a', 'p', 'c'] + start_nodes = [0, 1, 2, 3] + walk_len = 10 + walks = metapath_randomwalk( + graph=self.graph, + start_nodes=start_nodes, + metapath=meta_path, + walk_length=walk_len) + + self.assertEqual(len(walks), 4) + + for walk in walks: + for i in range(len(walk)): + idx = i % (len(path) - 1) + self.assertEqual(self.node_types[walk[i]], path[idx]) + + +if __name__ == "__main__": + unittest.main() diff --git a/pgl/utils/mp_reader.py b/pgl/utils/mp_reader.py index 9f26a60ddb38ea7b7044dd211cc629ba679a3616..b7aec4d268e13d282c8420d80628f975e5472499 100644 --- a/pgl/utils/mp_reader.py +++ b/pgl/utils/mp_reader.py @@ -25,6 +25,8 @@ except: import numpy as np import time import paddle.fluid as fluid +from queue import Queue +import threading def serialize_data(data): @@ -129,22 +131,39 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000, pipe_size=10): p.start() reader_num = len(readers) - finish_num = 0 conn_to_remove = [] finish_flag = np.zeros(len(conns), dtype="int32") + start = time.time() + + def queue_worker(sub_conn, que): + while True: + buff = sub_conn.recv() + sample = deserialize_data(buff) + if sample is None: + que.put(None) + sub_conn.close() + break + que.put(sample) + + thread_pool = [] + output_queue = Queue(maxsize=reader_num) + for i in range(reader_num): + t = threading.Thread( + target=queue_worker, args=(conns[i], output_queue)) + t.daemon = True + t.start() + thread_pool.append(t) + + finish_num = 0 while finish_num < reader_num: - for conn_id, conn in enumerate(conns): - if finish_flag[conn_id] > 0: - continue - if conn.poll(0.01): - buff = conn.recv() - sample = deserialize_data(buff) - if sample is None: - finish_num += 1 - conn.close() - finish_flag[conn_id] = 1 - else: - yield sample + sample = output_queue.get() + if sample is None: + finish_num += 1 + else: + yield sample + + for thread in thread_pool: + thread.join() if use_pipe: return pipe_reader