diff --git a/examples/distribute_metapath2vec/README.md b/examples/distribute_metapath2vec/README.md index de31aa398d281c4cec9acf351320ea5a84c6b3e6..0cbc6a7ff8ba0ad190d9185618b6d2f49edfb2b1 100644 --- a/examples/distribute_metapath2vec/README.md +++ b/examples/distribute_metapath2vec/README.md @@ -1,18 +1,18 @@ -# Distributed metapath2vec in PGL +# Distributed metapath2vec, metapath2vec++, multi-metapath2vec++ in PGL [metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is a algorithm framework for representation learning in heterogeneous networks which contains multiple types of nodes and links. Given a heterogeneous graph, metapath2vec algorithm first generates meta-path-based random walks and then use skipgram model to train a language model. Based on PGL, we reproduce metapath2vec algorithm in distributed mode. -## Datasets +### Datasets DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T). There are 33791 nodes in this dataset. You can dowload datasets from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding) We use the ```DBLP``` dataset for example. After downloading the dataset, put them, let's say, in ```./data/DBLP/``` . -## Dependencies +### Dependencies - paddlepaddle>=1.6 - pgl>=1.0.0 -## How to run +### How to run Before training, run the below command to do data preprocessing. ```sh python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed @@ -30,11 +30,21 @@ python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_pa ``` +### Model Selection +There are 3 models in this example: ```metapath2vec```, ```metapath2vec++``` and ```multi-metapath2vec++```. You can select different models by modifying ```config.yaml```.
-## Hyperparameters +To run the ```metapath2vec++``` model, set the hyper parameter **neg_sample_type** to **m2v_plus** in ```config.yaml```. + +```multi-metapath2vec++``` means that, instead of a single metapath, you can use several metapaths at the same time to train the model. For example, you might want to use ```c2p-p2a-a2p-p2c``` and ```p2a-a2p``` simultaneously. To do so, set the following hyper parameters in ```config.yaml```. +- **neg_sample_type**: "m2v_plus" +- **walk_mode**: "multi_m2v" +- **meta_path**: "c2p-p2a-a2p-p2c;p2a-a2p" +- **first_node_type**: "c;p" + +### Hyperparameters All the hyper parameters are saved in ```config.yaml``` file. So before training, you can open the config.yaml to modify the hyper parameters as you like. -Some important hyper parameters in config.yaml: +Some important hyper parameters in ```config.yaml```: - **edge_path**: the directory of graph data that you want to load - **lr**: learning rate - **neg_num**: number of negative samples.
diff --git a/examples/distribute_metapath2vec/config.yaml b/examples/distribute_metapath2vec/config.yaml index 30506df33cf16dc8f3864ca23026f464a8dff0ae..db3f38cf84b303603ce5193657f713b734c2a2bf 100644 --- a/examples/distribute_metapath2vec/config.yaml +++ b/examples/distribute_metapath2vec/config.yaml @@ -31,7 +31,7 @@ is_distributed: False # trainging config epochs: 10 optimizer: "sgd" -lr: 1.0 +lr: 0.1 warm_start_from_dir: null walkpath_files: "None" train_files: "None" diff --git a/examples/distribute_metapath2vec/walker.py b/examples/distribute_metapath2vec/walker.py index db340f9d1f4880ca54af1683ec1a368735b4ebe4..65037665e76a66772329c584c843c2f1fbd8535a 100644 --- a/examples/distribute_metapath2vec/walker.py +++ b/examples/distribute_metapath2vec/walker.py @@ -87,9 +87,12 @@ class NodeGenerator(object): idx = cc % num_n_type n_type = n_type_list[idx] try: - nodes = node_generators[n_type].next() + nodes = next(node_generators[n_type]) except StopIteration as e: - log.info("exception when iteration") + log.info("node type of %s iteration finished in one epoch" % + (n_type)) + node_generators[n_type] = \ + self.graph.node_batch_iter(self.batch_size, n_type=n_type) break yield (nodes, idx) cc += 1 diff --git a/examples/erniesage/config/erniesage_v1_cpu.yaml b/examples/erniesage/config/erniesage_v1_cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f7e5eddc0b6bda5f8c3377c7320429d16b0718b --- /dev/null +++ b/examples/erniesage/config/erniesage_v1_cpu.yaml @@ -0,0 +1,56 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "cpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 2 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ +input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV1" +layer_type: 
"graphsage_sum" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + use_task_id: false + use_fp16: false diff --git a/examples/erniesage/config/erniesage_v1_gpu.yaml b/examples/erniesage/config/erniesage_v1_gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b883fe3fa06332cf196d5142c40acaee8b98259 --- /dev/null +++ b/examples/erniesage/config/erniesage_v1_gpu.yaml @@ -0,0 +1,56 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "gpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 32 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ +input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV1" +layer_type: "graphsage_sum" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + 
use_task_id: false + use_fp16: false diff --git a/examples/erniesage/config/erniesage_v2_cpu.yaml b/examples/erniesage/config/erniesage_v2_cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d39e2442a71c9400a29ddd365a7fc3e2ad126731 --- /dev/null +++ b/examples/erniesage/config/erniesage_v2_cpu.yaml @@ -0,0 +1,55 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "cpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 2 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ +input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV2" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + use_task_id: false + use_fp16: false diff --git a/examples/erniesage/config/erniesage_v2_gpu.yaml b/examples/erniesage/config/erniesage_v2_gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a346808c8c1f7a5fe36544c6d2dc06eda98e0ed8 --- /dev/null +++ b/examples/erniesage/config/erniesage_v2_gpu.yaml @@ -0,0 +1,55 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "gpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 32 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ 
+input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV2" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + use_task_id: false + use_fp16: false diff --git a/examples/erniesage/config/erniesage_v3_cpu.yaml b/examples/erniesage/config/erniesage_v3_cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2172a26133c9718358163f4495133720dbeb9eff --- /dev/null +++ b/examples/erniesage/config/erniesage_v3_cpu.yaml @@ -0,0 +1,55 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "cpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 2 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ +input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV3" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + 
max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + use_task_id: false + use_fp16: false diff --git a/examples/erniesage/config/erniesage_v3_gpu.yaml b/examples/erniesage/config/erniesage_v3_gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e53ab33c41f8b8760e75d602bf1b8ed9f1735fb8 --- /dev/null +++ b/examples/erniesage/config/erniesage_v3_gpu.yaml @@ -0,0 +1,55 @@ +# Global Enviroment Settings +# +# trainer config ------ +learner_type: "gpu" +optimizer_type: "adam" +lr: 0.00005 +batch_size: 32 +CPU_NUM: 10 +epoch: 20 +log_per_step: 1 +save_per_step: 100 +output_path: "./output" +ckpt_path: "./ernie_base_ckpt" + +# data config ------ +input_data: "./data.txt" +graph_path: "./workdir" +sample_workers: 1 +use_pyreader: true +input_type: "text" + +# model config ------ +samples: [10] +model_type: "ErnieSageModelV3" + +max_seqlen: 40 + +num_layers: 1 +hidden_size: 128 +final_fc: true +final_l2_norm: true +loss_type: "hinge" +margin: 0.3 + +# infer config ------ +infer_model: "./output/last" +infer_batch_size: 128 + +# ernie config ------ +encoding: "utf8" +ernie_vocab_file: "./vocab.txt" +ernie_config: + attention_probs_dropout_prob: 0.1 + hidden_act: "relu" + hidden_dropout_prob: 0.1 + hidden_size: 768 + initializer_range: 0.02 + max_position_embeddings: 513 + num_attention_heads: 12 + num_hidden_layers: 12 + sent_type_vocab_size: 4 + task_type_vocab_size: 3 + vocab_size: 18000 + use_task_id: false + use_fp16: false diff --git a/examples/erniesage/dataset/__init__.py b/examples/erniesage/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/erniesage/dataset/base_dataset.py b/examples/erniesage/dataset/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3b29b5761769e9be9e62fb4536e41d43f9c9abb4 --- /dev/null +++ 
b/examples/erniesage/dataset/base_dataset.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Base DataLoader
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+
+import os
+import sys
+import six
+from io import open
+from collections import namedtuple
+import numpy as np
+import tqdm
+import paddle
+from pgl.utils import mp_reader
+import collections
+import time
+from pgl.utils.logger import log
+import traceback
+
+try:
+    from collections.abc import Iterable
+except ImportError:  # Python 2 has no collections.abc
+    from collections import Iterable
+
+
+# Re-wrap the standard streams as UTF-8 text streams on Python 3.
+# NOTE(review): this is a process-wide side effect of importing this module.
+if six.PY3:
+    import io
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+
+
+def batch_iter(data, perm, batch_size, fid, num_workers):
+    """Yield this worker's share of batches, following the order in ``perm``.
+
+    ``perm`` is cut into consecutive slices of ``batch_size`` indices and the
+    slices are numbered from 1. Only slices whose number modulo
+    ``num_workers`` equals ``fid`` are yielded, as ``data[slice]``.
+    """
+    size = len(data)
+    start = 0
+    cc = 0
+    while start < size:
+        index = perm[start:start + batch_size]
+        start += batch_size
+        cc += 1
+        if cc % num_workers != fid:
+            continue
+        yield data[index]
+
+
+def scan_batch_iter(data, batch_size, fid, num_workers):
+    """Yield this worker's share of ``data.scan()``, grouped into batches.
+
+    Examples are numbered from 1; only those whose number modulo
+    ``num_workers`` equals ``fid`` are kept. Batches hold ``batch_size``
+    examples, except possibly the last one.
+    """
+    batch = []
+    cc = 0
+    for line_example in data.scan():
+        cc += 1
+        if cc % num_workers != fid:
+            continue
+        batch.append(line_example)
+        if len(batch) == batch_size:
+            yield batch
+            batch = []
+
+    if len(batch) > 0:
+        yield batch
+
+
+class BaseDataGenerator(object):
+    """Base data generator.
+
+    Subclasses fill ``line_examples`` and implement ``batch_fn``.
+    """
+
+    def __init__(self, buf_size, batch_size, num_workers, shuffle=True):
+        """Store the batching/worker settings; ``line_examples`` starts empty."""
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+        self.line_examples = []
+        self.buf_size = buf_size
+        self.shuffle = shuffle
+
+    def batch_fn(self, batch_examples):
+        """Turn a list of examples into one batch; must be overridden."""
+        raise NotImplementedError("No defined Batch Fn")
+
+    def batch_iter(self, fid, perm):
+        """Iterate worker ``fid``'s batches: permuted if ``shuffle``, else scanned."""
+        if self.shuffle:
+            for batch in batch_iter(self, perm, self.batch_size, fid, self.num_workers):
+                yield batch
+        else:
+            for batch in scan_batch_iter(self, self.batch_size, fid, self.num_workers):
+                yield batch
+
+    def __len__(self):
+        return len(self.line_examples)
+
+    def __getitem__(self, idx):
+        # An iterable of indices (e.g. a slice of the permutation array)
+        # selects a list of examples; anything else is a single index.
+        if isinstance(idx, Iterable):
+            return [self[bidx] for bidx in idx]
+        return self.line_examples[idx]
+
+    def generator(self):
+        """Yield the non-``None`` results of ``batch_fn`` from all workers.
+
+        Batches whose ``batch_fn`` raises are logged and skipped.
+        """
+
+        def worker(filter_id, perm):
+            """Build the reader function for worker ``filter_id``."""
+
+            def func_run():
+                """Yield this worker's batch dicts."""
+                pid = os.getpid()
+                # Seed from pid and wall-clock time (presumably so that forked workers diverge).
+                np.random.seed(pid + int(time.time()))
+                for batch_examples in self.batch_iter(filter_id, perm):
+                    try:
+                        batch_dict = self.batch_fn(batch_examples)
+                    except Exception as e:
+                        traceback.print_exc()
+                        log.info(traceback.format_exc())
+                        log.info(str(e))
+                        continue
+
+                    if batch_dict is None:
+                        continue
+                    yield batch_dict
+
+            return func_run
+
+        # consume a seed
+        np.random.rand()
+
+        if self.shuffle:
+            perm = np.arange(0, len(self))
+            np.random.shuffle(perm)
+        else:
+            perm = None
+
+        if self.num_workers == 1:
+            r = paddle.reader.buffered(worker(0, perm), self.buf_size)
+        else:
+            worker_pool = [worker(wid, perm) for wid in range(self.num_workers)]
+            # Separate name: do not rebind the local ``worker`` factory.
+            reader = mp_reader.multiprocess_reader(
+                worker_pool, use_pipe=True, queue_size=1000)
+            r = paddle.reader.buffered(reader, self.buf_size)
+
+        for batch in r():
+            yield batch
+
+    def scan(self):
+        """Yield the examples in storage order."""
+        for line_example in self.line_examples:
+            yield line_example
diff --git a/examples/erniesage/dataset/graph_reader.py
b/examples/erniesage/dataset/graph_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d029a98a3ac0f482cdf5d4cd6591967ce86495
--- /dev/null
+++ b/examples/erniesage/dataset/graph_reader.py
@@ -0,0 +1,126 @@
+"""Graph Dataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+
+import os
+import pgl
+import sys
+
+import numpy as np
+
+from pgl.utils.logger import log
+from dataset.base_dataset import BaseDataGenerator
+from pgl.sample import alias_sample
+from pgl.sample import pinsage_sample
+from pgl.sample import graphsage_sample
+from pgl.sample import edge_hash
+
+
+class GraphGenerator(BaseDataGenerator):
+    """Generate feed dicts of sampled subgraphs for (src, dst[, neg]) examples."""
+
+    def __init__(self, graph_wrappers, data, batch_size, samples,
+                 num_workers, feed_name_list, use_pyreader,
+                 phase, graph_data_path, shuffle=True, buf_size=1000):
+
+        super(GraphGenerator, self).__init__(
+            buf_size=buf_size,
+            num_workers=num_workers,
+            batch_size=batch_size, shuffle=shuffle)
+        # For iteration
+        self.line_examples = data
+
+        self.graph_wrappers = graph_wrappers
+        self.samples = samples
+        self.feed_name_list = feed_name_list
+        self.use_pyreader = use_pyreader
+        self.phase = phase
+        self.load_graph(graph_data_path)
+        self.num_layers = len(graph_wrappers)
+
+    def load_graph(self, graph_data_path):
+        """Open the memory-mapped graph and the alias/events/term_ids arrays."""
+        self.graph = pgl.graph.MemmapGraph(graph_data_path)
+        self.alias = np.load(os.path.join(graph_data_path, "alias.npy"), mmap_mode="r")
+        self.events = np.load(os.path.join(graph_data_path, "events.npy"), mmap_mode="r")
+        self.term_ids = np.load(os.path.join(graph_data_path, "term_ids.npy"), mmap_mode="r")
+
+    def batch_fn(self, batch_ex):
+        """Build the feed dict for one batch of examples.
+
+        batch_ex is a list of (src, dst) or (src, dst, neg) tuples:
+            [(src, dst, neg), (src, dst, neg), ...]
+
+        Returns None for a short batch in the "train" phase.
+        """
+        batch_src = []
+        batch_dst = []
+        batch_neg = []
+        for batch in batch_ex:
+            batch_src.append(batch[0])
+            batch_dst.append(batch[1])
+            if len(batch) == 3:  # default neg samples
+                batch_neg.append(batch[2])
+
+        # Short (trailing) batches are dropped in training only.
+        if len(batch_src) != self.batch_size and self.phase == "train":
+            return None
+
+        if len(batch_neg) > 0:
+            batch_neg = np.unique(np.concatenate(batch_neg))
+        batch_src = np.array(batch_src, dtype="int64")
+        batch_dst = np.array(batch_dst, dtype="int64")
+
+        sampled_batch_neg = alias_sample(batch_dst.shape, self.alias, self.events)
+
+        if len(batch_neg) > 0:
+            batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
+        else:
+            batch_neg = sampled_batch_neg
+
+        # NOTE: no edge is ignored in any phase (an empty set is always passed).
+        ignore_edges = set()
+
+        # Negatives are not fed to the model; they only enlarge the set of
+        # nodes the subgraphs are sampled around.
+        nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
+        subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges)
+        feed_dict = {}
+        for i in range(self.num_layers):
+            feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))
+
+        # only reindex from first subgraph
+        sub_src_idx = subgraphs[0].reindex_from_parrent_nodes(batch_src)
+        sub_dst_idx = subgraphs[0].reindex_from_parrent_nodes(batch_dst)
+
+        feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
+        feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
+        feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]]
+        return feed_dict
+
+    def __call__(self):
+        """Return a fresh batch generator (same as ``generator()``)."""
+        return self.generator()
+
+    def generator(self):
+        """Yield batches as lists ordered by ``feed_name_list`` or as dicts.
+
+        Any exception is logged and ends the iteration.
+        """
+        try:
+            for feed_dict in super(GraphGenerator, self).generator():
+                if self.use_pyreader:
+                    yield [feed_dict[name] for name in self.feed_name_list]
+                else:
+                    yield feed_dict
+
+        except Exception as e:
+            log.exception(e)
+
+
+
diff --git a/examples/erniesage/docs/source/_static/ernie_aggregator.png b/examples/erniesage/docs/source/_static/ernie_aggregator.png
new file mode 100644
index 0000000000000000000000000000000000000000..206a0673d76e97bcc6a47108df683583e4a0b240
Binary
files /dev/null and b/examples/erniesage/docs/source/_static/ernie_aggregator.png differ
diff --git a/examples/erniesage/docs/source/_static/text_graph.png b/examples/erniesage/docs/source/_static/text_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..26f89eb124f272acee2b097f39cb310416de45e1
Binary files /dev/null and b/examples/erniesage/docs/source/_static/text_graph.png differ
diff --git a/examples/erniesage/infer.py b/examples/erniesage/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..20735ddc487e216c309f9bcfccc8a8ed3a602873
--- /dev/null
+++ b/examples/erniesage/infer.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+import argparse
+import pickle
+import time
+import glob
+import os
+import io
+import traceback
+import pickle as pkl
+role = os.getenv("TRAINING_ROLE", "TRAINER")
+
+import numpy as np
+import yaml
+from easydict import EasyDict as edict
+import pgl
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+import paddle
+import paddle.fluid as F
+
+from models.model_factory import Model
+from dataset.graph_reader import GraphGenerator
+
+
+class PredictData(object):
+    """Node ids assigned to this trainer, served as [id, id] pairs."""
+
+    def __init__(self, num_nodes):
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        # Trainer i takes nodes i, i + count, i + 2 * count, ...
+        self.data = np.arange(trainer_id, num_nodes, trainer_count)
+
+    def __getitem__(self, index):
+        """Return the node id twice, so it fills both the src and the dst slot."""
+        return [self.data[index], self.data[index]]
+
+def tostr(data_array):
+    """Format the values with 5 decimals, separated by single spaces."""
+    return " ".join(["%.5lf" % d for d in data_array])
+
+def run_predict(py_reader,
+                exe,
+                program,
+                model_dict,
+                log_per_step=1,
+                args=None):
+    """Run inference and write one tab-separated "node, embedding" line per source node.
+
+    Output goes to ``<args.output_path>/part-<PADDLE_TRAINER_ID>``. ``args`` is
+    required despite its default: its ``input_type``, ``graph_path`` and
+    ``output_path`` attributes are read.
+    """
+    id2str = None
+    if args.input_type == "text":
+        id2str = np.load(os.path.join(args.graph_path, "id2str.npy"), mmap_mode="r")
+
+    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+    if not os.path.isdir(args.output_path):
+        try:
+            os.makedirs(args.output_path)
+        except OSError:
+            # Another trainer may have created it in the meantime.
+            if not os.path.isdir(args.output_path):
+                raise
+
+    out_path = "%s/part-%s" % (args.output_path, trainer_id)
+    batch = 0
+    # ``with`` closes the file even if a batch fails mid-way.
+    with io.open(out_path, "w", encoding="utf8") as fout:
+        for batch_feed_dict in py_reader():
+            batch += 1
+            batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run(
+                program,
+                feed=batch_feed_dict,
+                fetch_list=model_dict.outputs)
+
+            if batch % log_per_step == 0:
+                log.info("Predict %s finished" % batch)
+
+            for ufs, _, sri in zip(batch_usr_feat, batch_ad_feat, batch_src_real_index):
+                if args.input_type == "text":
+                    sri = id2str[int(sri)]
+                line = "{}\t{}\n".format(sri, tostr(ufs))
+                fout.write(line)
+
+def _warmstart(exe, program, path='params'):
+    """Load the persistable variables of ``program`` that have a file in ``path``."""
+    def _existed_persitables(var):
+        if not F.io.is_persistable(var):
+            return False
+        param_path = os.path.join(path, var.name)
+        exists = os.path.exists(param_path)
+        log.info("Loading parameter: {} persistable: {} exists: {}".format(
+            param_path,
+            F.io.is_persistable(var),
+            exists,
+        ))
+        return exists
+    F.io.load_vars(
+        exe,
+        path,
+        main_program=program,
+        predicate=_existed_persitables
+    )
+
+def main(config):
+    """Build the model, load ``config.infer_model`` and dump embeddings for this trainer's nodes."""
+    model = Model.factory(config)
+
+    if config.learner_type == "cpu":
+        place = F.CPUPlace()
+    elif config.learner_type == "gpu":
+        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = F.CUDAPlace(gpu_id)
+    else:
+        raise ValueError("unknown learner_type: %s" % config.learner_type)
+
+    exe = F.Executor(place)
+
+    val_program = F.default_main_program().clone(for_test=True)
+    exe.run(F.default_startup_program())
+    _warmstart(exe, F.default_startup_program(), path=config.infer_model)
+
+    num_threads = int(os.getenv("CPU_NUM", 1))
+
+    exec_strategy = F.ExecutionStrategy()
+    exec_strategy.num_threads = num_threads
+    build_strategy = F.BuildStrategy()
+    build_strategy.enable_inplace = True
+    build_strategy.remove_unnecessary_lock = False
+    build_strategy.memory_optimize = False
+
+    if num_threads > 1:
+        build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
+
+    val_compiled_prog = F.compiler.CompiledProgram(
+        val_program).with_data_parallel(
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
+
+    num_nodes = int(np.load(os.path.join(config.graph_path, "num_nodes.npy")))
+
+    predict_data = PredictData(num_nodes)
+
+    predict_iter = GraphGenerator(
+        graph_wrappers=model.graph_wrappers,
+        batch_size=config.infer_batch_size,
+        data=predict_data,
+        samples=config.samples,
+        num_workers=config.sample_workers,
+        feed_name_list=[var.name for var in model.feed_list],
+        use_pyreader=config.use_pyreader,
+        phase="predict",
+        graph_data_path=config.graph_path,
+        shuffle=False)
+
+    # learner_type was validated above; the GPU branch reuses ``place``.
+    if config.learner_type == "cpu":
+        places = F.cpu_places()
+    else:
+        places = place
+    model.data_loader.decorate_batch_generator(predict_iter, places=places)
+
+    run_predict(model.data_loader,
+                program=val_compiled_prog,
+                exe=exe,
+                model_dict=model,
+                args=config)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='main')
+    parser.add_argument("--conf", type=str, default="./config.yaml")
+    args = parser.parse_args()
+    with open(args.conf) as conf_file:
+        config = edict(yaml.load(conf_file, Loader=yaml.FullLoader))
+    print(config)
+    main(config)
diff --git a/examples/erniesage/job.sh b/examples/erniesage/job.sh new file mode 100644 index 0000000000000000000000000000000000000000..77739df75a3250b1f4d096a50c0841cac5b83a6a --- /dev/null +++ b/examples/erniesage/job.sh @@ -0,0 +1,45 @@ + +unset http_proxy https_proxy +set -x +mode=${1:-local} +config=${2:-"./config.yaml"} + +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i 1: + build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce + + log.info("start build compile program...") + compiled_prog = F.compiler.CompiledProgram(tfleet.main_program + ).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + +
return compiled_prog
+
+
+class CollectiveLearner(Learner):
+    """Learner built on ``cfleet`` with a collective role maker; runs on one CUDA device."""
+
+    def __init__(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        cfleet.init(role)
+
+    def optimize(self, loss, optimizer_type, lr):
+        """Minimize ``loss`` with Adam wrapped by ``cfleet.distributed_optimizer``.
+
+        NOTE(review): ``optimizer_type`` is accepted but never read here; Adam
+        is always used.
+        """
+        optimizer = F.optimizer.Adam(learning_rate=lr)
+        dist_strategy = DistributedStrategy()
+        optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
+        optimizer.minimize(loss, F.default_startup_program())
+
+    def build(self, model, data_gen, config):
+        """Set up optimizer, executor, warm start and data loader for ``model``.
+
+        The device is ``CUDAPlace(FLAGS_selected_gpus)`` (default 0); the
+        results are stored on ``self`` (program, exe, fleet, config, model).
+        """
+        self.optimize(model.loss, config.optimizer_type, config.lr)
+        self.program = cfleet.main_program
+        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = F.CUDAPlace(gpu_id)
+        self.exe = F.Executor(place)
+        self.exe.run(F.default_startup_program())
+        self.warmstart(F.default_startup_program(), config.ckpt_path)
+        self.fleet = cfleet
+        model.data_loader.decorate_batch_generator(
+            data_gen, places=place)
+        self.config = config
+        self.model = model
diff --git a/examples/erniesage/local_run.sh b/examples/erniesage/local_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b76d1c8288472d5c84abfd8585032c71ace8dab --- /dev/null +++ b/examples/erniesage/local_run.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -x +config=${1:-"./config.yaml"} +unset http_proxy https_proxy + +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i $BASE/pserver.$i.log & + echo $! >> job_id + done + sleep 3s + for((j=0;j<${PADDLE_TRAINERS_NUM};j++)) + do + echo "start ps work: ${j}" + TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \ + echo $!
>> job_id + done +} + +collective_local_train(){ + export PATH=./python27-gcc482-gpu/bin/:$PATH + echo `which python` + python -m paddle.distributed.launch train.py --conf $config + python -m paddle.distributed.launch infer.py --conf $config +} + +eval $(parse_yaml $config) +unalias python + +python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \ + -l $max_seqlen --vocab_file $ernie_vocab_file + +if [[ $learner_type == "cpu" ]];then + transpiler_local_train +fi +if [[ $learner_type == "gpu" ]];then + collective_local_train +fi diff --git a/examples/erniesage/models/__init__.py b/examples/erniesage/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/erniesage/models/base.py b/examples/erniesage/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c910c714db4269bc9b3bac391e2adf2699f476c6 --- /dev/null +++ b/examples/erniesage/models/base.py @@ -0,0 +1,202 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import time
+import glob
+import os
+
+import numpy as np
+
+import pgl
+import paddle.fluid as F
+import paddle.fluid.layers as L
+
+from models import message_passing
+
+def get_layer(layer_type, gw, feature, hidden_size, act, initializer, learning_rate, name, is_test=False):
+    """Apply the ``message_passing`` layer named ``layer_type``; ``is_test`` is unused."""
+    return getattr(message_passing, layer_type)(gw, feature, hidden_size, act, initializer, learning_rate, name)
+
+
+class BaseGraphWrapperBuilder(object):
+    """Build one ``GraphWrapper`` per layer; all share the same feature info."""
+    def __init__(self, config):
+        self.config = config
+        self.node_feature_info = []
+        self.edge_feature_info = []
+
+    def __call__(self):
+        place = F.CPUPlace()
+        graph_wrappers = []
+        for i in range(self.config.num_layers):
+            graph_wrappers.append(
+                pgl.graph_wrapper.GraphWrapper(
+                    "layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
+        return graph_wrappers
+
+
+class GraphsageGraphWrapperBuilder(BaseGraphWrapperBuilder):
+    """Wrapper builder whose nodes carry an int64 ``index`` feature."""
+    def __init__(self, config):
+        super(GraphsageGraphWrapperBuilder, self).__init__(config)
+        self.node_feature_info.append(('index', [None], np.dtype('int64')))
+
+
+class BaseGNNModel(object):
+    """Assemble graph wrappers, network, feed list, data loader and loss from ``config``."""
+    def __init__(self, config):
+        self.config = config
+        self.graph_wrapper_builder = self.gen_graph_wrapper_builder(config)
+        self.net_fn = self.gen_net_fn(config)
+        self.feed_list_builder = self.gen_feed_list_builder(config)
+        self.data_loader_builder = self.gen_data_loader_builder(config)
+        self.loss_fn = self.gen_loss_fn(config)
+        self.build()
+
+    def gen_graph_wrapper_builder(self, config):
+        return GraphsageGraphWrapperBuilder(config)
+
+    def gen_net_fn(self, config):
+        return BaseNet(config)
+
+    def gen_feed_list_builder(self, config):
+        return BaseFeedListBuilder(config)
+
+    def gen_data_loader_builder(self, config):
+        return BaseDataLoaderBuilder(config)
+
+    def gen_loss_fn(self, config):
+        return BaseLoss(config)
+
+    def build(self):
+        """Run the builders in dependency order and store their results on ``self``."""
+        self.graph_wrappers = self.graph_wrapper_builder()
+        self.inputs, self.outputs = self.net_fn(self.graph_wrappers)
+        self.feed_list = self.feed_list_builder(self.inputs, self.graph_wrappers)
+        self.data_loader = self.data_loader_builder(self.feed_list)
+        self.loss = self.loss_fn(self.outputs)
+
+class BaseFeedListBuilder(object):
+    """Concatenate every wrapper's ``holder_list`` with the model inputs."""
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, inputs, graph_wrappers):
+        feed_list = []
+        for gw in graph_wrappers:
+            feed_list.extend(gw.holder_list)
+        feed_list.extend(inputs)
+        return feed_list
+
+
+class BaseDataLoaderBuilder(object):
+    """Create an iterable, double-buffered ``PyReader`` over the feed list."""
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, feed_list):
+        return F.io.PyReader(
+            feed_list=feed_list, capacity=20, use_double_buffer=True, iterable=True)
+
+
+class BaseNet(object):
+    """Embedding lookup followed by ``num_layers`` message-passing layers."""
+    def __init__(self, config):
+        self.config = config
+
+    def take_final_feature(self, feature, index, name):
+        """Gather rows ``index`` of ``feature``, then optionally apply fc and l2-normalize."""
+        feat = L.gather(feature, index, overwrite=False)
+
+        if self.config.final_fc:
+            feat = L.fc(feat,
+                        self.config.hidden_size,
+                        param_attr=F.ParamAttr(name=name + '_w'),
+                        bias_attr=F.ParamAttr(name=name + '_b'))
+
+        if self.config.final_l2_norm:
+            feat = L.l2_normalize(feat, axis=1)
+        return feat
+
+    def build_inputs(self):
+        user_index = L.data("user_index", shape=[None], dtype="int64", append_batch_size=False)
+        item_index = L.data("item_index", shape=[None], dtype="int64", append_batch_size=False)
+        return [user_index, item_index]
+
+    def build_embedding(self, graph_wrappers, inputs=None):
+        num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
+        # NOTE(review): reads ``trainer_type``; the erniesage_*.yaml configs in
+        # this change define ``learner_type`` instead - confirm the key exists.
+        is_sparse = self.config.trainer_type == "Transpiler"
+
+        return L.embedding(
+            input=L.reshape(graph_wrappers[0].node_feat['index'], [-1, 1]),
+            size=[num_embed, self.config.hidden_size],
+            is_sparse=is_sparse,
+            param_attr=F.ParamAttr(name="node_embedding", initializer=F.initializer.Uniform(
+                low=-1. / self.config.hidden_size,
+                high=1. / self.config.hidden_size)))
+
+    def gnn_layers(self, graph_wrappers, feature):
+        features = [feature]
+
+        initializer = None
+        # NOTE(review): ratio of config.lr to 0.001, passed on as the layers' learning_rate.
+        fc_lr = self.config.lr / 0.001
+
+        for i in range(self.config.num_layers):
+            # No activation after the last layer.
+            act = None if i == self.config.num_layers - 1 else "leaky_relu"
+
+            feature = get_layer(
+                self.config.layer_type, graph_wrappers[i], feature,
+                self.config.hidden_size, act, initializer,
+                learning_rate=fc_lr,
+                name="%s_%s" % (self.config.layer_type, i))
+            features.append(feature)
+        return features
+
+    def __call__(self, graph_wrappers):
+        """Return ``(inputs, outputs)``; outputs are [user feat, item feat, source node index]."""
+        inputs = self.build_inputs()
+        feature = self.build_embedding(graph_wrappers, inputs)
+        features = self.gnn_layers(graph_wrappers, feature)
+        outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs]
+        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
+        outputs.append(src_real_index)
+        return inputs, outputs
+
+class BaseLoss(object):
+    """Loss over the (user, item) features selected by ``config.loss_type``."""
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, outputs):
+        """Return the hinge loss with in-batch negatives; "softmax" is not implemented."""
+        user_feat, item_feat = outputs[0], outputs[1]
+        loss_type = self.config.loss_type
+        if loss_type == "hinge":
+            pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
+            neg = L.matmul(user_feat, item_feat, transpose_y=True) # [B, B]
+            loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
+        elif loss_type == "softmax":
+            # TODO: not implemented yet; sketch of the intended computation:
+            # pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
+            # neg = L.matmul(user_feat, neg_feat, transpose_y=True) # [B, B]
+            # logits = L.concat([pos, neg], -1) # [B, 1+B]
+            # labels = L.fill_constant_batch_size_like(logits, [-1, 1], "int64", 0)
+            # loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, labels))
+            raise NotImplementedError("softmax loss is not implemented")
+        else:
+            raise ValueError("unknown loss_type: %s" % loss_type)
+        return loss
diff --git a/examples/erniesage/models/ernie.py b/examples/erniesage/models/ernie.py
new file mode 100644
index
0000000000000000000000000000000000000000..6b53ebd439713f7f249c21ad96cafa0b2ae84f06 --- /dev/null +++ b/examples/erniesage/models/ernie.py @@ -0,0 +1,40 @@ +"""Ernie +""" +from models.base import BaseNet, BaseGNNModel + +class Ernie(BaseNet): + + def build_inputs(self): + inputs = super(Ernie, self).build_inputs() + term_ids = L.data( + "term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False) + return inputs + [term_ids] + + def build_embedding(self, graph_wrappers, term_ids): + term_ids = L.unsqueeze(term_ids, [-1]) + ernie_config = self.config.ernie_config + ernie = ErnieModel( + src_ids=term_ids, + sentence_ids=L.zeros_like(term_ids), + task_ids=None, + config=ernie_config, + use_fp16=False, + name="student_") + feature = ernie.get_pooled_output() + return feature + + def __call__(self, graph_wrappers): + inputs = self.build_inputs() + feature = self.build_embedding(graph_wrappers, inputs[-1]) + features = [feature] + outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]] + src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0]) + outputs.append(src_real_index) + return inputs, outputs + + +class ErnieModel(BaseGNNModel): + def gen_net_fn(self, config): + return Ernie(config) + + diff --git a/examples/erniesage/models/ernie_model/__init__.py b/examples/erniesage/models/ernie_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/erniesage/models/ernie_model/ernie.py b/examples/erniesage/models/ernie_model/ernie.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba4f9bbd82f3889e66a8ff16aa7f1eee27abc79 --- /dev/null +++ b/examples/erniesage/models/ernie_model/ernie.py @@ -0,0 +1,399 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ernie model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import json +import six +import logging +import paddle.fluid as fluid +import paddle.fluid.layers as L + +from io import open + +from models.ernie_model.transformer_encoder import encoder, pre_process_layer +from models.ernie_model.transformer_encoder import graph_encoder + +log = logging.getLogger(__name__) + + +class ErnieConfig(object): + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path, 'r', encoding='utf8') as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict.get(key, None) + + def print_config(self): + for arg, value in sorted(six.iteritems(self._config_dict)): + log.info('%s: %s' % (arg, value)) + log.info('------------------------------------------------') + + +class ErnieModel(object): + def __init__(self, + src_ids, + sentence_ids, + task_ids=None, + config=None, + weight_sharing=True, + use_fp16=False, + name=""): + + self._set_config(config, name, weight_sharing) + input_mask = self._build_input_mask(src_ids) + position_ids = 
self._build_position_ids(src_ids) + self._build_model(src_ids, position_ids, sentence_ids, task_ids, + input_mask) + self._debug_summary(input_mask) + + def _debug_summary(self, input_mask): + #histogram + seqlen_before_pad = L.cast( + L.reduce_sum( + input_mask, dim=1), dtype='float32') + seqlen_after_pad = L.reduce_sum( + L.cast( + L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1) + pad_num = seqlen_after_pad - seqlen_before_pad + pad_rate = pad_num / seqlen_after_pad + + def _build_position_ids(self, src_ids): + d_shape = L.shape(src_ids) + d_seqlen = d_shape[1] + d_batch = d_shape[0] + position_ids = L.reshape( + L.range( + 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], + inplace=True) + position_ids = L.expand(position_ids, [d_batch, 1, 1]) + position_ids = L.cast(position_ids, 'int64') + position_ids.stop_gradient = True + return position_ids + + def _build_input_mask(self, src_ids): + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.logical_not(L.equal(src_ids, + zero)) # assume pad id == 0 + input_mask = L.cast(input_mask, 'float') + input_mask.stop_gradient = True + return input_mask + + def _set_config(self, config, name, weight_sharing): + self._emb_size = config['hidden_size'] + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._voc_size = config['vocab_size'] + self._max_position_seq_len = config['max_position_embeddings'] + if config.get('sent_type_vocab_size'): + self._sent_types = config['sent_type_vocab_size'] + else: + self._sent_types = config['type_vocab_size'] + + self._use_task_id = config['use_task_id'] + if self._use_task_id: + self._task_types = config['task_type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._postprocess_cmd = config.get('postprocess_cmd', 'dan') + self._preprocess_cmd = config.get('preprocess_cmd', '') + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] 
+ self._weight_sharing = weight_sharing + self.name = name + + self._word_emb_name = self.name + "word_embedding" + self._pos_emb_name = self.name + "pos_embedding" + self._sent_emb_name = self.name + "sent_embedding" + self._task_emb_name = self.name + "task_embedding" + self._dtype = "float16" if config['use_fp16'] else "float32" + self._emb_dtype = "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=config['initializer_range']) + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, + input_mask): + + emb_out = self._build_embedding(src_ids, position_ids, sentence_ids, + task_ids) + self.input_mask = input_mask + self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder( + enc_input=emb_out, + input_mask=input_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd=self._preprocess_cmd, + postprocess_cmd=self._postprocess_cmd, + param_initializer=self._param_initializer, + name=self.name + 'encoder') + if self._dtype == "float16": + self._enc_out = fluid.layers.cast( + x=self._enc_out, dtype=self._emb_dtype) + + def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids): + # padding id in vocabulary must be set to 0 + emb_out = fluid.layers.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + + position_emb_out = fluid.layers.embedding( + input=position_ids, + size=[self._max_position_seq_len, 
self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.layers.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + self.all_emb = [emb_out, position_emb_out, sent_emb_out] + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + if self._use_task_id: + task_emb_out = fluid.layers.embedding( + task_ids, + size=[self._task_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._task_emb_name, + initializer=self._param_initializer)) + + emb_out = emb_out + task_emb_out + + emb_out = pre_process_layer( + emb_out, + 'nd', + self._prepostprocess_dropout, + name=self.name + 'pre_encoder') + + if self._dtype == "float16": + emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype) + return emb_out + + def get_sequence_output(self): + return self._enc_out + + def get_pooled_output(self): + """Get the first feature of each sequence for classification""" + next_sent_feat = self._enc_out[:, 0, :] + #next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name=self.name + "pooled_fc.w_0", + initializer=self._param_initializer), + bias_attr=self.name + "pooled_fc.b_0") + return next_sent_feat + + def get_lm_output(self, mask_label, mask_pos): + """Get the loss & accuracy for pretraining""" + + mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') + + # extract the first token feature in each sentence + self.next_sent_feat = self.get_pooled_output() + reshaped_emb_out = fluid.layers.reshape( + x=self._enc_out, shape=[-1, self._emb_size]) + # extract masked tokens' feature + mask_feat = 
fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) + + # transform: fc + mask_trans_feat = fluid.layers.fc( + input=mask_feat, + size=self._emb_size, + act=self._hidden_act, + param_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_fc.w_0', + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0')) + + # transform: layer norm + mask_trans_feat = fluid.layers.layer_norm( + mask_trans_feat, + begin_norm_axis=len(mask_trans_feat.shape) - 1, + param_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_layer_norm_scale', + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_layer_norm_bias', + initializer=fluid.initializer.Constant(0.))) + # transform: layer norm + #mask_trans_feat = pre_process_layer( + # mask_trans_feat, 'n', name=self.name + 'mask_lm_trans') + + mask_lm_out_bias_attr = fluid.ParamAttr( + name=self.name + "mask_lm_out_fc.b_0", + initializer=fluid.initializer.Constant(value=0.0)) + if self._weight_sharing: + fc_out = fluid.layers.matmul( + x=mask_trans_feat, + y=fluid.default_main_program().global_block().var( + self._word_emb_name), + transpose_y=True) + fc_out += fluid.layers.create_parameter( + shape=[self._voc_size], + dtype=self._emb_dtype, + attr=mask_lm_out_bias_attr, + is_bias=True) + + else: + fc_out = fluid.layers.fc(input=mask_trans_feat, + size=self._voc_size, + param_attr=fluid.ParamAttr( + name=self.name + "mask_lm_out_fc.w_0", + initializer=self._param_initializer), + bias_attr=mask_lm_out_bias_attr) + + mask_lm_loss = fluid.layers.softmax_with_cross_entropy( + logits=fc_out, label=mask_label) + return mask_lm_loss + + def get_task_output(self, task, task_labels): + task_fc_out = fluid.layers.fc( + input=self.next_sent_feat, + size=task["num_labels"], + param_attr=fluid.ParamAttr( + name=self.name + task["task_name"] + "_fc.w_0", + initializer=self._param_initializer), + bias_attr=self.name + task["task_name"] + 
"_fc.b_0") + task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( + logits=task_fc_out, label=task_labels, return_softmax=True) + task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) + return task_loss, task_acc + + +class ErnieGraphModel(ErnieModel): + def __init__(self, + src_ids, + task_ids=None, + config=None, + weight_sharing=True, + use_fp16=False, + slot_seqlen=40, + name=""): + self.slot_seqlen = slot_seqlen + self._set_config(config, name, weight_sharing) + input_mask = self._build_input_mask(src_ids) + position_ids = self._build_position_ids(src_ids) + sentence_ids = self._build_sentence_ids(src_ids) + self._build_model(src_ids, position_ids, sentence_ids, task_ids, + input_mask) + self._debug_summary(input_mask) + + def _build_position_ids(self, src_ids): + src_shape = L.shape(src_ids) + src_seqlen = src_shape[1] + src_batch = src_shape[0] + + slot_seqlen = self.slot_seqlen + + num_b = (src_seqlen / slot_seqlen) - 1 + a_position_ids = L.reshape( + L.range( + 0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1], + inplace=True) # [1, slot_seqlen, 1] + a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen * num_b, 1] + + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32") # assume pad id == 0 [B, slot_seqlen, 1] + a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1] + + b_position_ids = L.reshape( + L.range( + slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1], + inplace=True) # [1, slot_seqlen, 1] + b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1]) # [B, slot_seqlen * num_b, 1] + b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1] + + position_ids = L.concat([a_position_ids, b_position_ids], 1) + position_ids = L.cast(position_ids, 'int64') + position_ids.stop_gradient = True + return position_ids + + def _build_sentence_ids(self, src_ids): + src_shape = L.shape(src_ids) + 
src_seqlen = src_shape[1] + src_batch = src_shape[0] + + slot_seqlen = self.slot_seqlen + + zeros = L.zeros([src_batch, slot_seqlen, 1], "int64") + ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64") + + sentence_ids = L.concat([zeros, ones], 1) + sentence_ids.stop_gradient = True + return sentence_ids + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, + input_mask): + + emb_out = self._build_embedding(src_ids, position_ids, sentence_ids, + task_ids) + self.input_mask = input_mask + self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder( + enc_input=emb_out, + input_mask=input_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd=self._preprocess_cmd, + postprocess_cmd=self._postprocess_cmd, + param_initializer=self._param_initializer, + slot_seqlen=self.slot_seqlen, + name=self.name + 'encoder') + if self._dtype == "float16": + self._enc_out = fluid.layers.cast( + x=self._enc_out, dtype=self._emb_dtype) diff --git a/examples/erniesage/models/ernie_model/transformer_encoder.py b/examples/erniesage/models/ernie_model/transformer_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..eef78f95595d90a6589ee10ad35cfaf0c88f485a --- /dev/null +++ b/examples/erniesage/models/ernie_model/transformer_encoder.py @@ -0,0 +1,518 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import numpy as np +from contextlib import contextmanager + +import paddle.fluid as fluid +import paddle.fluid.layers as L +import paddle.fluid.layers as layers +#import propeller.paddle as propeller +#from propeller import log + +#determin this at the begining +to_3d = lambda a: a # will change later +to_2d = lambda a: a + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + param_initializer=None, + name='multi_head_att'): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + keys = queries if keys is None else keys + values = keys if values is None else values + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. 
+ """ + q = layers.fc(input=queries, + size=d_key * n_head, + num_flatten_dims=len(queries.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_query_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_query_fc.b_0') + k = layers.fc(input=keys, + size=d_key * n_head, + num_flatten_dims=len(keys.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_key_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_key_fc.b_0') + v = layers.fc(input=values, + size=d_value * n_head, + num_flatten_dims=len(values.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_value_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_value_fc.b_0') + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+ #trans_x.desc.set_shape((-1, 1, n_head, d_value)) + return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_key**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.matmul(weights, v) + #return out, product + return out, weights + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + q = to_3d(q) + k = to_3d(k) + v = to_3d(v) + + if cache is not None: # use cache and concat time steps + # Since the inplace reshape in __split_heads changes the shape of k and + # v, which is the cache input for next time step, reshape the cache + # input from the previous time step first. + k = cache["k"] = layers.concat( + [layers.reshape( + cache["k"], shape=[0, 0, d_model]), k], axis=1) + v = cache["v"] = layers.concat( + [layers.reshape( + cache["v"], shape=[0, 0, d_model]), v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads, ctx_multiheads_attn = scaled_dot_product_attention( + q, k, v, attn_bias, d_key, dropout_rate) + + out = __combine_heads(ctx_multiheads) + + out = to_2d(out) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + num_flatten_dims=len(out.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_output_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_output_fc.b_0') + return proj_out, ctx_multiheads_attn + + +def positionwise_feed_forward(x, + d_inner_hid, + d_hid, + dropout_rate, + hidden_act, + param_initializer=None, + name='ffn'): + """ + Position-wise Feed-Forward Networks. 
+ This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=len(x.shape) - 1, + act=hidden_act, + param_attr=fluid.ParamAttr( + name=name + '_fc_0.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_0.b_0') + if dropout_rate: + hidden = layers.dropout( + hidden, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=len(hidden.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_fc_1.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_1.b_0') + return out + + +def pre_post_process_layer(prev_out, + out, + process_cmd, + dropout_rate=0., + name=''): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. 
+ """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out_dtype = out.dtype + if out_dtype == fluid.core.VarDesc.VarType.FP16: + out = layers.cast(x=out, dtype="float32") + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_layer_norm_scale', + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + name=name + '_layer_norm_bias', + initializer=fluid.initializer.Constant(0.))) + if out_dtype == fluid.core.VarDesc.VarType.FP16: + out = layers.cast(x=out, dtype="float16") + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. 
+ """ + #L.Print(L.reduce_mean(enc_input), message='1') + attn_output, ctx_multiheads_attn = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + #L.Print(L.reduce_mean(attn_output), message='1') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + + #L.Print(L.reduce_mean(attn_output), message='2') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + #L.Print(L.reduce_mean(ffd_output), message='3') + ret = post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn') + #L.Print(L.reduce_mean(ret), message='4') + return ret, ctx_multiheads_attn, ffd_output + + +def build_pad_idx(input_mask): + pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool')) + return pad_idx + + +def build_attn_bias(input_mask, n_head, dtype): + attn_bias = L.matmul( + input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + attn_bias = (1. - attn_bias) * -10000. 
+ attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq] + if attn_bias.dtype != dtype: + attn_bias = L.cast(attn_bias, dtype) + return attn_bias + + +def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen): + + input_shape = L.shape(input_mask) + input_batch = input_shape[0] + input_seqlen = input_shape[1] + num_slot = input_seqlen / slot_seqlen + num_b = num_slot - 1 + ones = L.ones([num_b], dtype="float32") # [num_b] + diag_ones = L.diag(ones) # [num_b, num_b] + diag_ones = L.unsqueeze(diag_ones, [1, -1]) # [num_b, 1, num_b, 1] + diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen]) # [num_b, seqlen, num_b, seqlen] + diag_ones = L.reshape(diag_ones, [1, num_b*slot_seqlen, num_b*slot_seqlen]) # [1, num_b*seqlen, num_b*seqlen] + + graph_attn_bias = L.concat([L.ones([1, num_b*slot_seqlen, slot_seqlen], dtype="float32"), diag_ones], 2) + graph_attn_bias = L.concat([L.ones([1, slot_seqlen, num_slot*slot_seqlen], dtype="float32"), graph_attn_bias], 1) # [1, seq, seq] + + pad_attn_bias = L.matmul( + input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + attn_bias = graph_attn_bias * pad_attn_bias + + attn_bias = (1. - attn_bias) * -10000. + attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq] + if attn_bias.dtype != dtype: + attn_bias = L.cast(attn_bias, dtype) + return attn_bias + + +def encoder(enc_input, + input_mask, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. 
+ """ + + #global to_2d, to_3d #, batch, seqlen, dynamic_dim + d_shape = L.shape(input_mask) + pad_idx = build_pad_idx(input_mask) + attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype) + + # d_batch = d_shape[0] + # d_seqlen = d_shape[1] + # pad_idx = L.where( + # L.cast(L.reshape(input_mask, [d_batch, d_seqlen]), 'bool')) + + # attn_bias = L.matmul( + # input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + # attn_bias = (1. - attn_bias) * -10000. + # attn_bias = L.stack([attn_bias] * n_head, 1) + # if attn_bias.dtype != enc_input.dtype: + # attn_bias = L.cast(attn_bias, enc_input.dtype) + + # def to_2d(t_3d): + # t_2d = L.gather_nd(t_3d, pad_idx) + # return t_2d + + # def to_3d(t_2d): + # t_3d = L.scatter_nd( + # pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model]) + # return t_3d + + enc_input = to_2d(enc_input) + all_hidden = [] + all_attn = [] + all_ffn = [] + for i in range(n_layer): + enc_output, ctx_multiheads_attn, ffn_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + all_hidden.append(enc_output) + all_attn.append(ctx_multiheads_attn) + all_ffn.append(ffn_output) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, + preprocess_cmd, + prepostprocess_dropout, + name="post_encoder") + enc_output = to_3d(enc_output) + #enc_output.desc.set_shape((-1, 1, final_dim)) + return enc_output, all_hidden, all_attn, all_ffn + +def graph_encoder(enc_input, + input_mask, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + slot_seqlen=40, + name=''): + """ + The encoder is composed of a stack of identical layers returned by 
calling + encoder_layer. + """ + + #global to_2d, to_3d #, batch, seqlen, dynamic_dim + d_shape = L.shape(input_mask) + pad_idx = build_pad_idx(input_mask) + attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen) + #attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype) + + # d_batch = d_shape[0] + # d_seqlen = d_shape[1] + # pad_idx = L.where( + # L.cast(L.reshape(input_mask, [d_batch, d_seqlen]), 'bool')) + + # attn_bias = L.matmul( + # input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + # attn_bias = (1. - attn_bias) * -10000. + # attn_bias = L.stack([attn_bias] * n_head, 1) + # if attn_bias.dtype != enc_input.dtype: + # attn_bias = L.cast(attn_bias, enc_input.dtype) + + # def to_2d(t_3d): + # t_2d = L.gather_nd(t_3d, pad_idx) + # return t_2d + + # def to_3d(t_2d): + # t_3d = L.scatter_nd( + # pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model]) + # return t_3d + + enc_input = to_2d(enc_input) + all_hidden = [] + all_attn = [] + all_ffn = [] + for i in range(n_layer): + enc_output, ctx_multiheads_attn, ffn_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + all_hidden.append(enc_output) + all_attn.append(ctx_multiheads_attn) + all_ffn.append(ffn_output) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, + preprocess_cmd, + prepostprocess_dropout, + name="post_encoder") + enc_output = to_3d(enc_output) + #enc_output.desc.set_shape((-1, 1, final_dim)) + return enc_output, all_hidden, all_attn, all_ffn diff --git a/examples/erniesage/models/erniesage_v1.py b/examples/erniesage/models/erniesage_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..696231a785ffecb8098c3379c7a7b0f4ee935e33 --- /dev/null +++ 
# b/examples/erniesage/models/erniesage_v1.py @@ -0,0 +1,42 @@
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig


class ErnieSageV1(BaseNet):
    """ERNIE encodes every node's tokens first; the GNN layers run on top.

    The network adds a ``term_ids`` input to the inputs of ``BaseNet``,
    turns it into one pooled ERNIE vector per node and hands that vector
    to ``self.gnn_layers``.
    """

    def build_inputs(self):
        """Return the parent's inputs followed by the ``term_ids`` placeholder.

        ``term_ids`` is declared as int64 with shape
        ``[None, config.max_seqlen]``.
        """
        inputs = super(ErnieSageV1, self).build_inputs()
        term_ids = L.data(
            "term_ids",
            shape=[None, self.config.max_seqlen],
            dtype="int64",
            append_batch_size=False)
        return inputs + [term_ids]

    def build_embedding(self, graph_wrappers, term_ids):
        """Encode ``term_ids`` with ERNIE and return its pooled output.

        ``graph_wrappers`` is accepted but not used here.
        """
        # A trailing axis of size 1 is appended before feeding ERNIE.
        term_ids = L.unsqueeze(term_ids, [-1])
        ernie_config = self.config.ernie_config
        ernie = ErnieModel(
            src_ids=term_ids,
            # Every token gets sentence id 0.
            sentence_ids=L.zeros_like(term_ids),
            task_ids=None,
            config=ernie_config,
            use_fp16=False,
            name="student_")
        feature = ernie.get_pooled_output()
        return feature

    def __call__(self, graph_wrappers):
        """Build the network and return ``(inputs, outputs)``.

        ``outputs`` holds one final feature per non-``term_ids`` input,
        followed by the ``index`` node feature of the first graph wrapper
        gathered at ``inputs[0]``.
        """
        inputs = self.build_inputs()
        # term_ids is always the last input (see build_inputs).
        feature = self.build_embedding(graph_wrappers, inputs[-1])
        # NOTE(review): gnn_layers / take_final_feature are not defined in
        # this class; presumably inherited from BaseNet -- confirm.
        features = self.gnn_layers(graph_wrappers, feature)
        outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
        outputs.append(src_real_index)
        return inputs, outputs


class ErnieSageModelV1(BaseGNNModel):
    """``BaseGNNModel`` whose network is :class:`ErnieSageV1`."""

    def gen_net_fn(self, config):
        """Return a new :class:`ErnieSageV1` built from ``config``."""
        return ErnieSageV1(config)

# diff --git a/examples/erniesage/models/erniesage_v2.py b/examples/erniesage/models/erniesage_v2.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e5c03c1fcc0076a8566035c5d523b2bfbf76eb7c
# --- /dev/null
# +++ b/examples/erniesage/models/erniesage_v2.py
# @@ -0,0 +1,102 @@
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig


class ErnieSageV2(BaseNet):
    """ERNIE runs inside message passing, once per edge.

    For each edge the source and destination token ids are concatenated
    and encoded by ERNIE; the pooled vectors are summed per node and
    combined with an ERNIE encoding of the node's own tokens.
    """

    def build_inputs(self):
        """Return the parent's inputs followed by the ``term_ids`` placeholder.

        ``term_ids`` is declared as int64 with shape
        ``[None, config.max_seqlen]``.
        """
        inputs = super(ErnieSageV2, self).build_inputs()
        term_ids = L.data(
            "term_ids",
            shape=[None, self.config.max_seqlen],
            dtype="int64",
            append_batch_size=False)
        return inputs + [term_ids]

    def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
        """Build one ERNIE-based aggregation layer on graph wrapper ``gw``.

        Args:
            gw: graph wrapper providing ``send`` / ``recv``.
            feature: node token ids (the ``term_ids`` input).
            hidden_size: output size of each of the two fc projections.
            act: activation passed to both fc layers.
            initializer: initializer for the two fc weights (``None`` keeps
                the framework default).
            learning_rate: learning-rate value set on the two fc weights.
            name: prefix of the fc parameter names (``name + "_l"`` for the
                node itself, ``name + "_r"`` for the neighbours).

        Returns:
            The l2-normalised concatenation of the projected self feature
            and the projected neighbour feature.
        """
        def ernie_send(src_feat, dst_feat, edge_feat):
            """Encode ``[1] + src tokens + dst tokens`` with ERNIE per edge."""
            # Id 1 is put in front of the source tokens.
            # NOTE(review): presumably the [CLS] id of the vocabulary -- confirm.
            cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1)
            src_ids = L.concat([cls, src_feat["term_ids"]], 1)
            dst_ids = dst_feat["term_ids"]

            # Sentence id 0 for the source part, 1 for the destination part.
            sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
            term_ids = L.concat([src_ids, dst_ids], 1)

            term_ids.stop_gradient = True
            sent_ids.stop_gradient = True
            ernie = ErnieModel(
                term_ids, sent_ids,
                config=self.config.ernie_config)
            feature = ernie.get_pooled_output()
            return feature

        def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
            """Sum the per-edge ERNIE messages and merge with the self encoding."""
            feature = L.unsqueeze(feature, [-1])
            msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)])
            neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))

            # Self branch: encode the node's own tokens with id 1 in front.
            term_ids = feature
            cls = L.fill_constant_batch_size_like(term_ids, [-1, 1, 1], "int64", 1)
            term_ids = L.concat([cls, term_ids], 1)
            term_ids.stop_gradient = True
            ernie = ErnieModel(
                term_ids, L.zeros_like(term_ids),
                config=self.config.ernie_config)
            self_feature = ernie.get_pooled_output()

            # ``initializer`` is now forwarded to both weights; it used to be
            # accepted and silently dropped. With the None passed by
            # gnn_layers this is the same as omitting the argument.
            self_feature = L.fc(self_feature,
                                hidden_size,
                                act=act,
                                param_attr=F.ParamAttr(name=name + "_l",
                                                       initializer=initializer,
                                                       learning_rate=learning_rate),
                                )
            neigh_feature = L.fc(neigh_feature,
                                 hidden_size,
                                 act=act,
                                 param_attr=F.ParamAttr(name=name + "_r",
                                                        initializer=initializer,
                                                        learning_rate=learning_rate),
                                 )
            output = L.concat([self_feature, neigh_feature], axis=1)
            output = L.l2_normalize(output, axis=1)
            return output
        return erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)

    def gnn_layers(self, graph_wrappers, feature):
        """Apply ``config.num_layers`` layers and return every stage's feature.

        The returned list starts with the input ``feature`` and then holds
        the output of each layer in order. The last layer has no
        activation; the others use ``leaky_relu``.
        """
        # NOTE(review): every layer feeds its input to ERNIE as token ids,
        # but a layer outputs a float feature; stacking more than one layer
        # looks unsupported -- confirm before using num_layers > 1.
        features = [feature]

        initializer = None
        # Value handed to ParamAttr(learning_rate=...) of the fc weights.
        fc_lr = self.config.lr / 0.001

        for i in range(self.config.num_layers):
            if i == self.config.num_layers - 1:
                act = None
            else:
                act = "leaky_relu"

            feature = self.gnn_layer(
                graph_wrappers[i],
                feature,
                self.config.hidden_size,
                act,
                initializer,
                learning_rate=fc_lr,
                name="%s_%s" % ("erniesage_v2", i))
            features.append(feature)
        return features

    def __call__(self, graph_wrappers):
        """Build the network and return ``(inputs, outputs)``.

        ``outputs`` holds one final feature per non-``term_ids`` input,
        followed by the ``index`` node feature of the first graph wrapper
        gathered at ``inputs[0]``.
        """
        inputs = self.build_inputs()
        # Raw token ids go straight into the GNN layers; ERNIE runs inside.
        feature = inputs[-1]
        features = self.gnn_layers(graph_wrappers, feature)
        outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
        outputs.append(src_real_index)
        return inputs, outputs


class ErnieSageModelV2(BaseGNNModel):
    """``BaseGNNModel`` whose network is :class:`ErnieSageV2`."""

    def gen_net_fn(self, config):
        """Return a new :class:`ErnieSageV2` built from ``config``."""
        return ErnieSageV2(config)

# diff --git a/examples/erniesage/models/erniesage_v3.py b/examples/erniesage/models/erniesage_v3.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..4fd9968a49213c4bc5fbb0deb5316e44bba30a12
# --- /dev/null
# +++ b/examples/erniesage/models/erniesage_v3.py
# @@ -0,0 +1,123 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L

from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
from models.message_passing import copy_send


class ErnieSageV3(BaseNet):
    """Message passing on raw token ids; ERNIE runs once at the end.

    Each layer concatenates a node's own tokens with the padded tokens of
    its neighbours. ``take_final_feature`` then encodes the concatenated
    sequence with ``ErnieGraphModel``.
    """

    def __init__(self, config):
        """Initialise the parent and set ``config.layer_type``."""
        super(ErnieSageV3, self).__init__(config)
        # Used by gnn_layers as the layer-name prefix.
        self.config.layer_type = "ernie_recv_sum"

    def build_inputs(self):
        """Return the parent's inputs followed by the ``term_ids`` placeholder.

        ``term_ids`` is declared as int64 with shape
        ``[None, config.max_seqlen]``.
        """
        inputs = super(ErnieSageV3, self).build_inputs()
        term_ids = L.data(
            "term_ids",
            shape=[None, self.config.max_seqlen],
            dtype="int64",
            append_batch_size=False)
        return inputs + [term_ids]

    def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
        """Concatenate each node's tokens with its neighbours' tokens.

        ``hidden_size``, ``act``, ``initializer``, ``learning_rate`` and
        ``name`` are accepted to keep the layer signature but are not used:
        this layer creates no parameters.

        Returns:
            int64 token ids with ``stop_gradient`` set: id 1, the node's own
            tokens without the last one, then the neighbours' padded tokens.
        """
        def ernie_recv(feat):
            """Pad the received token sequences and flatten them per node."""
            # TODO: the neighbour cap is still a literal.
            num_neighbor = 10
            #pad_value = L.cast(L.assign(input=np.array([0], dtype=np.int32)), "int64")
            pad_value = L.zeros([1], "int64")
            out, _ = L.sequence_pad(feat, pad_value=pad_value, maxlen=num_neighbor)
            # The flattened width used to be the literal 400, which only
            # equals 10 * max_seqlen when max_seqlen is 40; any other
            # sequence length made this reshape wrong.
            out = L.reshape(out, [0, num_neighbor * self.config.max_seqlen])
            return out

        def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
            """Gather neighbour tokens and append them to the node's tokens."""
            msg = gw.send(copy_send, nfeat_list=[("h", feature)])
            neigh_feature = gw.recv(msg, ernie_recv)
            neigh_feature = L.cast(L.unsqueeze(neigh_feature, [-1]), "int64")

            feature = L.unsqueeze(feature, [-1])
            # Id 1 goes in front; the node's last token is dropped so the
            # self part keeps its original length.
            # NOTE(review): presumably 1 is the [CLS] id -- confirm.
            cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
            term_ids = L.concat([cls, feature[:, :-1], neigh_feature], 1)
            term_ids.stop_gradient = True
            return term_ids
        return erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)

    def gnn_layers(self, graph_wrappers, feature):
        """Apply ``config.num_layers`` layers and return every stage's feature.

        The returned list starts with the input ``feature`` and then holds
        the output of each layer in order.
        """
        features = [feature]

        initializer = None
        fc_lr = self.config.lr / 0.001

        for i in range(self.config.num_layers):
            if i == self.config.num_layers - 1:
                act = None
            else:
                act = "leaky_relu"

            feature = self.gnn_layer(
                graph_wrappers[i],
                feature,
                self.config.hidden_size,
                act,
                initializer,
                learning_rate=fc_lr,
                name="%s_%s" % (self.config.layer_type, i))
            features.append(feature)
        return features

    def take_final_feature(self, feature, index, name):
        """Encode the rows of ``feature`` selected by ``index`` with ERNIE.

        The selected token ids go through ``ErnieGraphModel`` (slot length
        ``config.max_seqlen``), a relu fc of size ``config.hidden_size`` and
        an l2 normalisation. If ``config.final_fc`` is set, another fc is
        applied; if ``config.final_l2_norm`` is set, the result is
        normalised once more.
        """
        feat = L.gather(feature, index, overwrite=False)

        ernie_config = self.config.ernie_config
        ernie = ErnieGraphModel(
            src_ids=feat,
            config=ernie_config,
            slot_seqlen=self.config.max_seqlen,
            name="student_")
        feat = ernie.get_pooled_output()
        fc_lr = self.config.lr / 0.001
        feat = L.fc(feat,
                    self.config.hidden_size,
                    act="relu",
                    param_attr=F.ParamAttr(name=name + "_l",
                                           learning_rate=fc_lr),
                    )
        feat = L.l2_normalize(feat, axis=1)

        if self.config.final_fc:
            feat = L.fc(feat,
                        self.config.hidden_size,
                        param_attr=F.ParamAttr(name=name + '_w'),
                        bias_attr=F.ParamAttr(name=name + '_b'))

        if self.config.final_l2_norm:
            feat = L.l2_normalize(feat, axis=1)
        return feat

    def __call__(self, graph_wrappers):
        """Build the network and return ``(inputs, outputs)``.

        ``outputs`` holds one final feature per non-``term_ids`` input,
        followed by the ``index`` node feature of the first graph wrapper
        gathered at ``inputs[0]``.
        """
        inputs = self.build_inputs()
        feature = inputs[-1]
        features = self.gnn_layers(graph_wrappers, feature)
        outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
        outputs.append(src_real_index)
        return inputs, outputs


class ErnieSageModelV3(BaseGNNModel):
    """``BaseGNNModel`` whose network is :class:`ErnieSageV3`."""

    def gen_net_fn(self, config):
        """Return a new :class:`ErnieSageV3` built from ``config``."""
        return ErnieSageV3(config)

# diff --git a/examples/erniesage/models/message_passing.py b/examples/erniesage/models/message_passing.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..4567bd694b123841c6b71b61a88e8dcbae8957b7
# --- /dev/null
# +++ b/examples/erniesage/models/message_passing.py
# @@ -0,0 +1,137 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as L


def copy_send(src_feat, dst_feat, edge_feat):
    """Message function: pass on the source node's ``"h"`` feature."""
    return src_feat["h"]


def weighted_copy_send(src_feat, dst_feat, edge_feat):
    """Message function: source ``"h"`` feature times the edge ``"weight"``."""
    return src_feat["h"] * edge_feat["weight"]


def mean_recv(feat):
    """Reduce function: average-pool the received messages."""
    return fluid.layers.sequence_pool(feat, pool_type="average")


def sum_recv(feat):
    """Reduce function: sum-pool the received messages."""
    return fluid.layers.sequence_pool(feat, pool_type="sum")


def max_recv(feat):
    """Reduce function: max-pool the received messages."""
    return fluid.layers.sequence_pool(feat, pool_type="max")


def lstm_recv(feat):
    """Reduce function: run an LSTM over the messages, keep the last step."""
    hidden_dim = 128
    # dynamic_lstm is given size = 4 * hidden_dim.
    forward, _ = fluid.layers.dynamic_lstm(
        input=feat, size=hidden_dim * 4, use_peepholes=False)
    output = fluid.layers.sequence_last_step(forward)
    return output


def _sage_aggregate(gw, feature, hidden_size, act, initializer, learning_rate,
                    name, send_func, recv_func, efeat_list=None):
    """Shared body of the GraphSAGE / PinSAGE aggregators below.

    Sends ``feature`` as ``"h"`` with ``send_func``, reduces the messages
    with ``recv_func``, projects the node feature (weight ``name + "_l"``)
    and the neighbour feature (weight ``name + "_r"``) with separate fc
    layers, then returns their l2-normalised concatenation.

    ``efeat_list`` is only forwarded to ``gw.send`` when it is given, so
    the unweighted aggregators issue exactly the call they always did.
    """
    if efeat_list is None:
        msg = gw.send(send_func, nfeat_list=[("h", feature)])
    else:
        msg = gw.send(send_func, nfeat_list=[("h", feature)], efeat_list=efeat_list)
    neigh_feature = gw.recv(msg, recv_func)
    # The "_l" layer is created before the "_r" layer, as in the original
    # per-function code.
    self_feature = fluid.layers.fc(feature,
                                   hidden_size,
                                   act=act,
                                   param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
                                                              learning_rate=learning_rate),
                                   )
    neigh_feature = fluid.layers.fc(neigh_feature,
                                    hidden_size,
                                    act=act,
                                    param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
                                                               learning_rate=learning_rate),
                                    )
    output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
    output = fluid.layers.l2_normalize(output, axis=1)
    return output


def graphsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
    """GraphSAGE layer: neighbour features are copied and summed."""
    return _sage_aggregate(gw, feature, hidden_size, act, initializer,
                           learning_rate, name, copy_send, sum_recv)


def graphsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
    """GraphSAGE layer: neighbour features are copied and averaged."""
    return _sage_aggregate(gw, feature, hidden_size, act, initializer,
                           learning_rate, name, copy_send, mean_recv)


def pinsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
    """PinSAGE layer: edge-weighted neighbour features, averaged."""
    return _sage_aggregate(gw, feature, hidden_size, act, initializer,
                           learning_rate, name, weighted_copy_send, mean_recv,
                           efeat_list=["weight"])


def pinsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
    """PinSAGE layer: edge-weighted neighbour features, summed."""
    return _sage_aggregate(gw, feature, hidden_size, act, initializer,
                           learning_rate, name, weighted_copy_send, sum_recv,
                           efeat_list=["weight"])

# diff --git a/examples/erniesage/models/model_factory.py b/examples/erniesage/models/model_factory.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0f69bb1f6932a219f4a41faee9cf5bf6c3f947a8
# --- /dev/null
# +++ b/examples/erniesage/models/model_factory.py
# @@ -0,0 +1,24 @@
from models.base import BaseGNNModel
from models.ernie import ErnieModel
from models.erniesage_v1 import ErnieSageModelV1
from models.erniesage_v2 import ErnieSageModelV2
from models.erniesage_v3 import ErnieSageModelV3


class Model(object):
    """Factory that maps ``config.model_type`` to a model class."""

    @classmethod
    def factory(cls, config):
        """Instantiate the model named by ``config.model_type``.

        Args:
            config: object with a ``model_type`` attribute; it is also
                passed to the selected model's constructor.

        Returns:
            An instance of the selected model class.

        Raises:
            ValueError: if ``config.model_type`` is not a known model name.
        """
        registry = {
            "BaseGNNModel": BaseGNNModel,
            "ErnieModel": ErnieModel,
            "ErnieSageModelV1": ErnieSageModelV1,
            "ErnieSageModelV2": ErnieSageModelV2,
            "ErnieSageModelV3": ErnieSageModelV3,
        }
        name = config.model_type
        try:
            model_cls = registry[name]
        except (KeyError, TypeError):
            # TypeError covers an unhashable model_type; it is still
            # reported as ValueError, now with a message.
            raise ValueError(
                "Unknown model_type %r; expected one of: %s"
                % (name, ", ".join(sorted(registry))))
        return model_cls(config)

# diff --git a/examples/erniesage/preprocessing/dump_graph.py b/examples/erniesage/preprocessing/dump_graph.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..06281456c29a11704e921a99cdd80d3dabfa0c3c
# --- /dev/null
# +++ b/examples/erniesage/preprocessing/dump_graph.py
# @@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
# File: dump_graph.py
# Author: suweiyue(suweiyue@baidu.com)
# Date: 2020/03/01 22:17:13
#
########################################################################
# """
#     Comment.
+""" +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +#from __future__ import unicode_literals + +import io +import os +import sys +import argparse +import logging +import multiprocessing +from functools import partial +from io import open + +import numpy as np +import tqdm +import pgl +from pgl.graph_kernel import alias_sample_build_table +from pgl.utils.logger import log + +from tokenization import FullTokenizer + + +def term2id(string, tokenizer, max_seqlen): + string = string.split("\t")[1] + tokens = tokenizer.tokenize(string) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids = ids[:max_seqlen-1] + ids = ids + [2] # ids + [sep] + ids = ids + [0] * (max_seqlen - len(ids)) + return ids + + +def dump_graph(args): + if not os.path.exists(args.outpath): + os.makedirs(args.outpath) + neg_samples = [] + str2id = dict() + term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding) + terms = [] + count = 0 + + with io.open(args.inpath, encoding=args.encoding) as f: + edges = [] + for idx, line in enumerate(f): + if idx % 100000 == 0: + log.info("%s readed %s lines" % (args.inpath, idx)) + slots = [] + for col_idx, col in enumerate(line.strip("\n").split("\t")): + s = col[:args.max_seqlen] + if s not in str2id: + str2id[s] = count + count += 1 + term_file.write(str(col_idx) + "\t" + col + "\n") + + slots.append(str2id[s]) + + src = slots[0] + dst = slots[1] + neg_samples.append(slots[2:]) + edges.append((src, dst)) + edges.append((dst, src)) + + term_file.close() + edges = np.array(edges, dtype="int64") + num_nodes = len(str2id) + str2id.clear() + log.info("building graph...") + graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges) + indegree = graph.indegree() + graph.outdegree() + graph.dump(args.outpath) + + # dump alias sample table + sqrt_indegree = np.sqrt(indegree) + distribution = 1. 
* sqrt_indegree / sqrt_indegree.sum() + alias, events = alias_sample_build_table(distribution) + np.save(os.path.join(args.outpath, "alias.npy"), alias) + np.save(os.path.join(args.outpath, "events.npy"), events) + np.save(os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples)) + log.info("End Build Graph") + +def dump_id2str_map(args): + log.info("Dump id2str map starting...") + id2str = np.array([line.strip("\n") for line in open(os.path.join(args.outpath, "terms.txt"), "r", encoding=args.encoding)]) + np.save(os.path.join(args.outpath, "id2str.npy"), id2str) + log.info("Dump id2str map done.") + +def dump_node_feat(args): + log.info("Dump node feat starting...") + id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r") + pool = multiprocessing.Pool() + tokenizer = FullTokenizer(args.vocab_file) + term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str) + np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids)) + log.info("Dump node feat done.") + pool.terminate() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='main') + parser.add_argument("-i", "--inpath", type=str, default=None) + parser.add_argument("-l", "--max_seqlen", type=int, default=30) + parser.add_argument("--vocab_file", type=str, default="./vocab.txt") + parser.add_argument("--encoding", type=str, default="utf8") + parser.add_argument("-o", "--outpath", type=str, default=None) + args = parser.parse_args() + dump_graph(args) + dump_id2str_map(args) + dump_node_feat(args) diff --git a/examples/erniesage/preprocessing/tokenization.py b/examples/erniesage/preprocessing/tokenization.py new file mode 100644 index 0000000000000000000000000000000000000000..975bb26a531e655bcfa4744f8ebc81fc01c68d9c --- /dev/null +++ b/examples/erniesage/preprocessing/tokenization.py @@ -0,0 +1,461 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six +import sentencepiece as sp + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file, 'rb') + for num, line in enumerate(fin): + items = convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"): + output = [] + for token in tokens: + if token in vocab: + output.append(vocab[token]) + else: + output.append(vocab[unk_token]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = 
WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class CharTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in text.lower().split(" "): + for sub_token in self.tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class SentencepieceTokenizer(object): + """Runs SentencePiece tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.do_lower_case = do_lower_case + self.tokenizer = sp.SentencePieceProcessor() + self.tokenizer.Load(vocab_file + ".model") + self.sp_unk_token = "" + self.unk_token = unk_token + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + Returns: + A list of wordpiece tokens. 
+ """ + text = text.lower() if self.do_lower_case else text + text = convert_to_unicode(text.replace("\1", " ")) + tokens = self.tokenizer.EncodeAsPieces(text) + + output_tokens = [] + for token in tokens: + if token == self.sp_unk_token: + token = self.unk_token + + if token in self.vocab: + output_tokens.append(token) + else: + output_tokens.append(self.unk_token) + + return output_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class WordsegTokenizer(object): + """Runs Wordseg tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]", + split_token="\1"): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.tokenizer = sp.SentencePieceProcessor() + self.tokenizer.Load(vocab_file + ".model") + + self.do_lower_case = do_lower_case + self.unk_token = unk_token + self.split_token = split_token + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + Returns: + A list of wordpiece tokens. 
+ """ + text = text.lower() if self.do_lower_case else text + text = convert_to_unicode(text) + + output_tokens = [] + for token in text.split(self.split_token): + if token in self.vocab: + output_tokens.append(token) + else: + sp_tokens = self.tokenizer.EncodeAsPieces(token) + for sp_token in sp_tokens: + if sp_token in self.vocab: + output_tokens.append(sp_token) + return output_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
class TrainData(object):
    """This trainer's shard of the training edges (and optional negatives).

    Rows are taken round-robin: trainer `k` of `n` keeps rows k, k+n, k+2n...
    The trainer id and count are read from the environment variables
    PADDLE_TRAINER_ID and PADDLE_TRAINERS_NUM.
    """

    def __init__(self, graph_path):
        rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        world = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        log.info("trainer_id: %s, trainer_count: %s." % (rank, world))

        # edges is bidirectional.
        edges = np.load(
            os.path.join(graph_path, "edges.npy"), allow_pickle=True)
        columns = [edges[rank::world, 0], edges[rank::world, 1]]

        # Negative samples are optional and ignored when the file is empty.
        neg_path = os.path.join(graph_path, "neg_samples.npy")
        if os.path.exists(neg_path):
            negatives = np.load(neg_path, allow_pickle=True)
            if negatives.size != 0:
                columns.append(negatives[rank::world])

        log.info("Load train_data done.")
        self.data = {"train_data": columns}

    def __getitem__(self, index):
        """Return the `index`-th entry of every stored column as a list."""
        return [column[index] for column in self.data["train_data"]]

    def __len__(self):
        """Number of training rows, taken from the first column."""
        first_column = self.data["train_data"][0]
        return len(first_column)


def main(config):
    """Build the model, the training reader and the learner, then train."""
    # Select Model
    model = Model.factory(config)

    # Build Train Edges
    data = TrainData(config.graph_path)

    # Build Train Data
    train_iter = GraphGenerator(
        graph_wrappers=model.graph_wrappers,
        batch_size=config.batch_size,
        data=data,
        samples=config.samples,
        num_workers=config.sample_workers,
        feed_name_list=[var.name for var in model.feed_list],
        use_pyreader=config.use_pyreader,
        phase="train",
        graph_data_path=config.graph_path,
        shuffle=True)
    log.info("build graph reader done.")

    learner = Learner.factory(config.learner_type)
    learner.build(model, train_iter, config)

    learner.start()
    learner.stop()
def fold10_split(dataset, fold_idx=0, seed=0, shuffle=True):
    """Split `dataset` into train/valid subsets with stratified 10-fold CV.

    Args:
        dataset: indexable dataset whose items are `(graph, label)` pairs.
        fold_idx: which of the 10 folds (0..9) is used for validation.
        seed: random state used by the splitter when `shuffle` is True.
        shuffle: whether samples are shuffled before being split into folds.

    Returns:
        `(train_subset, valid_subset)` as `Subset` objects over `dataset`.

    Raises:
        AssertionError: if `fold_idx` is outside 0..9.
    """
    # Raised explicitly so the check survives `python -O` and carries a real
    # message (the original passed `print(...)`, i.e. None, as the message).
    if not 0 <= fold_idx < 10:
        raise AssertionError("fold_idx must be from 0 to 9.")

    # scikit-learn rejects a random_state when shuffle is False, so the seed
    # is only forwarded when it has an effect.
    skf = StratifiedKFold(
        n_splits=10,
        shuffle=shuffle,
        random_state=seed if shuffle else None)
    labels = [dataset[i][1] for i in range(len(dataset))]

    idx_list = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, valid_idx = idx_list[fold_idx]

    log.info("train_set : test_set == %d : %d" %
             (len(train_idx), len(valid_idx)))
    return Subset(dataset, train_idx), Subset(dataset, valid_idx)


def random_split(dataset, split_ratio=0.7, seed=0, shuffle=True):
    """Split `dataset` into train/valid subsets by position.

    Args:
        dataset: any object supporting `len()` and integer indexing.
        split_ratio: fraction of samples that go to the training subset.
        seed: value used to seed NumPy's global RNG.
        shuffle: when True (default) indices are permuted before the cut;
            when False the first `split_ratio` part is the training subset.

    Returns:
        `(train_subset, valid_subset)` as `Subset` objects over `dataset`.
    """
    # The global RNG is seeded, as before, whether or not we shuffle.
    np.random.seed(seed)
    indices = list(range(len(dataset)))
    if shuffle:  # the flag used to be ignored
        np.random.shuffle(indices)
    split = int(split_ratio * len(dataset))
    train_idx, valid_idx = indices[:split], indices[split:]

    log.info("train_set : test_set == %d : %d" %
             (len(train_idx), len(valid_idx)))
    return Subset(dataset, train_idx), Subset(dataset, valid_idx)
class GINDataset(BaseDataset):
    """Dataset for Graph Isomorphism Network (GIN).

    Adapted from https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip.

    The file `<data_path>/<dataset_name>/<dataset_name>.txt` is parsed into a
    list of `pgl.graph.Graph` objects plus one relabeled class id per graph.
    If the file carries no node attributes, a one-hot `attr` feature is
    generated from the node degree (`degree_as_nlabel=True`) or from the
    node label.
    """

    def __init__(self,
                 data_path,
                 dataset_name,
                 self_loop,
                 degree_as_nlabel=False):
        self.data_path = data_path
        self.dataset_name = dataset_name
        self.self_loop = self_loop
        self.degree_as_nlabel = degree_as_nlabel

        self.graph_list = []
        self.glabel_list = []

        # relabel maps: raw value -> contiguous index
        self.glabel_dict = {}
        self.nlabel_dict = {}
        self.elabel_dict = {}
        self.ndegree_dict = {}

        # global num
        self.num_graph = 0  # total graphs number
        self.n = 0  # total nodes number
        self.m = 0  # total edges number

        # global num of classes
        self.gclasses = 0
        self.nclasses = 0
        self.eclasses = 0
        self.dim_nfeats = 0

        # flags
        self.nattrs_flag = False
        self.nlabels_flag = False

        self._load_data()

    def __len__(self):
        """return the number of graphs"""
        return len(self.graph_list)

    def __getitem__(self, idx):
        """Return `(graph, relabeled_graph_label)` for position `idx`."""
        return self.graph_list[idx], self.glabel_list[idx]

    def _load_data(self):
        """Parse the dataset file and fill graphs, labels and statistics."""
        filename = os.path.join(self.data_path, self.dataset_name,
                                "%s.txt" % self.dataset_name)
        log.info("loading data from %s" % filename)

        with open(filename, 'r') as reader:
            # first line --> N, means total number of graphs
            self.num_graph = int(reader.readline().strip())
            # Report progress about every 10%; never below 1 so that small
            # datasets (< 10 graphs) do not trigger a modulo by zero.
            log_every = max(1, self.num_graph // 10)

            for i in range(self.num_graph):
                if (i + 1) % log_every == 0:
                    log.info("processing graph %s" % (i + 1))
                graph = dict()
                # second line --> [num_node, label]
                # means [node number of a graph, class label of a graph]
                grow = reader.readline().strip().split()
                n_nodes, glabel = [int(w) for w in grow]

                # relabel graphs
                if glabel not in self.glabel_dict:
                    self.glabel_dict[glabel] = len(self.glabel_dict)

                graph['num_nodes'] = n_nodes
                self.glabel_list.append(self.glabel_dict[glabel])

                nlabels = []
                node_features = []
                num_edges = 0
                edges = []

                for j in range(graph['num_nodes']):
                    slots = reader.readline().strip().split()

                    # handle edges and node feature(if has)
                    # tmp == 2 + num_edges of current node
                    tmp = int(slots[1]) + 2
                    if tmp == len(slots):
                        # no node feature
                        nrow = [int(w) for w in slots]
                    elif tmp < len(slots):
                        nrow = [int(w) for w in slots[:tmp]]
                        node_features.append(
                            [float(w) for w in slots[tmp:]])
                    else:
                        raise Exception('edge number is not correct!')

                    # relabel nodes if is has labels
                    # if it doesn't have node labels, then every nrow[0] == 0
                    if nrow[0] not in self.nlabel_dict:
                        self.nlabel_dict[nrow[0]] = len(self.nlabel_dict)

                    nlabels.append(self.nlabel_dict[nrow[0]])
                    num_edges += nrow[1]
                    edges.extend([(j, u) for u in nrow[2:]])

                    if self.self_loop:
                        num_edges += 1
                        edges.append((j, j))

                if node_features:
                    graph['attr'] = np.stack(node_features)
                    self.nattrs_flag = True
                else:
                    graph['attr'] = None

                graph['nlabel'] = np.array(
                    nlabels, dtype="int64").reshape(-1, 1)
                if len(self.nlabel_dict) > 1:
                    self.nlabels_flag = True

                graph['edges'] = edges
                assert num_edges == len(edges)

                g = pgl.graph.Graph(
                    num_nodes=graph['num_nodes'],
                    edges=graph['edges'],
                    node_feat={
                        'nlabel': graph['nlabel'],
                        'attr': graph['attr']
                    })

                self.graph_list.append(g)

                # update statistics of graphs
                self.n += graph['num_nodes']
                self.m += num_edges

        # if no attr
        if not self.nattrs_flag:
            log.info('there are no node features in this dataset!')
            label2idx = {}
            # generate node attr by node degree
            if self.degree_as_nlabel:
                log.info('generate node features by node degree...')
                nlabel_set = set([])
                for g in self.graph_list:
                    g.node_feat['nlabel'] = g.indegree()
                    # extracting unique node labels
                    nlabel_set = nlabel_set.union(set(g.node_feat['nlabel']))
                    g.node_feat['nlabel'] = g.node_feat['nlabel'].reshape(-1,
                                                                          1)

                nlabel_set = list(nlabel_set)
                # in case the labels/degrees are not continuous number
                self.ndegree_dict = {
                    nlabel_set[i]: i
                    for i in range(len(nlabel_set))
                }
                label2idx = self.ndegree_dict
            # generate node attr by node label
            else:
                log.info('generate node features by node label...')
                label2idx = self.nlabel_dict

            for g in self.graph_list:
                attr = np.zeros((g.num_nodes, len(label2idx)))
                idx = np.array(
                    [
                        label2idx[tag]
                        for tag in g.node_feat['nlabel'].reshape(-1, )
                    ],
                    dtype="int64")
                # One-hot per node: row k gets a 1 only in the column of
                # node k's label. The former `attr[:, idx] = 1` set those
                # columns on every row, so all nodes of a graph shared the
                # same multi-hot vector.
                attr[np.arange(g.num_nodes), idx] = 1
                g.node_feat['attr'] = attr.astype("float32")

        # after load, get the #classes and #dim
        self.gclasses = len(self.glabel_dict)
        self.nclasses = len(self.nlabel_dict)
        self.eclasses = len(self.elabel_dict)
        self.dim_nfeats = len(self.graph_list[0].node_feat['attr'][0])

        message = "finished loading data\n"
        message += """
                    num_graph: %d
                    num_graph_class: %d
                    total_num_nodes: %d
                    node Classes: %d
                    node_features_dim: %d
                    num_edges: %d
                    edge_classes: %d
                    Avg. of #Nodes: %.2f
                    Avg. of #Edges: %.2f
                    Graph Relabeled: %s
                    Node Relabeled: %s
                    Degree Relabeled(If degree_as_nlabel=True): %s""" % (
            self.num_graph,
            self.gclasses,
            self.n,
            self.nclasses,
            self.dim_nfeats,
            self.m,
            self.eclasses,
            self.n / self.num_graph,
            self.m / self.num_graph,
            self.glabel_dict,
            self.nlabel_dict,
            self.ndegree_dict, )
        log.info(message)
+ +### Experiment results (Accuracy) +| |MUTAG | COLLAB | IMDBBINARY | IMDBMULTI | +|--|-------------|----------|------------|-----------------| +|PGL result | 90.8 | 78.6 | 76.8 | 50.8 | +|paper result |90.0 | 80.0 | 75.1 | 52.3 | diff --git a/examples/gin/dataloader.py b/examples/gin/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..d27f48be68e969e4374c69d3c5b5aa187c0512e4 --- /dev/null +++ b/examples/gin/dataloader.py @@ -0,0 +1,152 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file implement the graph dataloader.
def batch_iter(data, batch_size, fid, num_workers):
    """Yield shuffled batches of `data` that belong to worker `fid`.

    Indices are permuted with NumPy's global RNG and cut into consecutive
    batches. Batches are counted from 1 and only those whose counter
    satisfies `counter % num_workers == fid` are yielded. `data` must
    accept an index array in `data[index]`.
    """
    size = len(data)
    perm = np.arange(size)
    np.random.shuffle(perm)
    start = 0
    cc = 0
    while start < size:
        index = perm[start:start + batch_size]
        start += batch_size
        cc += 1
        if cc % num_workers != fid:
            continue
        yield data[index]


def scan_batch_iter(data, batch_size, fid, num_workers):
    """Yield in-order batches built from `data.scan()` for worker `fid`.

    Examples are counted from 1; an example is kept only when
    `counter % num_workers == fid`. The last batch may be shorter than
    `batch_size`.
    """
    batch = []
    cc = 0
    for line_example in data.scan():
        cc += 1
        if cc % num_workers != fid:
            continue
        batch.append(line_example)
        if len(batch) == batch_size:
            yield batch
            batch = []

    if len(batch) > 0:
        yield batch


class GraphDataloader(object):
    """Batches `(graph, label)` examples of a dataset into joined graphs.

    Iterating the loader yields `(pgl.graph.MultiGraph, labels)` pairs where
    `labels` is an int64 array of shape (batch, 1).
    """

    def __init__(
            self,
            dataset,
            batch_size,
            seed=0,
            num_workers=1,
            buf_size=1000,
            shuffle=True, ):

        self.shuffle = shuffle
        # NOTE: `seed` is stored but not used anywhere in this class.
        self.seed = seed
        self.num_workers = num_workers
        self.buf_size = buf_size
        self.batch_size = batch_size
        self.dataset = dataset

    def batch_fn(self, batch_examples):
        """Join the graphs of a batch and stack its labels."""
        graphs = [b[0] for b in batch_examples]
        labels = [b[1] for b in batch_examples]
        join_graph = pgl.graph.MultiGraph(graphs)
        labels = np.array(labels, dtype="int64").reshape(-1, 1)
        return join_graph, labels

    def batch_iter(self, fid):
        """Yield lists of examples for worker `fid`, shuffled if requested."""
        if self.shuffle:
            for batch in batch_iter(self, self.batch_size, fid,
                                    self.num_workers):
                yield batch
        else:
            for batch in scan_batch_iter(self, self.batch_size, fid,
                                         self.num_workers):
                yield batch

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one example, or a list of examples for an iterable index."""
        # `collections.Iterable` was removed in Python 3.10; the ABC lives in
        # `collections.abc`. The fallback keeps Python 2 working.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if isinstance(idx, Iterable):
            return [self[bidx] for bidx in idx]
        else:
            return self.dataset[idx]

    def __iter__(self):
        """Iterate over `batch_fn` outputs through a buffered reader."""

        def worker(filter_id):
            def func_run():
                for batch_examples in self.batch_iter(filter_id):
                    batch_dict = self.batch_fn(batch_examples)
                    yield batch_dict

            return func_run

        if self.num_workers == 1:
            r = paddle.reader.buffered(worker(0), self.buf_size)
        else:
            worker_pool = [worker(wid) for wid in range(self.num_workers)]
            # Separate name: the original rebound `worker` itself here.
            multi_reader = mp_reader.multiprocess_reader(
                worker_pool, use_pipe=True, queue_size=1000)
            r = paddle.reader.buffered(multi_reader, self.buf_size)

        for batch in r():
            yield batch

    def scan(self):
        """Yield the dataset's examples one by one, in order."""
        for example in self.dataset:
            yield example
def main(args):
    """Train a GIN model on one fold and evaluate it on the held-out fold.

    Builds the dataset and its 10-fold split, the train and inference
    programs, a piecewise-decayed Adam optimizer, then runs `args.epochs`
    epochs, logging every 10 steps.
    """
    # Self loops are added only when epsilon is not trainable.
    dataset = GINDataset(
        args.data_path,
        args.dataset_name,
        self_loop=not args.train_eps,
        degree_as_nlabel=True)
    train_dataset, test_dataset = fold10_split(
        dataset, fold_idx=args.fold_idx, seed=args.seed)

    train_loader = GraphDataloader(train_dataset, batch_size=args.batch_size)
    test_loader = GraphDataloader(
        test_dataset, batch_size=args.batch_size, shuffle=False)

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    train_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(train_program, startup_program):
        gw = pgl.graph_wrapper.GraphWrapper(
            "gw", place=place, node_feat=dataset[0][0].node_feat_info())

        model = GINModel(args, gw, dataset.gclasses)
        model.forward()

    # Cloned before the optimizer ops are appended to train_program.
    infer_program = train_program.clone(for_test=True)

    with fluid.program_guard(train_program, startup_program):
        epoch_step = int(len(train_dataset) / args.batch_size) + 1
        # Learning rate is halved every 50 epochs (expressed in steps).
        boundaries = [
            i
            for i in range(50 * epoch_step, args.epochs * epoch_step,
                           epoch_step * 50)
        ]
        values = [args.lr * 0.5**i for i in range(0, len(boundaries) + 1)]
        lr = fl.piecewise_decay(boundaries=boundaries, values=values)
        train_op = fluid.optimizer.Adam(lr).minimize(model.loss)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    # train and evaluate
    global_step = 0
    for epoch in range(1, args.epochs + 1):
        for idx, batch_data in enumerate(train_loader):
            g, labels = batch_data
            feed_dict = gw.to_feed(g)
            feed_dict['labels'] = labels
            ret_loss, ret_lr, ret_acc = exe.run(
                train_program,
                feed=feed_dict,
                fetch_list=[model.loss, lr, model.acc])

            global_step += 1
            if global_step % 10 == 0:
                message = "epoch %d | step %d | " % (epoch, global_step)
                message += "lr %.6f | loss %.6f | acc %.4f" % (
                    ret_lr, ret_loss, ret_acc)
                log.info(message)

        # evaluate
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; evaluation is presumed to run once per epoch - confirm.
        result = evaluate(exe, infer_program, model, gw, test_loader)

        message = "evaluating result"
        for key, value in result.items():
            message += " | %s %.6f" % (key, value)
        log.info(message)


def evaluate(exe, prog, model, gw, loader):
    """Run `prog` over every batch of `loader` and average loss and accuracy.

    Returns:
        dict with keys "loss" and "acc": the unweighted means of the
        per-batch values, so a smaller last batch counts as much as a full
        one.
    """
    total_loss = []
    total_acc = []
    for idx, batch_data in enumerate(loader):
        g, labels = batch_data
        feed_dict = gw.to_feed(g)
        feed_dict['labels'] = labels
        ret_loss, ret_acc = exe.run(prog,
                                    feed=feed_dict,
                                    fetch_list=[model.loss, model.acc])
        total_loss.append(ret_loss)
        total_acc.append(ret_acc)

    total_loss = np.mean(total_loss)
    total_acc = np.mean(total_acc)

    return {"loss": total_loss, "acc": total_acc}
class GINModel(object):
    """Graph-classification network built from stacked GIN layers.

    `forward()` must be called inside the program guard that should own the
    layers; it sets `self.loss` and `self.acc`.
    """

    def __init__(self, args, gw, num_class):
        # Hyper-parameters are read from `args` (num_layers, hidden_size,
        # train_eps, pool_type, dropout_prob).
        self.args = args
        self.num_layers = self.args.num_layers
        self.hidden_size = self.args.hidden_size
        self.train_eps = self.args.train_eps
        self.pool_type = self.args.pool_type
        self.dropout_prob = self.args.dropout_prob
        self.num_class = num_class

        self.gw = gw
        # Fed by the training loop under the name "labels".
        self.labels = fl.data(name="labels", shape=[None, 1], dtype="int64")

    def forward(self):
        """Build the network and define `self.loss` and `self.acc`."""
        # Entry 0 is the raw node attribute; entry i+1 is layer i's output.
        features_list = [self.gw.node_feat["attr"]]

        for i in range(self.num_layers):
            h = gin(self.gw,
                    features_list[i],
                    hidden_size=self.hidden_size,
                    activation="relu",
                    name="gin_%s" % (i),
                    init_eps=0.0,
                    train_eps=self.train_eps)

            h = fl.layer_norm(
                h,
                begin_norm_axis=1,
                param_attr=fluid.ParamAttr(
                    name="norm_scale_%s" % (i),
                    initializer=fluid.initializer.Constant(1.0)),
                bias_attr=fluid.ParamAttr(
                    name="norm_bias_%s" % (i),
                    initializer=fluid.initializer.Constant(0.0)), )

            h = fl.relu(h)

            features_list.append(h)

        # Every depth (input included) is pooled per graph, passed through
        # dropout and its own linear layer; the logits are summed.
        output = 0
        for i, h in enumerate(features_list):
            pooled_h = pgl.layers.graph_pooling(self.gw, h, self.pool_type)
            drop_h = fl.dropout(
                pooled_h,
                self.dropout_prob,
                dropout_implementation="upscale_in_train")
            output += fl.fc(drop_h,
                            size=self.num_class,
                            act=None,
                            param_attr=fluid.ParamAttr(name="final_fc_%s" %
                                                       (i)))

        # calculate loss
        self.loss = fl.softmax_with_cross_entropy(output, self.labels)
        self.loss = fl.reduce_mean(self.loss)
        self.acc = fl.accuracy(fl.softmax(output), self.labels)
-- paddlepaddle>=1.7 -- pgl - - -## Experiment results -FB15k dataset - -| Models |Mean Rank| Mrr | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter| -|----------|-------|-------|--------|--------|---------|---------|---------| -| TransE| 214 | -- | -- | -- | 0.491 | 118 | 0.668| -| TransR| 202 | -- | -- | -- | 0.502 | 115 | 0.683| -| RotatE| 156| -- | -- | -- | 0.498 | 52 | 0.710| - -WN18 dataset - -| Models |Mean Rank| Mrr | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter| -|----------|-------|-------|--------|--------|---------|---------|---------| -| TransE| 257 | -- | -- | -- | 0.800 | 245 | 0.915| -| TransR| 255 | -- | -- | -- | 0.8012| 243 | 0.9371| -| RotatE| 188 | -- | -- | -- | 0.8325| 176 | 0.9601| - -## References - -[1]. TransE https://ieeexplore.ieee.org/abstract/document/8047276 -[2]. TransR http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523 -[3]. RotatE https://arxiv.org/abs/1902.10197 diff --git a/examples/kg/model/utils.py b/examples/kg/model/utils.py deleted file mode 100644 index b6952316db3353362e57dca4e9d08204134ad4be..0000000000000000000000000000000000000000 --- a/examples/kg/model/utils.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utils for the models. 
-""" -import paddle.fluid as fluid -from paddle.fluid.layer_helper import LayerHelper - - -def lookup_table(input, embedding_table, dtype='float32'): - """ - lookup table support for paddle. - :param input: - :param embedding_table: - :param dtype: - :return: - """ - is_sparse = False - is_distributed = False - helper = LayerHelper('embedding', **locals()) - remote_prefetch = is_sparse and (not is_distributed) - if remote_prefetch: - assert is_sparse is True and is_distributed is False - tmp = helper.create_variable_for_type_inference(dtype) - padding_idx = -1 - helper.append_op( - type='lookup_table', - inputs={'Ids': input, - 'W': embedding_table}, - outputs={'Out': tmp}, - attrs={ - 'is_sparse': is_sparse, - 'is_distributed': is_distributed, - 'remote_prefetch': remote_prefetch, - 'padding_idx': padding_idx - }) - return tmp - - -def lookup_table_gather(index, input): - """ - lookup table support for paddle by gather. - :param index: - :param input: - :return: - """ - return fluid.layers.gather(index=index, input=input, overwrite=False) diff --git a/examples/kg/run.sh b/examples/kg/run.sh deleted file mode 100644 index fbc53e02aade4d9cb317ac374bb45247333b8520..0000000000000000000000000000000000000000 --- a/examples/kg/run.sh +++ /dev/null @@ -1,44 +0,0 @@ -#CUDA_VISIBLE_DEVICES=2 \ -#FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -#python main.py \ -# --use_cuda \ -# --model TransE \ -# --optimizer adam \ -# --batch_size=512 \ -# --learning_rate=0.001 \ -# --epoch 100 \ -# --evaluate_per_iteration 20 \ -# --sample_workers 4 \ -# --margin 4 \ -## #--only_evaluate - -#CUDA_VISIBLE_DEVICES=2 \ -#FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -#python main.py \ -# --use_cuda \ -# --model RotatE \ -# --data_dir ./data/WN18 \ -# --optimizer adam \ -# --batch_size=512 \ -# --learning_rate=0.001 \ -# --epoch 100 \ -# --evaluate_per_iteration 100 \ -# --sample_workers 10 \ -# --margin 6 \ -# --neg_times 10 - -CUDA_VISIBLE_DEVICES=2 \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ 
-python main.py \ - --use_cuda \ - --model RotatE \ - --data_dir ./data/FB15k \ - --optimizer adam \ - --batch_size=512 \ - --learning_rate=0.001 \ - --epoch 100 \ - --evaluate_per_iteration 100 \ - --sample_workers 10 \ - --margin 8 \ - --neg_times 10 \ - --neg_mode True diff --git a/examples/pgl-ke/README.md b/examples/pgl-ke/README.md new file mode 100644 index 0000000000000000000000000000000000000000..661791fc2b9a5815ee245a8d8b670acca3495d46 --- /dev/null +++ b/examples/pgl-ke/README.md @@ -0,0 +1,78 @@ +# PGL - Knowledge Graph Embedding + + +This package is mainly for computing node and relation embedding of knowledge graphs efficiently. + +This package reproduces the following knowledge embedding models: +- TransE +- TransR +- RotatE + +### Dataset + +The WN18 and FB15k datasets were originally published with the TransE paper and can be downloaded [here](https://everest.hds.utc.fr/doku.php?id=en:transe). + +FB15k: [https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8](https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8) + +WN18: [https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD](https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD) + +### Dependencies + +If you want to use the PGL-KG in paddle, please install the following packages. +- paddlepaddle>=1.7 +- pgl + +### Hyperparameters + +- use\_cuda: use cuda to train. +- model: pgl-kg model names. Now available for `TransE`, `TransR` and `RotatE`. +- data\_dir: the data path of dataset. +- optimizer: optimizer to run the model. +- batch\_size: batch size. +- learning\_rate: learning rate. +- epoch: epochs to run. +- evaluate\_per\_iteration: evaluate after certain epochs. +- sample\_workers: number of sample workers used to prepare data. +- margin: hyper-parameter for some model. + +For more hyper parameters usages, please refer to `main.py`.
We also provide a `run.sh` script to reproduce the performance results (please download the dataset into `./data` and specify the data\_dir parameter). + + +### How to run + +For example, use the GPU to train the TransR model on the WN18 dataset. +(please download the WN18 dataset to the `./data` folder) +``` +python main.py --use_cuda --model TransR --data_dir ./data/WN18 +``` +We also provide a `run.sh` script to reproduce the following performance results. + +### Experiment results + +Here we report the experiment results on the FB15k and WN18 datasets. The evaluation criteria are MR (mean rank), Mrr (mean reciprocal rank), Hits@N (the top-N hit rate). The suffix `@f` means that we filter out the existing relations of entities. + +FB15k dataset + +| Models | MR | Mrr | Hits@1 | Hits@3 | Hits@10| MR@f |Mrr@f|Hit1@f|Hit3@f|Hits10@f| +|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------| +| TransE | 215 | 0.205 | 0.093 | 0.234 | 0.446 | 74 |0.379| 0.235| 0.453| 0.647 | +| TransR | 304 | 0.193 | 0.092 | 0.211 | 0.418 | 156 |0.366| 0.232| 0.435| 0.623 | +| RotatE | 157 | 0.270 | 0.162 | 0.303 | 0.501 | 53 |0.478| 0.354| 0.547| 0.710 | + + +WN18 dataset + +| Models | MR | Mrr | Hits@1 | Hits@3 | Hits@10| MR@f |Mrr@f|Hit1@f|Hit3@f|Hits10@f| +|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------| +| TransE | 219 | 0.338 | 0.082 | 0.523 | 0.800 | 208 |0.463| 0.135| 0.771| 0.932 | +| TransR | 321 | 0.370 | 0.096 | 0.591 | 0.810 | 309 |0.513| 0.158| 0.860| 0.941 | +| RotatE | 167 | 0.623 | 0.476 | 0.688 | 0.830 | 155 |0.915| 0.884| 0.941| 0.957 | + + +## References + +[1]. [TransE: Translating embeddings for modeling multi-relational data.](https://ieeexplore.ieee.org/abstract/document/8047276) + +[2]. [TransR: Learning entity and relation embeddings for knowledge graph completion.](http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523) + +[3]. 
[RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space.](https://arxiv.org/abs/1902.10197) diff --git a/examples/kg/data_loader.py b/examples/pgl-ke/data_loader.py similarity index 97% rename from examples/kg/data_loader.py rename to examples/pgl-ke/data_loader.py index 09f2435aa1f16a78819c9ed9a0e27a29e2a0b130..e0aeb295ff5ace50ae24ce7a69fd0a601b4e598b 100644 --- a/examples/kg/data_loader.py +++ b/examples/pgl-ke/data_loader.py @@ -19,10 +19,11 @@ import os import numpy as np from collections import defaultdict from pgl.utils.logger import log -from pybloom import BloomFilter +#from pybloom import BloomFilter -class KBloader: + +class KGLoader: """ load the FB15K """ @@ -65,8 +66,9 @@ class KBloader: def training_data_no_filter(self, train_triple_positive): """faster, no filter for exists triples""" - size = len(train_triple_positive) - train_triple_negative = train_triple_positive + 0 + size = len(train_triple_positive) * self._neg_times + train_triple_negative = train_triple_positive.repeat( + self._neg_times, axis=0) replace_head_probability = 0.5 * np.ones(size) replace_entity_id = np.random.randint(self.entity_total, size=size) random_num = np.random.random(size=size) @@ -122,7 +124,6 @@ class KBloader: """ n = len(self._triple_train) rand_idx = np.random.permutation(n) - rand_idx = rand_idx % n n_triple = len(rand_idx) start = 0 while start < n_triple: diff --git a/examples/kg/evalutate.py b/examples/pgl-ke/evalutate.py similarity index 95% rename from examples/kg/evalutate.py rename to examples/pgl-ke/evalutate.py index 389b211f43517460488498b945b2e0863ce88796..09430288aff9fde484763542713b4dfc34f3eb54 100644 --- a/examples/kg/evalutate.py +++ b/examples/pgl-ke/evalutate.py @@ -99,8 +99,10 @@ class Evaluate: feed=batch_feed_dict) yield batch_feed_dict["test_triple"], head_score, tail_score n_used_eval_triple += 1 - print('[{:.3f}s] #evaluation triple: {}/{}'.format( - timeit.default_timer() - start, n_used_eval_triple, 5000)) + if 
n_used_eval_triple % 500 == 0: + print('[{:.3f}s] #evaluation triple: {}/{}'.format( + timeit.default_timer( + ) - start, n_used_eval_triple, self.reader.test_num)) res_reader = mp_reader_mapper( reader=iterator, diff --git a/examples/kg/main.py b/examples/pgl-ke/main.py similarity index 81% rename from examples/kg/main.py rename to examples/pgl-ke/main.py index 84e0add710a40e53c6f0a36897b44c6e07c4fd69..3434dfd42c01bfb917124e2d116a75d9e92ff59c 100644 --- a/examples/kg/main.py +++ b/examples/pgl-ke/main.py @@ -16,10 +16,13 @@ The script to run these models. """ import argparse import timeit +import os +import numpy as np import paddle.fluid as fluid -from data_loader import KBloader +from data_loader import KGLoader from evalutate import Evaluate from model import model_dict +from model.utils import load_var from mp_mapper import mp_reader_mapper from pgl.utils.logger import log @@ -49,6 +52,7 @@ def run_round(batch_iter, run_time = 0 data_time = 0 t2 = timeit.default_timer() + start_epoch_time = timeit.default_timer() for batch_feed_dict in batch_iter(): batch += 1 t1 = timeit.default_timer() @@ -62,8 +66,11 @@ def run_round(batch_iter, if batch % log_per_step == 0: tmp_epoch += 1 if prefix == "train": - log.info("Epoch %s Ava Loss %s" % - (epoch + tmp_epoch, tmp_loss / batch)) + log.info("Epoch %s (%.7f sec) Train Loss: %.7f" % + (epoch + tmp_epoch, + timeit.default_timer() - start_epoch_time, + tmp_loss[0] / batch)) + start_epoch_time = timeit.default_timer() else: log.info("Batch %s" % batch) batch = 0 @@ -84,7 +91,7 @@ def train(args): :param args: all args. 
:return: None """ - kgreader = KBloader( + kgreader = KGLoader( batch_size=args.batch_size, data_dir=args.data_dir, neg_mode=args.neg_mode, @@ -117,8 +124,8 @@ def train(args): reader = mp_reader_mapper( data_repeat, - func=kgreader.training_data_map, - #func=kgreader.training_data_no_filter, + func=kgreader.training_data_no_filter + if args.nofilter else kgreader.training_data_map, num_works=args.sample_workers) return reader @@ -148,6 +155,20 @@ def train(args): exe = fluid.Executor(places[0]) exe.run(model.startup_program) exe.run(fluid.default_startup_program()) + if args.pretrain and model.model_name in ["TransR", "transr"]: + pretrain_ent = os.path.join(args.checkpoint, + model.ent_name.replace("TransR", "TransE")) + pretrain_rel = os.path.join(args.checkpoint, + model.rel_name.replace("TransR", "TransE")) + if os.path.exists(pretrain_ent): + print("loading pretrain!") + #var = fluid.global_scope().find_var(model.ent_name) + load_var(exe, model.train_program, model.ent_name, pretrain_ent) + #var = fluid.global_scope().find_var(model.rel_name) + load_var(exe, model.train_program, model.rel_name, pretrain_rel) + else: + raise ValueError("pretrain file {} not exists!".format( + pretrain_ent)) prog = fluid.CompiledProgram(model.train_program).with_data_parallel( loss_name=model.train_fetch_vars[0].name) @@ -182,9 +203,9 @@ def train(args): log_per_step=kgreader.train_num // args.batch_size, epoch=epoch * args.evaluate_per_iteration) log.info("epoch\t%s" % ((1 + epoch) * args.evaluate_per_iteration)) - if True: - fluid.io.save_params( - exe, dirname=args.checkpoint, main_program=model.train_program) + fluid.io.save_params( + exe, dirname=args.checkpoint, main_program=model.train_program) + if not args.noeval: eva = Evaluate(kgreader) eva.launch_evaluation( exe=exe, @@ -273,6 +294,22 @@ def main(): parser.add_argument( '--neg_mode', type=bool, help='return neg mode flag', default=False) + parser.add_argument( + '--nofilter', + type=bool, + help='don\'t filter 
invalid examples', + default=False) + parser.add_argument( + '--pretrain', + type=bool, + help='pretrain for TransR model', + default=False) + parser.add_argument( + '--noeval', + type=bool, + help='whether to evaluate the result', + default=False) + args = parser.parse_args() log.info(args) train(args) diff --git a/examples/kg/model/Model.py b/examples/pgl-ke/model/Model.py similarity index 100% rename from examples/kg/model/Model.py rename to examples/pgl-ke/model/Model.py diff --git a/examples/kg/model/RotatE.py b/examples/pgl-ke/model/RotatE.py similarity index 98% rename from examples/kg/model/RotatE.py rename to examples/pgl-ke/model/RotatE.py index 610a2c0c95d41c5538bd3a9a6ecd380a0dcb09d5..3e243a018694ad4df14e856f93f6390e3e34019a 100644 --- a/examples/kg/model/RotatE.py +++ b/examples/pgl-ke/model/RotatE.py @@ -13,9 +13,9 @@ # limitations under the License. """ RotatE: -"Learning entity and relation embeddings for knowledge graph completion." -Lin, Yankai, et al. -https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9571/9523 +"RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space." +Sun, Zhiqing, et al. 
+https://arxiv.org/abs/1902.10197 """ import paddle.fluid as fluid from .Model import Model diff --git a/examples/kg/model/TransE.py b/examples/pgl-ke/model/TransE.py similarity index 96% rename from examples/kg/model/TransE.py rename to examples/pgl-ke/model/TransE.py index 232b16253bcd5239439cf26ce24c8a97c97cc4da..415290631e6e6420c09581baf80c7b8332b1dcc1 100644 --- a/examples/kg/model/TransE.py +++ b/examples/pgl-ke/model/TransE.py @@ -34,6 +34,7 @@ class TransE(Model): learning_rate, args, optimizer="adam"): + self._neg_times = args.neg_times super(TransE, self).__init__( model_name="TransE", data_reader=data_reader, @@ -84,6 +85,9 @@ class TransE(Model): fluid.layers.abs(pos_score), 1, keep_dim=False) neg = fluid.layers.reduce_sum( fluid.layers.abs(neg_score), 1, keep_dim=False) + neg = fluid.layers.reshape( + neg, shape=[-1, self._neg_times], inplace=True) + loss = fluid.layers.reduce_mean( fluid.layers.relu(pos - neg + self._margin)) return [loss] diff --git a/examples/kg/model/TransR.py b/examples/pgl-ke/model/TransR.py similarity index 93% rename from examples/kg/model/TransR.py rename to examples/pgl-ke/model/TransR.py index ae9daf5d043a33b6f3e9f6e800c1640a9ab5ee14..1b2fca3c076c56aaf3f1560004c31804b894931a 100644 --- a/examples/kg/model/TransR.py +++ b/examples/pgl-ke/model/TransR.py @@ -36,6 +36,7 @@ class TransR(Model): args, optimizer="adam"): """init""" + self._neg_times = args.neg_times super(TransR, self).__init__( model_name="TransR", data_reader=data_reader, @@ -60,19 +61,19 @@ class TransR(Model): dtype="float32", name=self.rel_name, default_initializer=fluid.initializer.Xavier()) + init_values = np.tile( + np.identity( + self._hidden_size, dtype="float32").reshape(-1), + (self._relation_total, 1)) transfer_matrix = fluid.layers.create_parameter( shape=[ self._relation_total, self._hidden_size * self._hidden_size ], dtype="float32", - name=self._prefix + "transfer_matrix", ) - # Here is a trick, must init with identity matrix to get good hit@10 
performance. - fluid.layers.assign( - np.tile( - np.identity( - self._hidden_size, dtype="float32").reshape(-1), - (self._relation_total, 1)), - transfer_matrix) + name=self._prefix + "transfer_matrix", + default_initializer=fluid.initializer.NumpyArrayInitializer( + init_values)) + return entity_embedding, relation_embedding, transfer_matrix def score_with_l2_normalize(self, head, rel, tail): @@ -111,7 +112,7 @@ class TransR(Model): pos_head_trans = self.matmul_with_expend_dims(pos_head, rel_matrix) pos_tail_trans = self.matmul_with_expend_dims(pos_tail, rel_matrix) - trans_neg = False + trans_neg = True if trans_neg: rel_matrix_neg = fluid.layers.reshape( lookup_table(self.train_neg_input[:, 1], transfer_matrix), @@ -133,6 +134,9 @@ class TransR(Model): fluid.layers.abs(pos_score), -1, keep_dim=False) neg = fluid.layers.reduce_sum( fluid.layers.abs(neg_score), -1, keep_dim=False) + neg = fluid.layers.reshape( + neg, shape=[-1, self._neg_times], inplace=True) + loss = fluid.layers.reduce_mean( fluid.layers.relu(pos - neg + self._margin)) return [loss] diff --git a/examples/kg/model/__init__.py b/examples/pgl-ke/model/__init__.py similarity index 100% rename from examples/kg/model/__init__.py rename to examples/pgl-ke/model/__init__.py diff --git a/examples/pgl-ke/model/utils.py b/examples/pgl-ke/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0b11fd4b5177df48a1a7031b55125344c925bb3e --- /dev/null +++ b/examples/pgl-ke/model/utils.py @@ -0,0 +1,119 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils for the models. +""" +import paddle.fluid as fluid +from paddle.fluid.layer_helper import LayerHelper + + +def lookup_table(input, embedding_table, dtype='float32'): + """ + lookup table support for paddle. + :param input: + :param embedding_table: + :param dtype: + :return: + """ + is_sparse = False + is_distributed = False + helper = LayerHelper('embedding', **locals()) + remote_prefetch = is_sparse and (not is_distributed) + if remote_prefetch: + assert is_sparse is True and is_distributed is False + tmp = helper.create_variable_for_type_inference(dtype) + padding_idx = -1 + helper.append_op( + type='lookup_table', + inputs={'Ids': input, + 'W': embedding_table}, + outputs={'Out': tmp}, + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, + 'padding_idx': padding_idx + }) + return tmp + + +def lookup_table_gather(index, input): + """ + lookup table support for paddle by gather. 
+ :param index: + :param input: + :return: + """ + return fluid.layers.gather(index=index, input=input, overwrite=False) + + +def _clone_var_in_block_(block, var): + assert isinstance(var, fluid.Variable) + if var.desc.type() == fluid.core.VarDesc.VarType.LOD_TENSOR: + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=True) + else: + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=True) + + +def load_var(executor, main_program=None, var=None, filename=None): + """ + load_var to certain program + :param executor: executor + :param main_program: the program to load + :param var: the variable name in main_program. + :file_name: the file name of the file to load. + :return: None + """ + load_prog = fluid.Program() + load_block = load_prog.global_block() + + if main_program is None: + main_program = fluid.default_main_program() + + if not isinstance(main_program, fluid.Program): + raise TypeError("program should be as Program type or None") + + vars = list(filter(None, main_program.list_vars())) + # save origin param shape + orig_para_shape = {} + load_var_map = {} + for each_var in vars: + if each_var.name != var: + continue + assert isinstance(each_var, fluid.Variable) + if each_var.type == fluid.core.VarDesc.VarType.RAW: + continue + + if isinstance(each_var, fluid.framework.Parameter): + orig_para_shape[each_var.name] = tuple(each_var.desc.get_shape()) + new_var = _clone_var_in_block_(load_block, each_var) + if filename is not None: + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [new_var]}, + attrs={'file_path': filename}) + + executor.run(load_prog) diff --git a/examples/kg/mp_mapper.py b/examples/pgl-ke/mp_mapper.py similarity index 94% rename from examples/kg/mp_mapper.py rename to examples/pgl-ke/mp_mapper.py index 
9c4fbeb436eff3d105723ba0447fca6574ae3ec7..b79f29eb0bdc3f0204c350034027526ad7064089 100644 --- a/examples/kg/mp_mapper.py +++ b/examples/pgl-ke/mp_mapper.py @@ -65,12 +65,16 @@ def mp_reader_mapper(reader, func, num_works=4): all_process.append(p) data_iter = reader() + if not hasattr(data_iter, "__next__"): + __next__ = data_iter.next + else: + __next__ = data_iter.__next__ def next_data(): """next_data""" _next = None try: - _next = data_iter.next() + _next = __next__() except StopIteration: # log.debug(traceback.format_exc()) pass diff --git a/examples/pgl-ke/run.sh b/examples/pgl-ke/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa74b167e6e2faf8ecdda5ab0ae6bc24c8ae0ebf --- /dev/null +++ b/examples/pgl-ke/run.sh @@ -0,0 +1,186 @@ +device=3 + +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransE \ + --data_dir ./data/FB15k \ + --optimizer adam \ + --batch_size=1024 \ + --learning_rate=0.001 \ + --epoch 200 \ + --evaluate_per_iteration 200 \ + --sample_workers 1 \ + --margin 1.0 \ + --nofilter True \ + --neg_times 10 \ + --neg_mode True + #--only_evaluate + +# TransE FB15k +# -----Raw-Average-Results +# MeanRank: 214.94, MRR: 0.2051, Hits@1: 0.0929, Hits@3: 0.2343, Hits@10: 0.4458 +# -----Filter-Average-Results +# MeanRank: 74.41, MRR: 0.3793, Hits@1: 0.2351, Hits@3: 0.4538, Hits@10: 0.6570 + + + +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransE \ + --data_dir ./data/WN18 \ + --optimizer adam \ + --batch_size=1024 \ + --learning_rate=0.001 \ + --epoch 100 \ + --evaluate_per_iteration 100 \ + --sample_workers 1 \ + --margin 4 \ + --nofilter True \ + --neg_times 10 \ + --neg_mode True + +# TransE WN18 +# -----Raw-Average-Results +# MeanRank: 219.08, MRR: 0.3383, Hits@1: 0.0821, Hits@3: 0.5233, Hits@10: 0.7997 +# -----Filter-Average-Results +# MeanRank: 207.72, MRR: 0.4631, Hits@1: 0.1349, 
Hits@3: 0.7708, Hits@10: 0.9315 + + + +#for prertrain +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransE \ + --data_dir ./data/FB15k \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 30 \ + --evaluate_per_iteration 30 \ + --sample_workers 1 \ + --margin 2.0 \ + --nofilter True \ + --noeval True \ + --neg_times 10 \ + --neg_mode True && \ +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransR \ + --data_dir ./data/FB15k \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 200 \ + --evaluate_per_iteration 200 \ + --sample_workers 1 \ + --margin 2.0 \ + --pretrain True \ + --nofilter True \ + --neg_times 10 \ + --neg_mode True + +# FB15k TransR 200, pretrain 20 +# -----Raw-Average-Results +# MeanRank: 303.81, MRR: 0.1931, Hits@1: 0.0920, Hits@3: 0.2109, Hits@10: 0.4181 +# -----Filter-Average-Results +# MeanRank: 156.30, MRR: 0.3663, Hits@1: 0.2318, Hits@3: 0.4352, Hits@10: 0.6231 + + + +# for pretrain +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransE \ + --data_dir ./data/WN18 \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 30 \ + --evaluate_per_iteration 30 \ + --sample_workers 1 \ + --margin 4.0 \ + --nofilter True \ + --noeval True \ + --neg_times 10 \ + --neg_mode True && \ +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model TransR \ + --data_dir ./data/WN18 \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 100 \ + --evaluate_per_iteration 100 \ + --sample_workers 1 \ + --margin 4.0 \ + --pretrain True \ + --nofilter True \ + --neg_times 10 \ + --neg_mode True + +# TransR WN18 100, pretrain 30 +# -----Raw-Average-Results +# MeanRank: 321.41, MRR: 0.3706, Hits@1: 
0.0955, Hits@3: 0.5906, Hits@10: 0.8099 +# -----Filter-Average-Results +# MeanRank: 309.15, MRR: 0.5126, Hits@1: 0.1584, Hits@3: 0.8601, Hits@10: 0.9409 + + + +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model RotatE \ + --data_dir ./data/FB15k \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 100 \ + --evaluate_per_iteration 100 \ + --sample_workers 10 \ + --margin 8 \ + --neg_times 10 \ + --neg_mode True + +# RotatE FB15k +# -----Raw-Average-Results +# MeanRank: 156.85, MRR: 0.2699, Hits@1: 0.1615, Hits@3: 0.3031, Hits@10: 0.5006 +# -----Filter-Average-Results +# MeanRank: 53.35, MRR: 0.4776, Hits@1: 0.3537, Hits@3: 0.5473, Hits@10: 0.7062 + + + +CUDA_VISIBLE_DEVICES=$device \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python main.py \ + --use_cuda \ + --model RotatE \ + --data_dir ./data/WN18 \ + --optimizer adam \ + --batch_size=512 \ + --learning_rate=0.001 \ + --epoch 100 \ + --evaluate_per_iteration 100 \ + --sample_workers 10 \ + --margin 6 \ + --neg_times 10 \ + --neg_mode True + +# RotaE WN18 +# -----Raw-Average-Results +# MeanRank: 167.27, MRR: 0.6025, Hits@1: 0.4764, Hits@3: 0.6880, Hits@10: 0.8298 +# -----Filter-Average-Results +# MeanRank: 155.23, MRR: 0.9145, Hits@1: 0.8843, Hits@3: 0.9412, Hits@10: 0.9570 diff --git a/ogb_examples/graphproppred/main_pgl.py b/ogb_examples/graphproppred/main_pgl.py deleted file mode 100644 index 1cc505eeffb2c9ec143540edaf7abc864dfe7200..0000000000000000000000000000000000000000 --- a/ogb_examples/graphproppred/main_pgl.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""test ogb -""" -import argparse - -import pgl -import numpy as np -import paddle.fluid as fluid -from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset -from pgl.utils import paddle_helper -from ogb.graphproppred import Evaluator -from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder - - -def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset, - evaluator, fetch_loss, fetch_pred): - """Train""" - graphs, labels = dataset[splitted_idx["train"]] - perm = np.arange(0, len(graphs)) - np.random.shuffle(perm) - start_batch = 0 - batch_no = 0 - pred_output = np.zeros_like(labels, dtype="float32") - while start_batch < len(perm): - batch_index = perm[start_batch:start_batch + batch_size] - start_batch += batch_size - batch_graph = pgl.graph.MultiGraph(graphs[batch_index]) - batch_label = labels[batch_index] - batch_valid = (batch_label == batch_label).astype("float32") - batch_label = np.nan_to_num(batch_label).astype("float32") - feed_dict = graph_wrapper.to_feed(batch_graph) - feed_dict["label"] = batch_label - feed_dict["weight"] = batch_valid - loss, pred = exe.run(train_program, - feed=feed_dict, - fetch_list=[fetch_loss, fetch_pred]) - pred_output[batch_index] = pred - batch_no += 1 - print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output})) - - -def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx, - dataset, mode, evaluator, fetch_pred): - """Eval""" - graphs, labels = dataset[splitted_idx[mode]] - perm = np.arange(0, len(graphs)) - start_batch = 0 - 
batch_no = 0 - pred_output = np.zeros_like(labels, dtype="float32") - while start_batch < len(perm): - batch_index = perm[start_batch:start_batch + batch_size] - start_batch += batch_size - batch_graph = pgl.graph.MultiGraph(graphs[batch_index]) - feed_dict = graph_wrapper.to_feed(batch_graph) - pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred]) - pred_output[batch_index] = pred[0] - batch_no += 1 - print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output})) - - -def send_func(src_feat, dst_feat, edge_feat): - """Send""" - return src_feat["h"] + edge_feat["h"] - - -class GNNModel(object): - """GNNModel""" - - def __init__(self, name, emb_dim, num_task, num_layers): - self.num_task = num_task - self.emb_dim = emb_dim - self.num_layers = num_layers - self.name = name - self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim) - self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim) - - def forward(self, graph): - """foward""" - h_node = self.atom_encoder(graph.node_feat['feat']) - h_edge = self.bond_encoder(graph.edge_feat['feat']) - for layer in range(self.num_layers): - msg = graph.send( - send_func, - nfeat_list=[("h", h_node)], - efeat_list=[("h", h_edge)]) - h_node = graph.recv(msg, 'sum') + h_node - h_node = fluid.layers.fc(h_node, - size=self.emb_dim, - name=self.name + '_%s' % layer, - act="relu") - graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average") - graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final") - return graph_pred - - -def main(): - """main - """ - # Training settings - parser = argparse.ArgumentParser(description='Graph Dataset') - parser.add_argument( - '--epochs', - type=int, - default=100, - help='number of epochs to train (default: 100)') - parser.add_argument( - '--dataset', - type=str, - default="ogbg-mol-tox21", - help='dataset name (default: proteinfunc)') - args = parser.parse_args() - - place = fluid.CPUPlace() # Dataset too big to use GPU - - ### automatic dataloading and 
splitting - dataset = PglGraphPropPredDataset(name=args.dataset) - splitted_idx = dataset.get_idx_split() - - ### automatic evaluator. takes dataset name as input - evaluator = Evaluator(args.dataset) - - graph_data, label = dataset[:2] - batch_graph = pgl.graph.MultiGraph(graph_data) - graph_data = batch_graph - - train_program = fluid.Program() - startup_program = fluid.Program() - test_program = fluid.Program() - # degree normalize - graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64") - graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64") - - model = GNNModel( - name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2) - - with fluid.program_guard(train_program, startup_program): - gw = pgl.graph_wrapper.GraphWrapper( - "graph", - node_feat=graph_data.node_feat_info(), - edge_feat=graph_data.edge_feat_info()) - pred = model.forward(gw) - sigmoid_pred = fluid.layers.sigmoid(pred) - - val_program = train_program.clone(for_test=True) - - initializer = [] - with fluid.program_guard(train_program, startup_program): - train_label = fluid.layers.data( - name="label", dtype="float32", shape=[None, dataset.num_tasks]) - train_weight = fluid.layers.data( - name="weight", dtype="float32", shape=[None, dataset.num_tasks]) - train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits( - x=pred, label=train_label) * train_weight - train_loss_t = fluid.layers.reduce_sum(train_loss_t) - - adam = fluid.optimizer.Adam( - learning_rate=1e-2, - regularization=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=0.0005)) - adam.minimize(train_loss_t) - - exe = fluid.Executor(place) - exe.run(startup_program) - - for epoch in range(1, args.epochs + 1): - print("Epoch", epoch) - train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator, - train_loss_t, sigmoid_pred) - evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid", - evaluator, sigmoid_pred) - evaluate(exe, 128, gw, val_program, 
splitted_idx, dataset, "test", - evaluator, sigmoid_pred) - - -if __name__ == "__main__": - main() diff --git a/ogb_examples/graphproppred/mol/README.md b/ogb_examples/graphproppred/mol/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d1f4da579a7ce909bfa00f6d41cb2470ca64df93 --- /dev/null +++ b/ogb_examples/graphproppred/mol/README.md @@ -0,0 +1,37 @@ +# Graph Property Prediction for Open Graph Benchmark (OGB) + +[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we complete the Graph Property Prediction task based on PGL. + +### Requirements + +- paddlepaddle >= 1.7.1 +- pgl 1.0.2 +- ogb + +NOTE: To install the ogb version that fits this project, run the commands below: +``` +git clone https://github.com/snap-stanford/ogb.git +git checkout 482c40bc9f31fe25f9df5aa11c8fb657bd2b1621 +python setup.py install +``` + +### How to run +For example, use the GPU to train the model on the ogbg-molhiv and ogbg-molpcba datasets. +``` +CUDA_VISIBLE_DEVICES=1 python -u main.py --config hiv_config.yaml --use_cuda + +CUDA_VISIBLE_DEVICES=2 python -u main.py --config pcba_config.yaml --use_cuda +``` + +If you want to use the CPU to train the model, the environment variable `CPU_NUM` should be specified and should be in the range of 1 to N, where N is the total number of CPUs on your machine. 
+``` +CPU_NUM=1 python -u main.py --config hiv_config.yaml + +CPU_NUM=1 python -u main.py --config pcba_config.yaml +``` + +### Experiment results + +| model | hiv (rocauc)| pcba (prcauc)| +|-------|-------------|--------------| +| GIN |0.7719 (0.0079) | 0.2232 (0.0018) | diff --git a/ogb_examples/graphproppred/mol/args.py b/ogb_examples/graphproppred/mol/args.py new file mode 100644 index 0000000000000000000000000000000000000000..b637a5ad41471032268e6a00b3e759d1e27dec4b --- /dev/null +++ b/ogb_examples/graphproppred/mol/args.py @@ -0,0 +1,104 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import time +import argparse + +from utils.args import ArgumentGroup + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument('--use_cuda', action='store_true') +model_g = ArgumentGroup(parser, "model", "model configuration and paths.") +model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") +model_g.add_arg("init_pretraining_params", str, None, + "Init pre-training params which preforms fine-tuning from. 
If the " + "arg 'init_checkpoint' has been set, this argument wouldn't be valid.") +model_g.add_arg("./save_dir", str, "./checkpoints", "Path to save checkpoints.") +model_g.add_arg("hidden_size", int, 128, "hidden size.") + + +train_g = ArgumentGroup(parser, "training", "training options.") +train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") +train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.") +train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", + "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay']) +train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.") +train_g.add_arg("warmup_proportion", float, 0.1, + "Proportion of training steps to perform linear learning rate warmup for.") +train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.") +train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.") +train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.") +train_g.add_arg("init_loss_scaling", float, 102400, + "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.") + +train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save") +train_g.add_arg("metric", str, "simple_accuracy", "metric") +train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.") +train_g.add_arg("decr_every_n_nan_or_inf", int, 2, + "Decreases loss scaling every n accumulated steps with nan or inf gradients.") +train_g.add_arg("incr_ratio", float, 2.0, + "The multiplier to use when increasing the loss scaling.") +train_g.add_arg("decr_ratio", float, 0.8, + "The less-than-one-multiplier to use when decreasing.") + + + + +log_g = ArgumentGroup(parser, "logging", "logging related.") +log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.") +log_g.add_arg("verbose", bool, False, 
"Whether to output verbose log.") +log_g.add_arg("log_dir", str, './logs/', "Whether to output verbose log.") + +data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options") +data_g.add_arg("tokenizer", str, "FullTokenizer", + "ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer") +data_g.add_arg("train_set", str, None, "Path to training data.") +data_g.add_arg("test_set", str, None, "Path to test data.") +data_g.add_arg("dev_set", str, None, "Path to validation data.") +data_g.add_arg("aug1_type", str, "scheme1", "augment type") +data_g.add_arg("aug2_type", str, "scheme1", "augment type") +data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.") +data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.") +data_g.add_arg("random_seed", int, None, "Random seed.") +data_g.add_arg("buf_size", int, 1000, "Random seed.") + +run_type_g = ArgumentGroup(parser, "run_type", "running type options.") +run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.") +run_type_g.add_arg("do_train", bool, True, "Whether to perform training.") +run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.") +run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.") +run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.") +run_type_g.add_arg("shuffle", bool, True, "") +run_type_g.add_arg("for_cn", bool, True, "model train for cn or for other langs.") +run_type_g.add_arg("num_workers", int, 1, "use multiprocess to generate graph") +run_type_g.add_arg("output_dir", str, None, "path to save model") +run_type_g.add_arg("config", str, None, "configure yaml file") +run_type_g.add_arg("n", str, None, "task name") +run_type_g.add_arg("task_name", str, None, 
"task name") +run_type_g.add_arg("pretrain", bool, False, "Whether do pretrian") +run_type_g.add_arg("pretrain_name", str, None, "pretrain task name") +run_type_g.add_arg("pretrain_config", str, None, "pretrain config.yaml file") +run_type_g.add_arg("pretrain_model_step", str, None, "pretrain model step") +run_type_g.add_arg("model_type", str, "BaseLineModel", "pretrain model step") +run_type_g.add_arg("num_class", int, 1, "number class") +run_type_g.add_arg("dataset_name", str, None, "finetune dataset name") +run_type_g.add_arg("eval_metrics", str, None, "evaluate metrics") +run_type_g.add_arg("task_type", str, None, "regression or classification") diff --git a/ogb_examples/graphproppred/mol/data/__init__.py b/ogb_examples/graphproppred/mol/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ogb_examples/graphproppred/mol/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ogb_examples/graphproppred/mol/data/base_dataset.py b/ogb_examples/graphproppred/mol/data/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e802ea5254100c121a216d9cb4f1cd0c1f264d9d --- /dev/null +++ b/ogb_examples/graphproppred/mol/data/base_dataset.py @@ -0,0 +1,83 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
class BaseDataset(object):
    """Minimal interface for an indexable dataset.

    Subclasses are expected to override ``__getitem__`` and ``__len__``;
    both raise ``NotImplementedError`` here.
    """

    def __init__(self):
        pass

    def __getitem__(self, idx):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError


class Subset(BaseDataset):
    r"""
    Subset of a dataset at specified indices.

    Arguments:
        dataset (Dataset): The whole Dataset
        indices (sequence): Indices in the whole set selected for subset
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        # Translate the subset-local position into an index of the parent.
        return self.dataset[self.indices[idx]]

    def __len__(self):
        return len(self.indices)


class Dataset(BaseDataset):
    """OGB graph-property-prediction dataset converted to PGL graphs.

    The raw dataset named by ``args.dataset_name`` is loaded with
    ``GraphPropPredDataset`` and every sample is converted to a
    ``pgl.graph.Graph`` eagerly in the constructor.

    Arguments:
        args: configuration object; only ``args.dataset_name`` is read here.
    """

    def __init__(self, args):
        self.args = args
        self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
        # Task metadata is copied straight from the raw dataset.
        self.num_tasks = self.raw_dataset.num_tasks
        self.eval_metrics = self.raw_dataset.eval_metric
        self.task_type = self.raw_dataset.task_type

        self.pgl_graph_list = []
        self.graph_label_list = []
        for i in range(len(self.raw_dataset)):
            graph, label = self.raw_dataset[i]
            # edge_index holds two parallel rows; pair them into (a, b) edges.
            edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
            g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)

            if graph["edge_feat"] is not None:
                g.edge_feat["feat"] = graph["edge_feat"]

            if graph["node_feat"] is not None:
                g.node_feat["feat"] = graph["node_feat"]

            self.pgl_graph_list.append(g)
            self.graph_label_list.append(label)

    def __getitem__(self, idx):
        """Return the ``(graph, label)`` pair stored at position ``idx``."""
        return self.pgl_graph_list[idx], self.graph_label_list[idx]

    def __len__(self):
        """Return the number of converted graphs."""
        # Fixed: the original referenced the undefined name `slef`.
        return len(self.pgl_graph_list)

    def get_idx_split(self):
        """Return the train/valid/test index split of the raw dataset."""
        return self.raw_dataset.get_idx_split()
try:
    # `collections.Iterable` was removed in Python 3.10; the ABC lives here.
    from collections.abc import Iterable as _Iterable
except ImportError:  # Python 2 fallback
    from collections import Iterable as _Iterable


def batch_iter(data, batch_size, fid, num_workers):
    """Yield shuffled batches of ``data`` that belong to worker ``fid``.

    A random permutation of ``range(len(data))`` is cut into consecutive
    slices of ``batch_size`` indices. A 1-based batch counter is taken
    modulo ``num_workers`` and only batches whose remainder equals ``fid``
    are yielded. Each batch is produced by indexing ``data`` with the
    numpy index array, so ``data`` must accept array indices.

    Shuffling uses the global numpy random state.
    """
    size = len(data)
    perm = np.arange(size)
    np.random.shuffle(perm)
    batch_no = 0
    for start in range(0, size, batch_size):
        index = perm[start:start + batch_size]
        batch_no += 1
        if batch_no % num_workers != fid:
            continue
        yield data[index]


def scan_batch_iter(data, batch_size, fid, num_workers):
    """Yield batches of examples from ``data.scan()`` in order.

    Filtering happens per example, not per batch: a 1-based example
    counter is taken modulo ``num_workers`` and only matching examples are
    kept. Kept examples are grouped into lists of ``batch_size``; a final
    shorter list is yielded if any examples remain.
    """
    batch = []
    example_no = 0
    for line_example in data.scan():
        example_no += 1
        if example_no % num_workers != fid:
            continue
        batch.append(line_example)
        if len(batch) == batch_size:
            yield batch
            batch = []

    if batch:
        yield batch


class GraphDataloader(object):
    """Batching loader that turns ``(graph, label)`` examples into feed dicts.

    Arguments:
        dataset: indexable, sized collection of ``(graph, label)`` pairs.
        graph_wrapper: object whose ``to_feed(graph)`` builds the feed dict.
        batch_size: number of examples per batch.
        seed: stored on the instance; not used by the code in this class.
        num_workers: number of reader workers; 1 means no multiprocessing.
        buf_size: buffer size passed to ``paddle.reader.buffered``.
        shuffle: shuffle batches if True, otherwise scan in dataset order.
    """

    def __init__(self,
                 dataset,
                 graph_wrapper,
                 batch_size,
                 seed=0,
                 num_workers=1,
                 buf_size=1000,
                 shuffle=True):

        self.shuffle = shuffle
        self.seed = seed
        self.num_workers = num_workers
        self.buf_size = buf_size
        self.batch_size = batch_size
        self.dataset = dataset
        self.graph_wrapper = graph_wrapper

    def batch_fn(self, batch_examples):
        """Merge a list of ``(graph, label)`` examples into one feed dict.

        The graphs are joined into a ``pgl.graph.MultiGraph``. The result
        carries two extra float32 entries: ``labels`` with NaN replaced by
        ``np.nan_to_num`` and ``unmask``, which is 0 where the label was NaN.
        """
        graphs = [b[0] for b in batch_examples]
        labels = [b[1] for b in batch_examples]
        join_graph = pgl.graph.MultiGraph(graphs)
        labels = np.array(labels)

        feed_dict = self.graph_wrapper.to_feed(join_graph)
        # NaN != NaN, so this comparison is False exactly at missing labels.
        batch_valid = (labels == labels).astype("float32")
        labels = np.nan_to_num(labels).astype("float32")
        feed_dict['labels'] = labels
        feed_dict['unmask'] = batch_valid
        return feed_dict

    def batch_iter(self, fid):
        """Yield raw example batches for worker ``fid``."""
        if self.shuffle:
            batches = batch_iter(self, self.batch_size, fid, self.num_workers)
        else:
            batches = scan_batch_iter(self, self.batch_size, fid,
                                      self.num_workers)
        for batch in batches:
            yield batch

    def __len__(self):
        """Return the number of examples in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one example, or a list of examples for an iterable ``idx``."""
        if isinstance(idx, _Iterable):
            return [self[bidx] for bidx in idx]
        return self.dataset[idx]

    def __iter__(self):
        """Yield feed dicts, buffered through ``paddle.reader``."""

        def worker(filter_id):
            def func_run():
                for batch_examples in self.batch_iter(filter_id):
                    yield self.batch_fn(batch_examples)

            return func_run

        if self.num_workers == 1:
            reader = worker(0)
        else:
            worker_pool = [worker(wid) for wid in range(self.num_workers)]
            reader = mp_reader.multiprocess_reader(
                worker_pool, use_pipe=True, queue_size=1000)
        buffered = paddle.reader.buffered(reader, self.buf_size)

        for batch in buffered():
            yield batch

    def scan(self):
        """Yield the dataset's examples one by one, in order."""
        for example in self.dataset:
            yield example


if __name__ == "__main__":
    from base_dataset import BaseDataset, Subset
    dataset = GraphPropPredDataset(name="ogbg-molhiv")
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))
log = logging.getLogger("logger")


def random_split(dataset, args):
    """Randomly split ``dataset`` into train/valid/test ``Subset`` objects.

    ``args.frac_train``, ``args.frac_valid`` and ``args.frac_test`` must sum
    to 1 (checked with ``np.testing.assert_almost_equal``). Train and valid
    sizes are ``int(frac * len(dataset))``; the test split takes all
    remaining indices, so no example is dropped by rounding.

    Shuffling uses the global numpy random state.

    Returns:
        tuple ``(train_dataset, valid_dataset, test_dataset)``.
    """
    total_precent = args.frac_train + args.frac_valid + args.frac_test
    np.testing.assert_almost_equal(total_precent, 1.0)

    length = len(dataset)
    perm = list(range(length))
    np.random.shuffle(perm)
    num_train = int(args.frac_train * length)
    num_valid = int(args.frac_valid * length)

    train_indices = perm[:num_train]
    valid_indices = perm[num_train:num_train + num_valid]
    test_indices = perm[num_train + num_valid:]
    assert (len(train_indices) + len(valid_indices) + len(test_indices)
            ) == length

    train_dataset = Subset(dataset, train_indices)
    valid_dataset = Subset(dataset, valid_indices)
    test_dataset = Subset(dataset, test_indices)
    return train_dataset, valid_dataset, test_dataset


def scaffold_split(dataset, args, return_smiles=False):
    """Split ``dataset`` by Murcko scaffold into train/valid/test subsets.

    SMILES strings are read from the first column of
    ``<args.data_dir>/smiles.csv`` (no header row). Molecules sharing a
    scaffold always land in the same split. Scaffold groups are visited from
    largest to smallest and assigned to train until ``args.frac_train`` of
    the molecules would be exceeded, then to valid, then to test.

    Arguments:
        dataset: sized, indexable dataset aligned with the rows of smiles.csv.
        args: config with ``frac_train``, ``frac_valid``, ``frac_test`` and
            ``data_dir``.
        return_smiles: if True, also return the SMILES of each split.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)``, followed by
        ``(train_smiles, valid_smiles, test_smiles)`` when ``return_smiles``
        is True.
    """
    total_precent = args.frac_train + args.frac_valid + args.frac_test
    np.testing.assert_almost_equal(total_precent, 1.0)

    smiles_list_file = os.path.join(args.data_dir, "smiles.csv")
    smiles_list = pd.read_csv(smiles_list_file, header=None)[0].tolist()

    # The original filtered through an all-True mask of len(dataset); its only
    # effect was to cap the list at len(dataset) entries, which is kept here.
    smiles_list = list(enumerate(smiles_list))[:len(dataset)]

    # Group molecule indices by scaffold: {scaffold: [idx1, idx2, ...]}.
    all_scaffolds = {}
    for i, smiles in smiles_list:
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(
            smiles=smiles, includeChirality=True)
        all_scaffolds.setdefault(scaffold, []).append(i)

    # Largest groups first; ties are broken by the smallest member index.
    all_scaffolds = {
        key: sorted(value)
        for key, value in all_scaffolds.items()
    }
    all_scaffold_sets = [
        scaffold_set
        for (scaffold, scaffold_set) in sorted(
            all_scaffolds.items(),
            key=lambda x: (len(x[1]), x[1][0]),
            reverse=True)
    ]

    train_cutoff = args.frac_train * len(smiles_list)
    valid_cutoff = (args.frac_train + args.frac_valid) * len(smiles_list)
    train_idx, valid_idx, test_idx = [], [], []
    for scaffold_set in all_scaffold_sets:
        if len(train_idx) + len(scaffold_set) <= train_cutoff:
            train_idx.extend(scaffold_set)
        elif len(train_idx) + len(valid_idx) + len(
                scaffold_set) <= valid_cutoff:
            valid_idx.extend(scaffold_set)
        else:
            test_idx.extend(scaffold_set)

    # Sanity checks: the three splits must be pairwise disjoint.
    assert len(set(train_idx).intersection(set(valid_idx))) == 0
    assert len(set(test_idx).intersection(set(valid_idx))) == 0
    assert len(set(train_idx).intersection(set(test_idx))) == 0

    train_dataset = Subset(dataset, train_idx)
    valid_dataset = Subset(dataset, valid_idx)
    test_dataset = Subset(dataset, test_idx)

    if return_smiles:
        train_smiles = [smiles_list[i][1] for i in train_idx]
        valid_smiles = [smiles_list[i][1] for i in valid_idx]
        test_smiles = [smiles_list[i][1] for i in test_idx]

        return train_dataset, valid_dataset, test_dataset, (
            train_smiles, valid_smiles, test_smiles)

    return train_dataset, valid_dataset, test_dataset


if __name__ == "__main__":
    file_path = os.path.dirname(os.path.realpath(__file__))
    proj_path = os.path.join(file_path, '../')
    sys.path.append(proj_path)
    from utils.config import Config
    from dataset.Dataset import Subset
    from dataset.Dataset import ChemDataset

    config_file = "./finetune_config.yaml"
    args = Config(config_file)
    log.info("loading dataset")
    dataset = ChemDataset(args)

    train_dataset, valid_dataset, test_dataset = scaffold_split(dataset, args)

    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))
    # A leftover `ipdb.set_trace()` breakpoint was removed here: it blocked
    # the script waiting for interactive input.
    log.info("preprocess finish")
+init_checkpoint: null +init_pretraining_params: null + +# data config +data_dir: ./dataset/ +symmetry: True +batch_size: 32 +buf_size: 1000 +metrics: True +shuffle: True +num_workers: 12 +output_dir: ./outputs/ + +# training config +epoch: 50 +learning_rate: 0.0001 +lr_scheduler: linear_warmup_decay +weight_decay: 0.01 +warmup_proportion: 0.1 +save_steps: 10000 +validation_steps: 1000 +use_dynamic_loss_scaling: True +init_loss_scaling: 102400 +metric: simple_accuracy +incr_every_n_steps: 100 +decr_every_n_nan_or_inf: 2 +incr_ratio: 2.0 +decr_ratio: 0.8 +log_dir: ./logs +eval_step: 400 +train_log_step: 20 + +# log config +skip_steps: 10 +verbose: False diff --git a/ogb_examples/graphproppred/mol/main.py b/ogb_examples/graphproppred/mol/main.py new file mode 100644 index 0000000000000000000000000000000000000000..bbc4dc4600524c2e3c6b804d2d9a5dd6be17c27c --- /dev/null +++ b/ogb_examples/graphproppred/mol/main.py @@ -0,0 +1,180 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ---- ogb_examples/graphproppred/mol/main.py ----


def main(args):
    """Build the train/test programs and run training with evaluation.

    Loads the dataset named in ``args``, writes ``num_class``,
    ``eval_metrics`` and ``task_type`` back onto ``args`` from the dataset,
    builds the model class named by ``args.model_type`` once for training
    and once for testing, sets up the optimizer and hands everything to
    ``train_and_evaluate``.
    """
    log.info('loading data')
    dataset = Dataset(args)
    args.num_class = dataset.num_tasks
    args.eval_metrics = dataset.eval_metrics
    args.task_type = dataset.task_type
    splitted_index = dataset.get_idx_split()
    train_dataset = Subset(dataset, splitted_index['train'])
    valid_dataset = Subset(dataset, splitted_index['valid'])
    test_dataset = Subset(dataset, splitted_index['test'])

    log.info("preprocess finish")
    log.info("Train Examples: %s" % len(train_dataset))
    log.info("Val Examples: %s" % len(valid_dataset))
    log.info("Test Examples: %s" % len(test_dataset))

    train_prog = F.Program()
    startup_prog = F.Program()

    if args.use_cuda:
        dev_list = F.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = F.CPUPlace()
        # CPU_NUM overrides the detected core count when it is set.
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    log.info("building model")
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            graph_model = getattr(Model, args.model_type)(args, dataset)
            train_ds = GraphDataloader(
                train_dataset,
                graph_model.graph_wrapper,
                batch_size=args.batch_size)

            num_train_examples = len(train_dataset)
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
            warmup_steps = int(max_train_steps * args.warmup_proportion)

            scheduled_lr, loss_scaling = optimization(
                loss=graph_model.loss,
                warmup_steps=warmup_steps,
                num_train_steps=max_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_prog,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=False,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    # The model is built a second time inside its own program for evaluation.
    test_prog = F.Program()
    with F.program_guard(test_prog, startup_prog):
        with F.unique_name.guard():
            _graph_model = getattr(Model, args.model_type)(args, dataset)

    test_prog = test_prog.clone(for_test=True)

    valid_ds = GraphDataloader(
        valid_dataset,
        graph_model.graph_wrapper,
        batch_size=args.batch_size,
        shuffle=False)
    test_ds = GraphDataloader(
        test_dataset,
        graph_model.graph_wrapper,
        batch_size=args.batch_size,
        shuffle=False)

    exe = F.Executor(place)
    exe.run(startup_prog)
    for init in graph_model.init_vars:
        init(place)
    for init in _graph_model.init_vars:
        init(place)

    if args.init_pretraining_params is not None:
        init_pretraining_params(
            exe, args.init_pretraining_params, main_program=startup_prog)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if dev_count > 1:
        exec_strategy = F.ExecutionStrategy()
        exec_strategy.num_threads = dev_count

        train_exe = F.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_model.loss.name,
            exec_strategy=exec_strategy,
            main_program=train_prog,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)
    else:
        train_exe = exe
    # Evaluation always runs on the plain executor.
    test_exe = exe

    evaluator = Evaluator(args.dataset_name)

    train_and_evaluate(
        exe=exe,
        train_exe=train_exe,
        valid_exe=test_exe,
        train_ds=train_ds,
        valid_ds=valid_ds,
        test_ds=test_ds,
        train_prog=train_prog,
        valid_prog=test_prog,
        args=args,
        dev_count=dev_count,
        evaluator=evaluator,
        model=graph_model)


if __name__ == "__main__":
    args = parser.parse_args()
    # Fixed: `config` was only bound when --config was given, so running
    # without it ended in a NameError on the next statement.
    if args.config is None:
        raise ValueError("a yaml configuration file is required: pass --config")
    config = Config(args.config, isCreate=True, isSave=True)

    config['use_cuda'] = args.use_cuda

    log.info(config)

    main(config)


# ---- ogb_examples/graphproppred/mol/model.py ----


def copy_send(src_feat, dst_feat, edge_feat):
    """Message function that forwards the source node feature ``"h"``."""
    return src_feat["h"]


def mean_recv(feat):
    """Reduce received messages with average sequence pooling."""
    return L.sequence_pool(feat, pool_type="average")


def sum_recv(feat):
    """Reduce received messages with sum sequence pooling."""
    return L.sequence_pool(feat, pool_type="sum")


def max_recv(feat):
    """Reduce received messages with max sequence pooling."""
    return L.sequence_pool(feat, pool_type="max")


def unsqueeze(tensor):
    """Append a trailing axis to ``tensor`` and stop gradients through it."""
    tensor = L.unsqueeze(tensor, axes=-1)
    tensor.stop_gradient = True
    return tensor


class Metric:
    """Named collection of variables to fetch, e.g. ``Metric(loss=loss_var)``."""

    def __init__(self, **args):
        self.args = args

    @property
    def vars(self):
        """List of the stored variables, in keyword order."""
        return list(self.args.values())

    def parse(self, fetch_list):
        """Map each metric name to ``float(v[0])`` of its fetched value.

        ``fetch_list`` must be ordered like ``vars``.
        """
        return dict(zip(self.args.keys(), (float(v[0]) for v in fetch_list)))


def gin_layer(gw, node_features, edge_features, train_eps, name):
    """One GIN-style aggregation step with edge features.

    Each message is the source node feature plus the edge feature. Messages
    are summed per node and added to ``(1 + eps) * node_features``, where
    ``eps`` is a parameter named ``"<name>_eps"`` initialised to 0. When
    ``train_eps`` is false, gradients to ``eps`` are stopped.
    """

    def send_func(src_feat, dst_feat, edge_feat):
        """Send"""
        return src_feat["h"] + edge_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=F.ParamAttr(name="%s_eps" % name),
        default_initializer=F.initializer.ConstantInitializer(value=0.0))
    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(
        send_func,
        nfeat_list=[("h", node_features)],
        efeat_list=[("h", edge_features)])

    node_feat = gw.recv(msg, "sum") + node_features * (epsilon + 1.0)
    return node_feat


class GNNModel(object):
    """Graph model: atom/bond embeddings, stacked GIN layers, pooled readout.

    Arguments:
        args: configuration object. The code reads ``hidden_size``,
            ``embed_dim``, ``dropout_rate``, ``pool_type``, ``num_class``,
            ``num_layers``, ``train_eps`` and ``norm_type``.
        dataset: indexable dataset of ``(graph, label)`` pairs; its first two
            graphs are used to derive the feature layout.
    """

    def __init__(self, args, dataset):
        self.args = args
        self.dataset = dataset
        self.hidden_size = self.args.hidden_size
        self.embed_dim = self.args.embed_dim
        # NOTE(review): hiv_config.yaml defines no `dropout_rate` and this
        # value is not used in the visible code; confirm it is always set.
        self.dropout_prob = self.args.dropout_rate
        self.pool_type = self.args.pool_type
        self._init_vars = []

        # Two sample graphs are merged only to derive node/edge feature info
        # for the GraphWrapper; features are cast to int64 first.
        sample_graphs = [self.dataset[0][0], self.dataset[1][0]]
        graph_data = pgl.graph.MultiGraph(sample_graphs)
        graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
            "int64")
        graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype(
            "int64")
        self.graph_wrapper = GraphWrapper(
            name="graph",
            place=F.CPUPlace(),
            node_feat=graph_data.node_feat_info(),
            edge_feat=graph_data.edge_feat_info())

        self.atom_encoder = AtomEncoder(name="atom", emb_dim=self.embed_dim)
        self.bond_encoder = BondEncoder(name="bond", emb_dim=self.embed_dim)

        self.labels = L.data(
            "labels",
            shape=[None, self.args.num_class],
            dtype="float32",
            append_batch_size=False)

        # 1.0 where a label is present, 0.0 where it should be ignored.
        self.unmask = L.data(
            "unmask",
            shape=[None, self.args.num_class],
            dtype="float32",
            append_batch_size=False)

        self.build_model()

    def build_model(self):
        """Create encoder, classifier head, masked loss and prediction."""
        node_features = self.atom_encoder(self.graph_wrapper.node_feat['feat'])
        edge_features = self.bond_encoder(self.graph_wrapper.edge_feat['feat'])

        self._enc_out = self.node_repr_encode(node_features, edge_features)

        logits = L.fc(self._enc_out,
                      self.args.num_class,
                      act=None,
                      param_attr=F.ParamAttr(name="final_fc"))

        # Masked mean: entries with unmask == 0 do not contribute to the loss.
        loss = L.sigmoid_cross_entropy_with_logits(x=logits, label=self.labels)
        loss = loss * self.unmask
        self.loss = L.reduce_sum(loss) / L.reduce_sum(self.unmask)
        self.pred = L.sigmoid(logits)

        self._metrics = Metric(loss=self.loss)

    def node_repr_encode(self, node_features, edge_features):
        """Run ``args.num_layers`` residual GIN+MLP layers, then pool per graph."""
        features_list = [node_features]
        for layer in range(self.args.num_layers):
            feat = gin_layer(
                self.graph_wrapper,
                features_list[layer],
                edge_features,
                train_eps=self.args.train_eps,
                name="gin_%s" % layer, )

            feat = self.mlp(feat, name="mlp_%s" % layer)

            feat = feat + features_list[layer]  # residual

            features_list.append(feat)

        output = pgl.layers.graph_pooling(
            self.graph_wrapper, features_list[-1], self.args.pool_type)

        return output

    def mlp(self, features, name):
        """Two fc layers (dim -> 2*dim -> dim), each normalised then ReLU."""
        h = features
        dim = features.shape[-1]
        dim_list = [dim * 2, dim]
        for i in range(2):
            h = L.fc(h,
                     size=dim_list[i],
                     name="%s_fc_%s" % (name, i),
                     act=None)
            if self.args.norm_type == "layer_norm":
                log.info("norm_type is %s" % self.args.norm_type)
                h = L.layer_norm(
                    h,
                    begin_norm_axis=1,
                    param_attr=F.ParamAttr(
                        name="norm_scale_%s_%s" % (name, i),
                        initializer=F.initializer.Constant(1.0)),
                    bias_attr=F.ParamAttr(
                        name="norm_bias_%s_%s" % (name, i),
                        initializer=F.initializer.Constant(0.0)), )
            else:
                log.info("using batch_norm")
                h = L.batch_norm(h)
            # NOTE(review): assumed to run for both norm types; the source
            # indentation was lost, confirm against the original file.
            h = pgl.layers.graph_norm(self.graph_wrapper, h)
            h = L.relu(h)
        return h

    def get_enc_output(self):
        """Return the pooled graph representation built by ``build_model``."""
        return self._enc_out

    @property
    def init_vars(self):
        """Initialiser callables run after the startup program (empty here)."""
        return self._init_vars

    @property
    def metrics(self):
        """The ``Metric`` holding the variables fetched during training."""
        return self._metrics
def _sum_feature_embeddings(x, feature_dims, emb_dim, name, suffix_template):
    """Embed every categorical column of ``x`` and add the embeddings up.

    ``x`` is split along its last axis into ``len(feature_dims)`` pieces.
    Piece ``k`` (counted from 1) is looked up in an embedding table of size
    ``(feature_dims[k - 1], emb_dim)`` whose parameter is named
    ``name + suffix_template % k``. Returns the element-wise sum of all
    lookups, or ``None`` when ``feature_dims`` is empty.
    """
    columns = fluid.layers.split(
        x, num_or_sections=len(feature_dims), dim=-1)
    total = None
    for position, (column, vocab_size) in enumerate(
            zip(columns, feature_dims), start=1):
        emb = fluid.layers.embedding(
            column,
            size=(vocab_size, emb_dim),
            param_attr=fluid.ParamAttr(name=name + suffix_template % position))
        total = emb if total is None else total + emb
    return total


class AtomEncoder(object):
    """Callable that encodes node (atom) features as summed embeddings."""

    def __init__(self, name, emb_dim):
        self.emb_dim = emb_dim
        self.name = name

    def __call__(self, x):
        # Vocabulary sizes of the atom feature columns come from ogb.
        return _sum_feature_embeddings(x,
                                       get_atom_feature_dims(), self.emb_dim,
                                       self.name, '_atom_feat_%s')


class BondEncoder(object):
    """Callable that encodes edge (bond) features as summed embeddings."""

    def __init__(self, name, emb_dim):
        self.emb_dim = emb_dim
        self.name = name

    def __call__(self, x):
        # Vocabulary sizes of the bond feature columns come from ogb.
        return _sum_feature_embeddings(x,
                                       get_bond_feature_dims(), self.emb_dim,
                                       self.name, '_bond_feat_%s')
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tqdm +import json +import numpy as np +import os +from datetime import datetime +import logging +from collections import defaultdict +from tensorboardX import SummaryWriter + +import paddle.fluid as F +from pgl.utils.logger import log + + +def multi_device(reader, dev_count): + if dev_count == 1: + for batch in reader: + yield batch + else: + batches = [] + for batch in reader: + batches.append(batch) + if len(batches) == dev_count: + yield batches + batches = [] + + +def evaluate(exe, loader, prog, model, evaluator): + total_labels = [] + for i in range(len(loader.dataset)): + g, l = loader.dataset[i] + total_labels.append(l) + total_labels = np.vstack(total_labels) + + pred_output = [] + for feed_dict in loader: + ret = exe.run(prog, feed=feed_dict, fetch_list=model.pred) + pred_output.append(ret[0]) + + pred_output = np.vstack(pred_output) + + result = evaluator.eval({"y_true": total_labels, "y_pred": pred_output}) + + return result + + +def _create_if_not_exist(path): + basedir = os.path.dirname(path) + if not os.path.exists(basedir): + os.makedirs(basedir) + + +def train_and_evaluate(exe, + train_exe, + valid_exe, + train_ds, + valid_ds, + test_ds, + train_prog, + valid_prog, + args, + model, + evaluator, + dev_count=1): + + global_step = 0 + + timestamp = datetime.now().strftime("%Hh%Mm%Ss") + log_path = os.path.join(args.log_dir, "tensorboard_log_%s" % timestamp) + _create_if_not_exist(log_path) + + writer = SummaryWriter(log_path) + + best_valid_score = 0.0 + for e in range(args.epoch): + for feed_dict in multi_device(train_ds, dev_count): 
+ if dev_count > 1: + ret = train_exe.run(feed=feed_dict, + fetch_list=model.metrics.vars) + ret = [[np.mean(v)] for v in ret] + else: + ret = train_exe.run(train_prog, + feed=feed_dict, + fetch_list=model.metrics.vars) + + ret = model.metrics.parse(ret) + if global_step % args.train_log_step == 0: + writer.add_scalar( + "batch_loss", ret['loss'], global_step=global_step) + log.info("epoch: %d | step: %d | loss: %.4f " % + (e, global_step, ret['loss'])) + + global_step += 1 + if global_step % args.eval_step == 0: + valid_ret = evaluate(exe, valid_ds, valid_prog, model, + evaluator) + message = "valid: " + for key, value in valid_ret.items(): + message += "%s %.4f | " % (key, value) + writer.add_scalar( + "eval_%s" % key, value, global_step=global_step) + log.info(message) + + # testing + test_ret = evaluate(exe, test_ds, valid_prog, model, evaluator) + message = "test: " + for key, value in test_ret.items(): + message += "%s %.4f | " % (key, value) + writer.add_scalar( + "test_%s" % key, value, global_step=global_step) + log.info(message) + + # evaluate after one epoch + valid_ret = evaluate(exe, valid_ds, valid_prog, model, evaluator) + message = "epoch %s valid: " % e + for key, value in valid_ret.items(): + message += "%s %.4f | " % (key, value) + writer.add_scalar("eval_%s" % key, value, global_step=global_step) + log.info(message) + + # testing + test_ret = evaluate(exe, test_ds, valid_prog, model, evaluator) + message = "epoch %s test: " % e + for key, value in test_ret.items(): + message += "%s %.4f | " % (key, value) + writer.add_scalar("test_%s" % key, value, global_step=global_step) + log.info(message) + + message = "epoch %s best %s result | " % (e, args.eval_metrics) + if valid_ret[args.eval_metrics] > best_valid_score: + best_valid_score = valid_ret[args.eval_metrics] + best_test_score = test_ret[args.eval_metrics] + + message += "valid %.4f | test %.4f" % (best_valid_score, + best_test_score) + log.info(message) + + # if global_step % args.save_step 
== 0: + # F.io.save_persistables(exe, os.path.join(args.save_dir, "%s" % global_step), train_prog) + + writer.close() diff --git a/ogb_examples/graphproppred/mol/optimization.py b/ogb_examples/graphproppred/mol/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..23a958f30459143d9ac581a26c9bf7690452bb69 --- /dev/null +++ b/ogb_examples/graphproppred/mol/optimization.py @@ -0,0 +1,163 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Optimization and learning rate scheduling.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import numpy as np +import paddle.fluid as fluid +from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling + + +def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps): + """ Applies linear warmup of learning rate from 0 and decay to 0.""" + with fluid.default_main_program()._lr_schedule_guard(): + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="scheduled_learning_rate") + + global_step = fluid.layers.learning_rate_scheduler._decay_step_counter( + ) + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step < warmup_steps): + warmup_lr = learning_rate * (global_step / warmup_steps) + fluid.layers.tensor.assign(warmup_lr, lr) + with switch.default(): + decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay( + learning_rate=learning_rate, + decay_steps=num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + fluid.layers.tensor.assign(decayed_lr, lr) + + return lr + + +def optimization(loss, + warmup_steps, + num_train_steps, + learning_rate, + train_program, + startup_prog, + weight_decay, + scheduler='linear_warmup_decay', + use_fp16=False, + use_dynamic_loss_scaling=False, + init_loss_scaling=1.0, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + incr_ratio=2.0, + decr_ratio=0.8): + if warmup_steps > 0: + if scheduler == 'noam_decay': + scheduled_lr = fluid.layers.learning_rate_scheduler\ + .noam_decay(1/(warmup_steps *(learning_rate ** 2)), + warmup_steps) + elif scheduler == 'linear_warmup_decay': + scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, + num_train_steps) + else: + raise ValueError("Unkown learning 
rate scheduler, should be " + "'noam_decay' or 'linear_warmup_decay'") + optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) + else: + scheduled_lr = fluid.layers.create_global_var( + name=fluid.unique_name.generate("learning_rate"), + shape=[1], + value=learning_rate, + dtype='float32', + persistable=True) + optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) + optimizer._learning_rate_map[fluid.default_main_program( + )] = scheduled_lr + + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + + def exclude_from_weight_decay(name): + if name.find("layer_norm") > -1: + return True + bias_suffix = ["_bias", "_b", ".b_0"] + for suffix in bias_suffix: + if name.endswith(suffix): + return True + return False + + param_list = dict() + + loss_scaling = fluid.layers.create_global_var( + name=fluid.unique_name.generate("loss_scaling"), + shape=[1], + value=init_loss_scaling, + dtype='float32', + persistable=True) + + if use_fp16: + loss *= loss_scaling + param_grads = optimizer.backward(loss) + + master_param_grads = create_master_params_grads( + param_grads, train_program, startup_prog, loss_scaling) + + for param, _ in master_param_grads: + param_list[param.name] = param * 1.0 + param_list[param.name].stop_gradient = True + + if use_dynamic_loss_scaling: + apply_dynamic_loss_scaling( + loss_scaling, master_param_grads, incr_every_n_steps, + decr_every_n_nan_or_inf, incr_ratio, decr_ratio) + + optimizer.apply_gradients(master_param_grads) + + if weight_decay > 0: + for param, grad in master_param_grads: + if exclude_from_weight_decay(param.name.rstrip(".master")): + continue + with param.block.program._optimized_guard( + [param, grad]), fluid.framework.name_scope("weight_decay"): + updated_param = param - param_list[ + param.name] * weight_decay * scheduled_lr + fluid.layers.assign(output=param, input=updated_param) + + master_param_to_train_param(master_param_grads, param_grads, + train_program) + + else: + for param 
in train_program.global_block().all_parameters(): + param_list[param.name] = param * 1.0 + param_list[param.name].stop_gradient = True + + _, param_grads = optimizer.minimize(loss) + + if weight_decay > 0: + for param, grad in param_grads: + if exclude_from_weight_decay(param.name): + continue + with param.block.program._optimized_guard( + [param, grad]), fluid.framework.name_scope("weight_decay"): + updated_param = param - param_list[ + param.name] * weight_decay * scheduled_lr + fluid.layers.assign(output=param, input=updated_param) + + return scheduled_lr, loss_scaling diff --git a/ogb_examples/graphproppred/mol/pcba_config.yaml b/ogb_examples/graphproppred/mol/pcba_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e39eadecf21987b38d2bba10c5b0efa019e144e0 --- /dev/null +++ b/ogb_examples/graphproppred/mol/pcba_config.yaml @@ -0,0 +1,53 @@ +task_name: pcba +seed: 28994 +dataset_name: ogbg-molpcba +eval_metrics: null +task_type: null +num_class: null +pool_type: average +train_eps: True +norm_type: layer_norm + +model_type: GNNModel +embed_dim: 128 +num_layers: 5 +hidden_size: 256 +save_dir: ./checkpoints + + +# finetune model config +init_checkpoint: null +init_pretraining_params: null + +# data config +data_dir: ./dataset/ +symmetry: True +batch_size: 256 +buf_size: 1000 +metrics: True +shuffle: True +num_workers: 12 +output_dir: ./outputs/ + +# trainging config +epoch: 50 +learning_rate: 0.005 +lr_scheduler: linear_warmup_decay +weight_decay: 0.01 +warmup_proportion: 0.1 +save_steps: 10000 +validation_steps: 1000 +use_dynamic_loss_scaling: True +init_loss_scaling: 102400 +metric: simple_accuracy +incr_every_n_steps: 100 +decr_every_n_nan_or_inf: 2 +incr_ratio: 2.0 +decr_ratio: 0.8 +log_dir: ./logs +eval_step: 1000 +train_log_step: 20 + +# log config +skip_steps: 10 +verbose: False diff --git a/ogb_examples/graphproppred/mol/utils/__init__.py b/ogb_examples/graphproppred/mol/utils/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ogb_examples/graphproppred/mol/utils/args.py b/ogb_examples/graphproppred/mol/utils/args.py new file mode 100644 index 0000000000000000000000000000000000000000..2de3d0da17519f091079aa963aad743fa4095941 --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/args.py @@ -0,0 +1,94 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Arguments for configuration.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import six +import os +import sys +import argparse +import logging + +import paddle.fluid as fluid + +log = logging.getLogger("logger") + + +def prepare_logger(logger, debug=False, save_to_file=None): + formatter = logging.Formatter( + fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s' + ) + # console_hdl = logging.StreamHandler() + # console_hdl.setFormatter(formatter) + # logger.addHandler(console_hdl) + if save_to_file is not None: #and not os.path.exists(save_to_file): + if os.path.isdir(save_to_file): + file_hdl = logging.FileHandler( + os.path.join(save_to_file, 'log.txt')) + else: + file_hdl = logging.FileHandler(save_to_file) + file_hdl.setFormatter(formatter) + logger.addHandler(file_hdl) + logger.setLevel(logging.DEBUG) + logger.propagate = False + + +def str2bool(v): + # because argparse does not support to parse "true, False" as python + # boolean directly + return v.lower() in ("true", "t", "1") + + +class ArgumentGroup(object): + def __init__(self, parser, title, des): + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, + name, + type, + default, + help, + positional_arg=False, + **kwargs): + prefix = "" if positional_arg else "--" + type = str2bool if type == bool else type + self._group.add_argument( + prefix + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +def print_arguments(args): + log.info('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + log.info('%s: %s' % (arg, value)) + log.info('------------------------------------------------') + + +def check_cuda(use_cuda, err = \ + "\nYou can not set use_cuda = True in the model because you are using 
paddlepaddle-cpu.\n \ + Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n" + ): + try: + if use_cuda == True and fluid.is_compiled_with_cuda() == False: + log.error(err) + sys.exit(1) + except Exception as e: + pass diff --git a/ogb_examples/graphproppred/mol/utils/cards.py b/ogb_examples/graphproppred/mol/utils/cards.py new file mode 100644 index 0000000000000000000000000000000000000000..3c9c6709f71edd692c81d5fed8bfb87e9afd596f --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/cards.py @@ -0,0 +1,30 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import +import os + + +def get_cards(): + """ + get gpu cards number + """ + num = 0 + cards = os.environ.get('CUDA_VISIBLE_DEVICES', '') + if cards != '': + num = len(cards.split(",")) + return num diff --git a/ogb_examples/graphproppred/mol/utils/config.py b/ogb_examples/graphproppred/mol/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..62d2847c357c3c0d28f1ed57e4430a766c7dfebc --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/config.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file implement a class for model configure. +""" + +import datetime +import os +import yaml +import random +import shutil +import six +import logging + +log = logging.getLogger("logger") + + +class AttrDict(dict): + """Attr dict + """ + + def __init__(self, d): + self.dict = d + + def __getattr__(self, attr): + value = self.dict[attr] + if isinstance(value, dict): + return AttrDict(value) + else: + return value + + def __str__(self): + return str(self.dict) + + +class Config(object): + """Implementation of Config class for model configure. + + Args: + config_file(str): configure filename, which is a yaml file. + isCreate(bool): if true, create some neccessary directories to save models, log file and other outputs. + isSave(bool): if true, save config_file in order to record the configure message. 
+ """ + + def __init__(self, config_file, isCreate=False, isSave=False): + self.config_file = config_file + # self.config = self.get_config_from_yaml(config_file) + self.config = self.load_config(config_file) + + if isCreate: + self.create_necessary_dirs() + + if isSave: + self.save_config_file() + + def load_config(self, config_file): + """Load config file""" + with open(config_file) as f: + if hasattr(yaml, 'FullLoader'): + config = yaml.load(f, Loader=yaml.FullLoader) + else: + config = yaml.load(f) + return config + + def create_necessary_dirs(self): + """Create some necessary directories to save some important files. + """ + + self.config['log_dir'] = os.path.join(self.config['log_dir'], + self.config['task_name']) + self.config['save_dir'] = os.path.join(self.config['save_dir'], + self.config['task_name']) + self.config['output_dir'] = os.path.join(self.config['output_dir'], + self.config['task_name']) + + self.make_dir(self.config['log_dir']) + self.make_dir(self.config['save_dir']) + self.make_dir(self.config['output_dir']) + + def save_config_file(self): + """Save config file so that we can know the config when we look back + """ + filename = self.config_file.split('/')[-1] + targetpath = os.path.join(self.config['save_dir'], filename) + try: + shutil.copyfile(self.config_file, targetpath) + except shutil.SameFileError: + log.info("%s and %s are the same file, did not copy by shutil"\ + % (self.config_file, targetpath)) + + def make_dir(self, path): + """Build directory""" + if not os.path.exists(path): + os.makedirs(path) + + def __getitem__(self, key): + return self.config[key] + + def __call__(self): + """__call__""" + return self.config + + def __getattr__(self, attr): + try: + result = self.config[attr] + except KeyError: + log.warn("%s attribute is not existed, return None" % attr) + result = None + return result + + def __setitem__(self, key, value): + self.config[key] = value + + def __str__(self): + return str(self.config) + + def 
pretty_print(self): + log.info( + "-----------------------------------------------------------------") + log.info("config file: %s" % self.config_file) + for key, value in sorted( + self.config.items(), key=lambda item: item[0]): + log.info("%s: %s" % (key, value)) + log.info( + "-----------------------------------------------------------------") diff --git a/ogb_examples/graphproppred/mol/utils/fp16.py b/ogb_examples/graphproppred/mol/utils/fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..740add267dff2dbf463032bcc47a6741ca9f7c43 --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/fp16.py @@ -0,0 +1,201 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.fluid as fluid + + +def append_cast_op(i, o, prog): + """ + Append a cast op in a given Program to cast input `i` to data type `o.dtype`. + Args: + i (Variable): The input Variable. + o (Variable): The output Variable. + prog (Program): The Program to append cast op. + """ + prog.global_block().append_op( + type="cast", + inputs={"X": i}, + outputs={"Out": o}, + attrs={"in_dtype": i.dtype, + "out_dtype": o.dtype}) + + +def copy_to_master_param(p, block): + v = block.vars.get(p.name, None) + if v is None: + raise ValueError("no param name %s found!" 
% p.name) + new_p = fluid.framework.Parameter( + block=block, + shape=v.shape, + dtype=fluid.core.VarDesc.VarType.FP32, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + gradient_clip_attr=p.gradient_clip_attr, + error_clip=p.error_clip, + name=v.name + ".master") + return new_p + + +def apply_dynamic_loss_scaling(loss_scaling, master_params_grads, + incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio): + _incr_every_n_steps = fluid.layers.fill_constant( + shape=[1], dtype='int32', value=incr_every_n_steps) + _decr_every_n_nan_or_inf = fluid.layers.fill_constant( + shape=[1], dtype='int32', value=decr_every_n_nan_or_inf) + + _num_good_steps = fluid.layers.create_global_var( + name=fluid.unique_name.generate("num_good_steps"), + shape=[1], + value=0, + dtype='int32', + persistable=True) + _num_bad_steps = fluid.layers.create_global_var( + name=fluid.unique_name.generate("num_bad_steps"), + shape=[1], + value=0, + dtype='int32', + persistable=True) + + grads = [fluid.layers.reduce_sum(g) for [_, g] in master_params_grads] + all_grads = fluid.layers.concat(grads) + all_grads_sum = fluid.layers.reduce_sum(all_grads) + is_overall_finite = fluid.layers.isfinite(all_grads_sum) + + update_loss_scaling(is_overall_finite, loss_scaling, _num_good_steps, + _num_bad_steps, _incr_every_n_steps, + _decr_every_n_nan_or_inf, incr_ratio, decr_ratio) + + # apply_gradient append all ops in global block, thus we shouldn't + # apply gradient in the switch branch. 
+ with fluid.layers.Switch() as switch: + with switch.case(is_overall_finite): + pass + with switch.default(): + for _, g in master_params_grads: + fluid.layers.assign(fluid.layers.zeros_like(g), g) + + +def create_master_params_grads(params_grads, main_prog, startup_prog, + loss_scaling): + master_params_grads = [] + for p, g in params_grads: + with main_prog._optimized_guard([p, g]): + # create master parameters + master_param = copy_to_master_param(p, main_prog.global_block()) + startup_master_param = startup_prog.global_block()._clone_variable( + master_param) + startup_p = startup_prog.global_block().var(p.name) + append_cast_op(startup_p, startup_master_param, startup_prog) + # cast fp16 gradients to fp32 before apply gradients + if g.name.find("layer_norm") > -1: + scaled_g = g / loss_scaling + master_params_grads.append([p, scaled_g]) + continue + master_grad = fluid.layers.cast(g, "float32") + master_grad = master_grad / loss_scaling + master_params_grads.append([master_param, master_grad]) + + return master_params_grads + + +def master_param_to_train_param(master_params_grads, params_grads, main_prog): + for idx, m_p_g in enumerate(master_params_grads): + train_p, _ = params_grads[idx] + if train_p.name.find("layer_norm") > -1: + continue + with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): + append_cast_op(m_p_g[0], train_p, main_prog) + + +def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, + num_bad_steps, incr_every_n_steps, + decr_every_n_nan_or_inf, incr_ratio, decr_ratio): + """ + Update loss scaling according to overall gradients. If all gradients is + finite after incr_every_n_steps, loss scaling will increase by incr_ratio. + Otherwisw, loss scaling will decrease by decr_ratio after + decr_every_n_nan_or_inf steps and each step some gradients are infinite. + Args: + is_overall_finite (Variable): A boolean variable indicates whether + all gradients are finite. + prev_loss_scaling (Variable): Previous loss scaling. 
+ num_good_steps (Variable): A variable accumulates good steps in which + all gradients are finite. + num_bad_steps (Variable): A variable accumulates bad steps in which + some gradients are infinite. + incr_every_n_steps (Variable): A variable represents increasing loss + scaling every n consecutive steps with + finite gradients. + decr_every_n_nan_or_inf (Variable): A variable represents decreasing + loss scaling every n accumulated + steps with nan or inf gradients. + incr_ratio(float): The multiplier to use when increasing the loss + scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing + loss scaling. + """ + zero_steps = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0) + with fluid.layers.Switch() as switch: + with switch.case(is_overall_finite): + should_incr_loss_scaling = fluid.layers.less_than( + incr_every_n_steps, num_good_steps + 1) + with fluid.layers.Switch() as switch1: + with switch1.case(should_incr_loss_scaling): + new_loss_scaling = prev_loss_scaling * incr_ratio + loss_scaling_is_finite = fluid.layers.isfinite( + new_loss_scaling) + with fluid.layers.Switch() as switch2: + with switch2.case(loss_scaling_is_finite): + fluid.layers.assign(new_loss_scaling, + prev_loss_scaling) + with switch2.default(): + pass + fluid.layers.assign(zero_steps, num_good_steps) + fluid.layers.assign(zero_steps, num_bad_steps) + + with switch1.default(): + fluid.layers.increment(num_good_steps) + fluid.layers.assign(zero_steps, num_bad_steps) + + with switch.default(): + should_decr_loss_scaling = fluid.layers.less_than( + decr_every_n_nan_or_inf, num_bad_steps + 1) + with fluid.layers.Switch() as switch3: + with switch3.case(should_decr_loss_scaling): + new_loss_scaling = prev_loss_scaling * decr_ratio + static_loss_scaling = \ + fluid.layers.fill_constant(shape=[1], + dtype='float32', + value=1.0) + less_than_one = fluid.layers.less_than(new_loss_scaling, + static_loss_scaling) + with fluid.layers.Switch() as switch4: + with 
switch4.case(less_than_one): + fluid.layers.assign(static_loss_scaling, + prev_loss_scaling) + with switch4.default(): + fluid.layers.assign(new_loss_scaling, + prev_loss_scaling) + fluid.layers.assign(zero_steps, num_good_steps) + fluid.layers.assign(zero_steps, num_bad_steps) + with switch3.default(): + fluid.layers.assign(zero_steps, num_good_steps) + fluid.layers.increment(num_bad_steps) diff --git a/ogb_examples/graphproppred/mol/utils/init.py b/ogb_examples/graphproppred/mol/utils/init.py new file mode 100644 index 0000000000000000000000000000000000000000..0f54a185ac80ec2308c9f8effe59148547b2548d --- /dev/null +++ b/ogb_examples/graphproppred/mol/utils/init.py @@ -0,0 +1,91 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import six +import ast +import copy +import logging + +import numpy as np +import paddle.fluid as fluid + +log = logging.getLogger("logger") + + +def cast_fp32_to_fp16(exe, main_program): + log.info("Cast parameters to float16 data format.") + for param in main_program.global_block().all_parameters(): + if not param.name.endswith(".master"): + param_t = fluid.global_scope().find_var(param.name).get_tensor() + data = np.array(param_t) + if param.name.startswith("encoder_layer") \ + and "layer_norm" not in param.name: + param_t.set(np.float16(data).view(np.uint16), exe.place) + + #load fp32 + master_param_var = fluid.global_scope().find_var(param.name + + ".master") + if master_param_var is not None: + master_param_var.get_tensor().set(data, exe.place) + + +def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False): + assert os.path.exists( + init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path + + def existed_persitables(var): + if not fluid.io.is_persistable(var): + return False + return os.path.exists(os.path.join(init_checkpoint_path, var.name)) + + fluid.io.load_vars( + exe, + init_checkpoint_path, + main_program=main_program, + predicate=existed_persitables) + log.info("Load model from {}".format(init_checkpoint_path)) + + if use_fp16: + cast_fp32_to_fp16(exe, main_program) + + +def init_pretraining_params(exe, + pretraining_params_path, + main_program, + use_fp16=False): + assert os.path.exists(pretraining_params_path + ), "[%s] cann't be found." 
% pretraining_params_path + + def existed_params(var): + if not isinstance(var, fluid.framework.Parameter): + return False + return os.path.exists(os.path.join(pretraining_params_path, var.name)) + + fluid.io.load_vars( + exe, + pretraining_params_path, + main_program=main_program, + predicate=existed_params) + log.info("Load pretraining parameters from {}.".format( + pretraining_params_path)) + + if use_fp16: + cast_fp32_to_fp16(exe, main_program) diff --git a/ogb_examples/linkproppred/main_pgl.py b/ogb_examples/linkproppred/main_pgl.py deleted file mode 100644 index 2f6be61f853f635686e289ee7ffab098d57ba8ab..0000000000000000000000000000000000000000 --- a/ogb_examples/linkproppred/main_pgl.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""test ogb -""" -import argparse -import time -import logging -import numpy as np - -import paddle.fluid as fluid - -import pgl -from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset -from pgl.utils import paddle_helper -from ogb.linkproppred import Evaluator - - -def send_func(src_feat, dst_feat, edge_feat): - """send_func""" - return src_feat["h"] - - -def recv_func(feat): - """recv_func""" - return fluid.layers.sequence_pool(feat, pool_type="sum") - - -class GNNModel(object): - """GNNModel""" - - def __init__(self, name, num_nodes, emb_dim, num_layers): - self.num_nodes = num_nodes - self.emb_dim = emb_dim - self.num_layers = num_layers - self.name = name - - self.src_nodes = fluid.layers.data( - name='src_nodes', - shape=[None], - dtype='int64', ) - - self.dst_nodes = fluid.layers.data( - name='dst_nodes', - shape=[None], - dtype='int64', ) - - self.edge_label = fluid.layers.data( - name='edge_label', - shape=[None, 1], - dtype='float32', ) - - def forward(self, graph): - """forward""" - h = fluid.layers.create_parameter( - shape=[self.num_nodes, self.emb_dim], - dtype="float32", - name=self.name + "_embedding") - - for layer in range(self.num_layers): - msg = graph.send( - send_func, - nfeat_list=[("h", h)], ) - h = graph.recv(msg, recv_func) - h = fluid.layers.fc( - h, - size=self.emb_dim, - bias_attr=False, - param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer)) - h = h * graph.node_feat["norm"] - bias = fluid.layers.create_parameter( - shape=[self.emb_dim], - dtype='float32', - is_bias=True, - name=self.name + '_bias_%s' % layer) - h = fluid.layers.elementwise_add(h, bias, act="relu") - - src = fluid.layers.gather(h, self.src_nodes, overwrite=False) - dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False) - edge_embed = src * dst - pred = fluid.layers.fc(input=edge_embed, - size=1, - name=self.name + "_pred_output") - - prob = fluid.layers.sigmoid(pred) - - loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, - 
self.edge_label) - loss = fluid.layers.reduce_sum(loss) - - return pred, prob, loss - - -def main(): - """main - """ - # Training settings - parser = argparse.ArgumentParser(description='Graph Dataset') - parser.add_argument( - '--epochs', - type=int, - default=4, - help='number of epochs to train (default: 100)') - parser.add_argument( - '--dataset', - type=str, - default="ogbl-ppa", - help='dataset name (default: protein protein associations)') - parser.add_argument('--use_cuda', action='store_true') - parser.add_argument('--batch_size', type=int, default=5120) - parser.add_argument('--embed_dim', type=int, default=64) - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--lr', type=float, default=0.001) - args = parser.parse_args() - print(args) - - place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() - - ### automatic dataloading and splitting - print("loadding dataset") - dataset = PglLinkPropPredDataset(name=args.dataset) - splitted_edge = dataset.get_edge_split() - print(splitted_edge['train_edge'].shape) - print(splitted_edge['train_edge_label'].shape) - - print("building evaluator") - ### automatic evaluator. 
takes dataset name as input - evaluator = Evaluator(args.dataset) - - graph_data = dataset[0] - print("num_nodes: %d" % graph_data.num_nodes) - - train_program = fluid.Program() - startup_program = fluid.Program() - - # degree normalize - indegree = graph_data.indegree() - norm = np.zeros_like(indegree, dtype="float32") - norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5) - graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32") - # graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1) - - with fluid.program_guard(train_program, startup_program): - model = GNNModel( - name="gnn", - num_nodes=graph_data.num_nodes, - emb_dim=args.embed_dim, - num_layers=args.num_layers) - gw = pgl.graph_wrapper.GraphWrapper( - "graph", - node_feat=graph_data.node_feat_info(), - edge_feat=graph_data.edge_feat_info()) - pred, prob, loss = model.forward(gw) - - val_program = train_program.clone(for_test=True) - - with fluid.program_guard(train_program, startup_program): - global_steps = int(splitted_edge['train_edge'].shape[0] / - args.batch_size * 2) - learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps, - 0.00005) - - adam = fluid.optimizer.Adam( - learning_rate=learning_rate, - regularization=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=0.0005)) - adam.minimize(loss) - - exe = fluid.Executor(place) - exe.run(startup_program) - feed = gw.to_feed(graph_data) - - print("evaluate result before training: ") - result = test(exe, val_program, prob, evaluator, feed, splitted_edge) - print(result) - - print("training") - cc = 0 - for epoch in range(1, args.epochs + 1): - for batch_data, batch_label in data_generator( - graph_data, - splitted_edge["train_edge"], - splitted_edge["train_edge_label"], - batch_size=args.batch_size): - feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1) - feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1) - feed['edge_label'] = 
batch_label.astype("float32") - - res_loss, y_pred, b_lr = exe.run( - train_program, - feed=feed, - fetch_list=[loss, prob, learning_rate]) - if cc % 1 == 0: - print("epoch %d | step %d | lr %s | Loss %s" % - (epoch, cc, b_lr[0], res_loss[0])) - cc += 1 - - if cc % 20 == 0: - print("Evaluating...") - result = test(exe, val_program, prob, evaluator, feed, - splitted_edge) - print("epoch %d | step %d" % (epoch, cc)) - print(result) - - -def test(exe, val_program, prob, evaluator, feed, splitted_edge): - """Evaluation""" - result = {} - feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1) - feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1) - feed['edge_label'] = splitted_edge["valid_edge_label"].astype( - "float32").reshape(-1, 1) - y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] - input_dict = { - "y_pred_pos": - y_pred[splitted_edge["valid_edge_label"] == 1].reshape(-1, ), - "y_pred_neg": - y_pred[splitted_edge["valid_edge_label"] == 0].reshape(-1, ) - } - result["valid"] = evaluator.eval(input_dict) - - feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1) - feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1) - feed['edge_label'] = splitted_edge["test_edge_label"].astype( - "float32").reshape(-1, 1) - y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0] - input_dict = { - "y_pred_pos": - y_pred[splitted_edge["test_edge_label"] == 1].reshape(-1, ), - "y_pred_neg": - y_pred[splitted_edge["test_edge_label"] == 0].reshape(-1, ) - } - result["test"] = evaluator.eval(input_dict) - return result - - -def data_generator(graph, data, label_data, batch_size, shuffle=True): - """Data Generator""" - perm = np.arange(0, len(data)) - if shuffle: - np.random.shuffle(perm) - - offset = 0 - while offset < len(perm): - batch_index = perm[offset:(offset + batch_size)] - offset += batch_size - pos_data = data[batch_index] - pos_label = label_data[batch_index] - - neg_src_node = pos_data[:, 0] - 
neg_dst_node = np.random.choice( - pos_data.reshape(-1, ), size=len(neg_src_node)) - neg_data = np.hstack( - [neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)]) - exists = graph.has_edges_between(neg_src_node, neg_dst_node) - neg_data = neg_data[np.invert(exists)] - neg_label = np.zeros(shape=len(neg_data), dtype=np.int64) - - batch_data = np.vstack([pos_data, neg_data]) - label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)]) - yield batch_data, label - - -if __name__ == "__main__": - main() diff --git a/ogb_examples/linkproppred/ogbl-ppa/README.md b/ogb_examples/linkproppred/ogbl-ppa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f06b3bc2be13dca9548491c5a152841fd4bb034f --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/README.md @@ -0,0 +1,21 @@ +# Graph Link Prediction for Open Graph Benchmark (OGB) PPA dataset + +[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we complete the Graph Link Prediction task based on PGL. + + +### Requirements + +paddlpaddle >= 1.7.1 + +pgl 1.0.2 + +ogb + + +### How to Run + +``` +CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --use_cuda 1 --num_workers 4 --output_path ./output/model_1 --batch_size 65536 --epoch 1000 --learning_rate 0.005 --hidden_size 256 +``` + +The best record will be saved in ./output/model_1/best.txt. diff --git a/ogb_examples/linkproppred/ogbl-ppa/args.py b/ogb_examples/linkproppred/ogbl-ppa/args.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc51d37f9774fbf50fb7bbb5aa700b9f8aaff7f --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/args.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""finetune args""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import time +import argparse + +from utils.args import ArgumentGroup + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +model_g = ArgumentGroup(parser, "model", "model configuration and paths.") +model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.") +model_g.add_arg("init_pretraining_params", str, None, + "Init pre-training params which preforms fine-tuning from. 
If the " + "arg 'init_checkpoint' has been set, this argument wouldn't be valid.") + +train_g = ArgumentGroup(parser, "training", "training options.") +train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") +train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.") + +run_type_g = ArgumentGroup(parser, "run_type", "running type options.") +run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") +run_type_g.add_arg("num_workers", int, 1, "use multiprocess to generate graph") +run_type_g.add_arg("output_path", str, None, "path to save model") +run_type_g.add_arg("hidden_size", int, 128, "model hidden-size") +run_type_g.add_arg("batch_size", int, 128, "batch_size") diff --git a/ogb_examples/linkproppred/ogbl-ppa/dataloader/__init__.py b/ogb_examples/linkproppred/ogbl-ppa/dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/dataloader/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/ogb_examples/linkproppred/ogbl-ppa/dataloader/base_dataloader.py b/ogb_examples/linkproppred/ogbl-ppa/dataloader/base_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..d04f9fd521602bf67f950b3e72ba021fd09c298f --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/dataloader/base_dataloader.py @@ -0,0 +1,148 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Base DataLoader +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import sys +import six +from io import open +from collections import namedtuple +import numpy as np +import tqdm +import paddle +from pgl.utils import mp_reader +import collections +import time + +import pgl + +if six.PY3: + import io + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + + +def batch_iter(data, perm, batch_size, fid, num_workers): + """node_batch_iter + """ + size = len(data) + start = 0 + cc = 0 + while start < size: + index = perm[start:start + batch_size] + start += batch_size + cc += 1 + if cc % num_workers != fid: + continue + yield data[index] + + +def scan_batch_iter(data, batch_size, fid, num_workers): + """node_batch_iter + """ + batch = [] + cc = 0 + for line_example 
in data.scan(): + cc += 1 + if cc % num_workers != fid: + continue + batch.append(line_example) + if len(batch) == batch_size: + yield batch + batch = [] + + if len(batch) > 0: + yield batch + + +class BaseDataGenerator(object): + """Base Data Geneartor""" + + def __init__(self, buf_size, batch_size, num_workers, shuffle=True): + self.num_workers = num_workers + self.batch_size = batch_size + self.line_examples = [] + self.buf_size = buf_size + self.shuffle = shuffle + + def batch_fn(self, batch_examples): + """ batch_fn batch producer""" + raise NotImplementedError("No defined Batch Fn") + + def batch_iter(self, fid, perm): + """ batch iterator""" + if self.shuffle: + for batch in batch_iter(self, perm, self.batch_size, fid, + self.num_workers): + yield batch + else: + for batch in scan_batch_iter(self, self.batch_size, fid, + self.num_workers): + yield batch + + def __len__(self): + return len(self.line_examples) + + def __getitem__(self, idx): + if isinstance(idx, collections.Iterable): + return [self[bidx] for bidx in idx] + else: + return self.line_examples[idx] + + def generator(self): + """batch dict generator""" + + def worker(filter_id, perm): + """ multiprocess worker""" + + def func_run(): + """ func_run """ + pid = os.getpid() + np.random.seed(pid + int(time.time())) + for batch_examples in self.batch_iter(filter_id, perm): + batch_dict = self.batch_fn(batch_examples) + yield batch_dict + + return func_run + + # consume a seed + np.random.rand() + if self.shuffle: + perm = np.arange(0, len(self)) + np.random.shuffle(perm) + else: + perm = None + if self.num_workers == 1: + r = paddle.reader.buffered(worker(0, perm), self.buf_size) + else: + worker_pool = [ + worker(wid, perm) for wid in range(self.num_workers) + ] + worker = mp_reader.multiprocess_reader( + worker_pool, use_pipe=True, queue_size=1000) + r = paddle.reader.buffered(worker, self.buf_size) + + for batch in r(): + yield batch + + def scan(self): + for line_example in self.line_examples: + 
yield line_example diff --git a/ogb_examples/linkproppred/ogbl-ppa/dataloader/ogbl_ppa_dataloader.py b/ogb_examples/linkproppred/ogbl-ppa/dataloader/ogbl_ppa_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..621db215a6924de338a7dd881ddc54ac82290a33 --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/dataloader/ogbl_ppa_dataloader.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +from dataloader.base_dataloader import BaseDataGenerator +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + +from ogb.linkproppred import LinkPropPredDataset +from ogb.linkproppred import Evaluator +import tqdm +from collections import namedtuple +import pgl +import numpy as np + + +class PPADataGenerator(BaseDataGenerator): + def __init__(self, + graph_wrapper=None, + buf_size=1000, + batch_size=128, + num_workers=1, + shuffle=True, + phase="train"): + super(PPADataGenerator, self).__init__( + buf_size=buf_size, + num_workers=num_workers, + batch_size=batch_size, + shuffle=shuffle) + + self.d_name = "ogbl-ppa" + self.graph_wrapper = graph_wrapper + dataset = LinkPropPredDataset(name=self.d_name) + splitted_edge = dataset.get_edge_split() + self.phase = phase + graph = dataset[0] + edges = graph["edge_index"].T + #self.graph = pgl.graph.Graph(num_nodes=graph["num_nodes"], + # edges=edges, + # node_feat={"nfeat": graph["node_feat"], + # "node_id": np.arange(0, graph["num_nodes"], dtype="int64").reshape(-1, 1) }) + + #self.graph.indegree() + self.num_nodes = graph["num_nodes"] + if self.phase == 'train': + edges = splitted_edge["train"]["edge"] + labels = np.ones(len(edges)) + elif self.phase == "valid": + # Compute the embedding for all the nodes + pos_edges = splitted_edge["valid"]["edge"] + neg_edges = splitted_edge["valid"]["edge_neg"] + pos_labels = np.ones(len(pos_edges)) + neg_labels = np.zeros(len(neg_edges)) + edges = np.vstack([pos_edges, neg_edges]) + labels = pos_labels.tolist() + neg_labels.tolist() + elif self.phase == "test": + # Compute the embedding for all the nodes + pos_edges = splitted_edge["test"]["edge"] + neg_edges = splitted_edge["test"]["edge_neg"] + pos_labels = np.ones(len(pos_edges)) + neg_labels = 
np.zeros(len(neg_edges)) + edges = np.vstack([pos_edges, neg_edges]) + labels = pos_labels.tolist() + neg_labels.tolist() + + self.line_examples = [] + Example = namedtuple('Example', ['src', "dst", "label"]) + for edge, label in zip(edges, labels): + self.line_examples.append( + Example( + src=edge[0], dst=edge[1], label=label)) + print("Phase", self.phase) + print("Len Examples", len(self.line_examples)) + + def batch_fn(self, batch_ex): + batch_src = [] + batch_dst = [] + join_graph = [] + cc = 0 + batch_node_id = [] + batch_labels = [] + for ex in batch_ex: + batch_src.append(ex.src) + batch_dst.append(ex.dst) + batch_labels.append(ex.label) + + if self.phase == "train": + for num in range(1): + rand_src = np.random.randint( + low=0, high=self.num_nodes, size=len(batch_ex)) + rand_dst = np.random.randint( + low=0, high=self.num_nodes, size=len(batch_ex)) + batch_src = batch_src + rand_src.tolist() + batch_dst = batch_dst + rand_dst.tolist() + batch_labels = batch_labels + np.zeros_like( + rand_src, dtype="int64").tolist() + + feed_dict = {} + + feed_dict["batch_src"] = np.array(batch_src, dtype="int64") + feed_dict["batch_dst"] = np.array(batch_dst, dtype="int64") + feed_dict["labels"] = np.array(batch_labels, dtype="int64") + return feed_dict diff --git a/ogb_examples/linkproppred/ogbl-ppa/model.py b/ogb_examples/linkproppred/ogbl-ppa/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9429ea39a900488e1ab65c084e4b133079c56dcb --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/model.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""lbs_model""" +import os +import re +import time +from random import random +from functools import reduce, partial + +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as F +import paddle.fluid.layers as L +from pgl.graph_wrapper import GraphWrapper +from pgl.layers.conv import gcn, gat + + +class BaseGraph(object): + """Base Graph Model""" + + def __init__(self, args): + node_feature = [('nfeat', [None, 58], "float32"), + ('node_id', [None, 1], "int64")] + self.hidden_size = args.hidden_size + self.num_nodes = args.num_nodes + + self.graph_wrapper = None # GraphWrapper( + #name="graph", place=F.CPUPlace(), node_feat=node_feature) + + self.build_model(args) + + def build_model(self, args): + """ build graph model""" + self.batch_src = L.data(name="batch_src", shape=[-1], dtype="int64") + self.batch_src = L.reshape(self.batch_src, [-1, 1]) + self.batch_dst = L.data(name="batch_dst", shape=[-1], dtype="int64") + self.batch_dst = L.reshape(self.batch_dst, [-1, 1]) + self.labels = L.data(name="labels", shape=[-1], dtype="int64") + self.labels = L.reshape(self.labels, [-1, 1]) + self.labels.stop_gradients = True + self.src_repr = L.embedding( + self.batch_src, + size=(self.num_nodes, self.hidden_size), + param_attr=F.ParamAttr( + name="node_embeddings", + initializer=F.initializer.NormalInitializer( + loc=0.0, scale=1.0))) + + self.dst_repr = L.embedding( + self.batch_dst, + size=(self.num_nodes, self.hidden_size), + param_attr=F.ParamAttr( + name="node_embeddings", + initializer=F.initializer.NormalInitializer( + loc=0.0, 
scale=1.0))) + + self.link_predictor(self.src_repr, self.dst_repr) + + self.bce_loss() + + def link_predictor(self, x, y): + """ siamese network""" + feat = x * y + + feat = L.fc(feat, size=self.hidden_size, name="link_predictor_1") + feat = L.relu(feat) + + feat = L.fc(feat, size=self.hidden_size, name="link_predictor_2") + feat = L.relu(feat) + + self.logits = L.fc(feat, + size=1, + act="sigmoid", + name="link_predictor_logits") + + def bce_loss(self): + """listwise model""" + mask = L.cast(self.labels > 0.5, dtype="float32") + mask.stop_gradients = True + + self.loss = L.log_loss(self.logits, mask, epsilon=1e-15) + self.loss = L.reduce_mean(self.loss) * 2 + proba = L.sigmoid(self.logits) + proba = L.concat([proba * -1 + 1, proba], axis=1) + auc_out, batch_auc_out, _ = \ + L.auc(input=proba, label=self.labels, curve='ROC', slide_steps=1) + + self.metrics = { + "loss": self.loss, + "auc": batch_auc_out, + } + + def neighbor_aggregator(self, node_repr): + """neighbor aggregation""" + return node_repr diff --git a/ogb_examples/linkproppred/ogbl-ppa/monitor/__init__.py b/ogb_examples/linkproppred/ogbl-ppa/monitor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d814437561c253c97a95e31187e63a554476364f --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/monitor/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""init""" diff --git a/ogb_examples/linkproppred/ogbl-ppa/monitor/train_monitor.py b/ogb_examples/linkproppred/ogbl-ppa/monitor/train_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..a517b7c2679f51f4247912df8f661a20792720b8 --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/monitor/train_monitor.py @@ -0,0 +1,184 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""train and evaluate""" +import tqdm +import json +import numpy as np +import sys +import os +import paddle.fluid as F +from tensorboardX import SummaryWriter +from ogb.linkproppred import Evaluator +from ogb.linkproppred import LinkPropPredDataset + + +def multi_device(reader, dev_count): + """multi device""" + if dev_count == 1: + for batch in reader: + yield batch + else: + batches = [] + for batch in reader: + batches.append(batch) + if len(batches) == dev_count: + yield batches + batches = [] + + +class OgbEvaluator(object): + def __init__(self): + d_name = "ogbl-ppa" + dataset = LinkPropPredDataset(name=d_name) + splitted_edge = dataset.get_edge_split() + graph = dataset[0] + self.num_nodes = graph["num_nodes"] + self.ogb_evaluator = Evaluator(name="ogbl-ppa") + + def eval(self, scores, labels, phase): + labels = np.reshape(labels, [-1]) + ret = {} + pos = scores[labels > 0.5].squeeze(-1) + neg = scores[labels < 0.5].squeeze(-1) + for K in [10, 50, 100]: + self.ogb_evaluator.K = K + ret['%s_hits@%s' % 
(phase, K)] = self.ogb_evaluator.eval({ + 'y_pred_pos': pos, + 'y_pred_neg': neg, + })[f'hits@{K}'] + return ret + + +def evaluate(model, valid_exe, valid_ds, valid_prog, dev_count, evaluator, + phase): + """evaluate """ + cc = 0 + scores = [] + labels = [] + + for feed_dict in tqdm.tqdm( + multi_device(valid_ds.generator(), dev_count), desc='evaluating'): + + if dev_count > 1: + output = valid_exe.run(feed=feed_dict, + fetch_list=[model.logits, model.labels]) + else: + output = valid_exe.run(valid_prog, + feed=feed_dict, + fetch_list=[model.logits, model.labels]) + scores.append(output[0]) + labels.append(output[1]) + + scores = np.vstack(scores) + labels = np.vstack(labels) + ret = evaluator.eval(scores, labels, phase) + return ret + + +def _create_if_not_exist(path): + basedir = os.path.dirname(path) + if not os.path.exists(basedir): + os.makedirs(basedir) + + +def train_and_evaluate(exe, + train_exe, + valid_exe, + train_ds, + valid_ds, + test_ds, + train_prog, + valid_prog, + model, + metric, + epoch=20, + dev_count=1, + train_log_step=5, + eval_step=10000, + evaluator=None, + output_path=None): + """train and evaluate""" + + global_step = 0 + + log_path = os.path.join(output_path, "log") + _create_if_not_exist(log_path) + + writer = SummaryWriter(log_path) + + best_model = 0 + for e in range(epoch): + for feed_dict in tqdm.tqdm( + multi_device(train_ds.generator(), dev_count), + desc='Epoch %s' % e): + if dev_count > 1: + ret = train_exe.run(feed=feed_dict, fetch_list=metric.vars) + ret = [[np.mean(v)] for v in ret] + else: + ret = train_exe.run(train_prog, + feed=feed_dict, + fetch_list=metric.vars) + + ret = metric.parse(ret) + if global_step % train_log_step == 0: + for key, value in ret.items(): + writer.add_scalar( + 'train_' + key, value, global_step=global_step) + + global_step += 1 + if global_step % eval_step == 0: + eval_ret = evaluate(model, exe, valid_ds, valid_prog, 1, + evaluator, "valid") + + test_eval_ret = evaluate(model, exe, test_ds, 
valid_prog, 1, + evaluator, "test") + + eval_ret.update(test_eval_ret) + + sys.stderr.write(json.dumps(eval_ret, indent=4) + "\n") + + for key, value in eval_ret.items(): + writer.add_scalar(key, value, global_step=global_step) + + if eval_ret["valid_hits@100"] > best_model: + F.io.save_persistables( + exe, + os.path.join(output_path, "checkpoint"), train_prog) + eval_ret["step"] = global_step + with open(os.path.join(output_path, "best.txt"), "w") as f: + f.write(json.dumps(eval_ret, indent=2) + '\n') + best_model = eval_ret["valid_hits@100"] + # Epoch End + eval_ret = evaluate(model, exe, valid_ds, valid_prog, 1, evaluator, + "valid") + + test_eval_ret = evaluate(model, exe, test_ds, valid_prog, 1, evaluator, + "test") + + eval_ret.update(test_eval_ret) + sys.stderr.write(json.dumps(eval_ret, indent=4) + "\n") + + for key, value in eval_ret.items(): + writer.add_scalar(key, value, global_step=global_step) + + if eval_ret["valid_hits@100"] > best_model: + F.io.save_persistables(exe, + os.path.join(output_path, "checkpoint"), + train_prog) + eval_ret["step"] = global_step + with open(os.path.join(output_path, "best.txt"), "w") as f: + f.write(json.dumps(eval_ret, indent=2) + '\n') + best_model = eval_ret["valid_hits@100"] + + writer.close() diff --git a/ogb_examples/linkproppred/ogbl-ppa/train.py b/ogb_examples/linkproppred/ogbl-ppa/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c70fa4f9dd4987e615f6f935b5108c727fe7abee --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/train.py @@ -0,0 +1,157 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""listwise model +""" + +import torch +import os +import re +import time +import logging +from random import random +from functools import reduce, partial + +# For downloading ogb +import ssl +ssl._create_default_https_context = ssl._create_unverified_context +# SSL + +import numpy as np +import multiprocessing + +import pgl +import paddle +import paddle.fluid as F +import paddle.fluid.layers as L + +from args import parser +from utils.args import print_arguments, check_cuda +from utils.init import init_checkpoint, init_pretraining_params +from model import BaseGraph +from dataloader.ogbl_ppa_dataloader import PPADataGenerator +from monitor.train_monitor import train_and_evaluate, OgbEvaluator + +log = logging.getLogger(__name__) + + +class Metric(object): + """Metric""" + + def __init__(self, **args): + self.args = args + + @property + def vars(self): + """ fetch metric vars""" + values = [self.args[k] for k in self.args.keys()] + return values + + def parse(self, fetch_list): + """parse""" + tup = list(zip(self.args.keys(), [float(v[0]) for v in fetch_list])) + return dict(tup) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + evaluator = OgbEvaluator() + + train_prog = F.Program() + startup_prog = F.Program() + args.num_nodes = evaluator.num_nodes + + if args.use_cuda: + dev_list = F.cuda_places() + place = dev_list[0] + dev_count = len(dev_list) + else: + place = F.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + with F.program_guard(train_prog, startup_prog): + with 
F.unique_name.guard(): + graph_model = BaseGraph(args) + test_prog = train_prog.clone(for_test=True) + opt = F.optimizer.Adam(learning_rate=args.learning_rate) + opt.minimize(graph_model.loss) + + #test_prog = F.Program() + #with F.program_guard(test_prog, startup_prog): + # with F.unique_name.guard(): + # _graph_model = BaseGraph(args) + + train_ds = PPADataGenerator( + phase="train", + graph_wrapper=graph_model.graph_wrapper, + num_workers=args.num_workers, + batch_size=args.batch_size) + + valid_ds = PPADataGenerator( + phase="valid", + graph_wrapper=graph_model.graph_wrapper, + num_workers=args.num_workers, + batch_size=args.batch_size) + + test_ds = PPADataGenerator( + phase="test", + graph_wrapper=graph_model.graph_wrapper, + num_workers=args.num_workers, + batch_size=args.batch_size) + + exe = F.Executor(place) + exe.run(startup_prog) + + if args.init_pretraining_params is not None: + init_pretraining_params( + exe, args.init_pretraining_params, main_program=startup_prog) + + metric = Metric(**graph_model.metrics) + + nccl2_num_trainers = 1 + nccl2_trainer_id = 0 + if dev_count > 1: + + exec_strategy = F.ExecutionStrategy() + exec_strategy.num_threads = dev_count + + train_exe = F.ParallelExecutor( + use_cuda=args.use_cuda, + loss_name=graph_model.loss.name, + exec_strategy=exec_strategy, + main_program=train_prog, + num_trainers=nccl2_num_trainers, + trainer_id=nccl2_trainer_id) + + test_exe = exe + else: + train_exe, test_exe = exe, exe + + train_and_evaluate( + exe=exe, + train_exe=train_exe, + valid_exe=test_exe, + train_ds=train_ds, + valid_ds=valid_ds, + test_ds=test_ds, + train_prog=train_prog, + valid_prog=test_prog, + train_log_step=5, + output_path=args.output_path, + dev_count=dev_count, + model=graph_model, + epoch=args.epoch, + eval_step=1000000, + evaluator=evaluator, + metric=metric) diff --git a/ogb_examples/linkproppred/ogbl-ppa/utils/__init__.py b/ogb_examples/linkproppred/ogbl-ppa/utils/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..1333621cf62da67fcf10016fc848c503f7c254fa --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils""" diff --git a/ogb_examples/linkproppred/ogbl-ppa/utils/args.py b/ogb_examples/linkproppred/ogbl-ppa/utils/args.py new file mode 100644 index 0000000000000000000000000000000000000000..5131f2ceb88775f12e886402ef205735a1ac1d77 --- /dev/null +++ b/ogb_examples/linkproppred/ogbl-ppa/utils/args.py @@ -0,0 +1,97 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
log = logging.getLogger(__name__)

# ---- ogb_examples/linkproppred/ogbl-ppa/utils/args.py ----


def prepare_logger(logger, debug=False, save_to_file=None):
    """Configure ``logger`` with DEBUG level and an optional file handler.

    Args:
        logger: The ``logging.Logger`` to configure; it is modified in place.
        debug: Accepted for interface compatibility only. The level is always
            set to ``logging.DEBUG``; this flag is never read.
        save_to_file: Optional path of a log file. A ``FileHandler`` is
            attached only when a path is given AND nothing exists at that
            path yet; an already existing file is left untouched and no
            handler is added.
    """
    formatter = logging.Formatter(
        fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
    )
    if save_to_file is not None and not os.path.exists(save_to_file):
        file_hdl = logging.FileHandler(save_to_file)
        file_hdl.setFormatter(formatter)
        logger.addHandler(file_hdl)
    logger.setLevel(logging.DEBUG)
    # Keep records from bubbling up to ancestor loggers' handlers.
    logger.propagate = False


def str2bool(v):
    """Parse a command-line string into a boolean.

    argparse cannot turn "true"/"False" into Python booleans by itself
    (``bool("False")`` is ``True``), so this is used as the ``type`` callable.

    Args:
        v: The raw string value.

    Returns:
        ``True`` when ``v`` is "true", "t" or "1" (case-insensitive),
        ``False`` for every other string.
    """
    return v.lower() in ("true", "t", "1")


class ArgumentGroup(object):
    """Thin wrapper around an argparse argument group."""

    def __init__(self, parser, title, des):
        """Create the group on ``parser`` with the given title/description."""
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self,
                name,
                type,
                default,
                help,
                positional_arg=False,
                **kwargs):
        """Register one argument in the group.

        Args:
            name: Argument name without the leading dashes.
            type: Conversion callable; ``bool`` is replaced by ``str2bool``.
            default: Default value.
            help: Help text; the default value is appended automatically.
            positional_arg: When true the argument is registered as a
                positional one, otherwise as ``--name``.
            **kwargs: Forwarded unchanged to ``add_argument``.
        """
        prefix = "" if positional_arg else "--"
        type = str2bool if type is bool else type
        self._group.add_argument(
            prefix + name,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
            **kwargs)


def print_arguments(args):
    """Log every attribute of ``args`` (sorted by name) at INFO level.

    Args:
        args: A namespace-like object; ``vars(args)`` must work on it.
    """
    log.info('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        # Lazy %-formatting: the message is only built if INFO is enabled.
        log.info('%s: %s', arg, value)
    log.info('------------------------------------------------')


def check_cuda(use_cuda, err=(
        "\nYou can not set use_cuda=True in the model because you are using "
        "paddlepaddle-cpu.\n"
        "Please: 1. Install paddlepaddle-gpu to run your models on GPU or "
        "2. Set use_cuda=False to run models on CPU.\n")):
    """Exit the process when GPU execution is requested on a CPU-only build.

    The check is best-effort: any ordinary exception raised while probing
    the framework is logged at DEBUG level and otherwise ignored.
    ``SystemExit`` is not an ``Exception`` subclass, so the exit itself
    still propagates.

    Args:
        use_cuda: Whether the caller intends to run on GPU.
        err: Message logged at ERROR level before exiting.
    """
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            log.error(err)
            sys.exit(1)
    except Exception as e:
        log.debug("Skipping CUDA availability check: %s", e)


# ---- ogb_examples/linkproppred/ogbl-ppa/utils/cards.py ----


def get_cards():
    """Return the number of GPU ids listed in ``CUDA_VISIBLE_DEVICES``.

    Empty entries (an unset or empty variable, a trailing comma, blank
    items) are not counted.

    Returns:
        The count of non-empty comma-separated entries; 0 when there are none.
    """
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return sum(1 for card in cards.split(",") if card.strip())
def append_cast_op(i, o, prog):
    """
    Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
    Args:
        i (Variable): The input Variable.
        o (Variable): The output Variable.
        prog (Program): The Program to append cast op.
    """
    # The op is appended to the program's global block.
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={"in_dtype": i.dtype,
               "out_dtype": o.dtype})


def copy_to_master_param(p, block):
    """
    Create the FP32 "master" counterpart of parameter `p` inside `block`.
    The new Parameter is named `<name>.master`; shape, type and lod_level are
    taken from the variable registered in `block` under `p.name`, while the
    training attributes (stop_gradient, trainable, optimize_attr,
    regularizer, gradient_clip_attr, error_clip) are taken from `p`.
    Args:
        p (Parameter): The parameter to mirror.
        block (Block): The block that must already contain a var named
                       `p.name` and that receives the new parameter.
    Returns:
        Parameter: The newly created FP32 parameter.
    Raises:
        ValueError: If `block` has no variable named `p.name`.
    """
    v = block.vars.get(p.name, None)
    if v is None:
        raise ValueError("no param name %s found!" % p.name)
    new_p = fluid.framework.Parameter(
        block=block,
        shape=v.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=v.type,
        lod_level=v.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=v.name + ".master")
    return new_p


def apply_dynamic_loss_scaling(loss_scaling, master_params_grads,
                               incr_every_n_steps, decr_every_n_nan_or_inf,
                               incr_ratio, decr_ratio):
    """
    Add the ops that implement dynamic loss scaling.
    The sum over all gradients is tested with `isfinite`; the result drives
    `update_loss_scaling` and, when it is not finite, every gradient in
    `master_params_grads` is overwritten with zeros.
    Args:
        loss_scaling (Variable): Current loss scaling, updated in place by
                                 `update_loss_scaling`.
        master_params_grads (list): Pairs of [param, grad]; only the grads
                                    are read and possibly zeroed here.
        incr_every_n_steps (int): Threshold turned into an int32 constant and
                                  forwarded to `update_loss_scaling`.
        decr_every_n_nan_or_inf (int): Threshold turned into an int32
                                  constant and forwarded likewise.
        incr_ratio (float): Forwarded to `update_loss_scaling`.
        decr_ratio (float): Forwarded to `update_loss_scaling`.
    """
    _incr_every_n_steps = fluid.layers.fill_constant(
        shape=[1], dtype='int32', value=incr_every_n_steps)
    _decr_every_n_nan_or_inf = fluid.layers.fill_constant(
        shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)

    # Persistable step counters, created with unique names and initial
    # value 0.
    _num_good_steps = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("num_good_steps"),
        shape=[1],
        value=0,
        dtype='int32',
        persistable=True)
    _num_bad_steps = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("num_bad_steps"),
        shape=[1],
        value=0,
        dtype='int32',
        persistable=True)

    # Reduce every gradient to one number and sum them; a single isfinite
    # test on that total is used as the overall finiteness flag.
    grads = [fluid.layers.reduce_sum(g) for [_, g] in master_params_grads]
    all_grads = fluid.layers.concat(grads)
    all_grads_sum = fluid.layers.reduce_sum(all_grads)
    is_overall_finite = fluid.layers.isfinite(all_grads_sum)

    update_loss_scaling(is_overall_finite, loss_scaling, _num_good_steps,
                        _num_bad_steps, _incr_every_n_steps,
                        _decr_every_n_nan_or_inf, incr_ratio, decr_ratio)

    # apply_gradient append all ops in global block, thus we shouldn't
    # apply gradient in the switch branch.
    with fluid.layers.Switch() as switch:
        with switch.case(is_overall_finite):
            pass
        with switch.default():
            # Non-finite step: replace every gradient with zeros.
            for _, g in master_params_grads:
                fluid.layers.assign(fluid.layers.zeros_like(g), g)


def create_master_params_grads(params_grads, main_prog, startup_prog,
                               loss_scaling):
    """
    Build the list of [param, grad] pairs used for the FP32 update.
    For every (p, g) a master parameter is created in `main_prog`, cloned
    into `startup_prog` and initialised there by a cast op from `p`.
    Gradients whose name contains "layer_norm" are only divided by
    `loss_scaling` and paired with the original `p`; all other gradients are
    cast to float32, divided by `loss_scaling` and paired with the master
    parameter.
    Args:
        params_grads (list): Pairs of (param, grad).
        main_prog (Program): Program receiving the master parameters.
        startup_prog (Program): Program receiving the clones and cast ops.
        loss_scaling: Divisor applied to every gradient.
    Returns:
        list: [param, grad] pairs in the same order as `params_grads`.
    """
    master_params_grads = []
    for p, g in params_grads:
        with main_prog._optimized_guard([p, g]):
            # create master parameters
            master_param = copy_to_master_param(p, main_prog.global_block())
            startup_master_param = startup_prog.global_block()._clone_variable(
                master_param)
            startup_p = startup_prog.global_block().var(p.name)
            append_cast_op(startup_p, startup_master_param, startup_prog)
            # cast fp16 gradients to fp32 before apply gradients
            if g.name.find("layer_norm") > -1:
                scaled_g = g / loss_scaling
                master_params_grads.append([p, scaled_g])
                continue
            master_grad = fluid.layers.cast(g, "float32")
            master_grad = master_grad / loss_scaling
            master_params_grads.append([master_param, master_grad])

    return master_params_grads


def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    """
    Append cast ops that copy each master parameter back to its training
    parameter, skipping training parameters whose name contains "layer_norm".
    Args:
        master_params_grads (list): [master_param, grad] pairs.
        params_grads (list): (train_param, grad) pairs; read by the same
                             index as `master_params_grads`, so the two
                             lists are assumed to be index-aligned.
        main_prog (Program): Program receiving the cast ops.
    """
    for idx, m_p_g in enumerate(master_params_grads):
        train_p, _ = params_grads[idx]
        if train_p.name.find("layer_norm") > -1:
            continue
        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
            append_cast_op(m_p_g[0], train_p, main_prog)


def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
                        num_bad_steps, incr_every_n_steps,
                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
    """
    Update loss scaling according to overall gradients. If all gradients are
    finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
    Otherwise, loss scaling will decrease by decr_ratio after
    decr_every_n_nan_or_inf steps and each step some gradients are infinite.
    Args:
        is_overall_finite (Variable): A boolean variable indicates whether
                                     all gradients are finite.
        prev_loss_scaling (Variable): Previous loss scaling.
        num_good_steps (Variable): A variable accumulates good steps in which
                                   all gradients are finite.
        num_bad_steps (Variable): A variable accumulates bad steps in which
                                  some gradients are infinite.
        incr_every_n_steps (Variable): A variable represents increasing loss
                                       scaling every n consecutive steps with
                                       finite gradients.
        decr_every_n_nan_or_inf (Variable): A variable represents decreasing
                                            loss scaling every n accumulated
                                            steps with nan or inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing
                           loss scaling.
    """
    zero_steps = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)
    with fluid.layers.Switch() as switch:
        with switch.case(is_overall_finite):
            # Finite step: grow the scale once the good-step counter (plus
            # this step) exceeds incr_every_n_steps.
            should_incr_loss_scaling = fluid.layers.less_than(
                incr_every_n_steps, num_good_steps + 1)
            with fluid.layers.Switch() as switch1:
                with switch1.case(should_incr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * incr_ratio
                    loss_scaling_is_finite = fluid.layers.isfinite(
                        new_loss_scaling)
                    # Only accept the larger scale if it is itself finite.
                    with fluid.layers.Switch() as switch2:
                        with switch2.case(loss_scaling_is_finite):
                            fluid.layers.assign(new_loss_scaling,
                                                prev_loss_scaling)
                        with switch2.default():
                            pass
                    fluid.layers.assign(zero_steps, num_good_steps)
                    fluid.layers.assign(zero_steps, num_bad_steps)

                with switch1.default():
                    fluid.layers.increment(num_good_steps)
                    fluid.layers.assign(zero_steps, num_bad_steps)

        with switch.default():
            # Non-finite step: shrink the scale once the bad-step counter
            # (plus this step) exceeds decr_every_n_nan_or_inf.
            should_decr_loss_scaling = fluid.layers.less_than(
                decr_every_n_nan_or_inf, num_bad_steps + 1)
            with fluid.layers.Switch() as switch3:
                with switch3.case(should_decr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * decr_ratio
                    static_loss_scaling = \
                        fluid.layers.fill_constant(shape=[1],
                                                   dtype='float32',
                                                   value=1.0)
                    less_than_one = fluid.layers.less_than(new_loss_scaling,
                                                           static_loss_scaling)
                    # The scale is clamped so it never drops below 1.0.
                    with fluid.layers.Switch() as switch4:
                        with switch4.case(less_than_one):
                            fluid.layers.assign(static_loss_scaling,
                                                prev_loss_scaling)
                        with switch4.default():
                            fluid.layers.assign(new_loss_scaling,
                                                prev_loss_scaling)
                    fluid.layers.assign(zero_steps, num_good_steps)
                    fluid.layers.assign(zero_steps, num_bad_steps)
                with switch3.default():
                    fluid.layers.assign(zero_steps, num_good_steps)
                    fluid.layers.increment(num_bad_steps)
log = logging.getLogger(__name__)


def cast_fp32_to_fp16(exe, main_program):
    """Rewrite loaded parameters in the global scope for fp16 training.

    Every parameter of ``main_program`` whose name does not end with
    ".master" is read from the global scope. Parameters whose name starts
    with "encoder_layer" and does not contain "layer_norm" are overwritten
    with their float16 bit pattern (viewed as uint16). If a variable named
    ``<param>.master`` exists in the scope it receives the values that were
    read before the overwrite.
    """
    log.info("Cast parameters to float16 data format.")
    for param in main_program.global_block().all_parameters():
        if param.name.endswith(".master"):
            continue

        tensor = fluid.global_scope().find_var(param.name).get_tensor()
        loaded_values = np.array(tensor)

        is_encoder_param = param.name.startswith("encoder_layer")
        if is_encoder_param and "layer_norm" not in param.name:
            half_bits = np.float16(loaded_values).view(np.uint16)
            tensor.set(half_bits, exe.place)

        # Keep the originally loaded values in the master copy, if any.
        master_var = fluid.global_scope().find_var(param.name + ".master")
        if master_var is not None:
            master_var.get_tensor().set(loaded_values, exe.place)


def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
    """Load persistable variables of ``main_program`` from a checkpoint dir.

    Only persistable variables that have a file of the same name inside
    ``init_checkpoint_path`` are loaded. With ``use_fp16`` the loaded
    parameters are post-processed by ``cast_fp32_to_fp16``.
    An ``AssertionError`` is raised when the directory does not exist.
    """
    assert os.path.exists(
        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path

    def existed_persitables(var):
        """Select persistable vars that were saved in the checkpoint."""
        if fluid.io.is_persistable(var):
            saved_file = os.path.join(init_checkpoint_path, var.name)
            return os.path.exists(saved_file)
        return False

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persitables)
    log.info("Load model from {}".format(init_checkpoint_path))

    if not use_fp16:
        return
    cast_fp32_to_fp16(exe, main_program)


def init_pretraining_params(exe,
                            pretraining_params_path,
                            main_program,
                            use_fp16=False):
    """Load ``Parameter`` variables of ``main_program`` from a directory.

    Only variables that are ``fluid.framework.Parameter`` instances and have
    a file of the same name inside ``pretraining_params_path`` are loaded.
    With ``use_fp16`` the loaded parameters are post-processed by
    ``cast_fp32_to_fp16``.
    An ``AssertionError`` is raised when the directory does not exist.
    """
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cann't be found." % pretraining_params_path

    def existed_params(var):
        """Select parameters that were saved in the directory."""
        if isinstance(var, fluid.framework.Parameter):
            saved_file = os.path.join(pretraining_params_path, var.name)
            return os.path.exists(saved_file)
        return False

    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=existed_params)
    log.info("Load pretraining parameters from {}.".format(
        pretraining_params_path))

    if not use_fp16:
        return
    cast_fp32_to_fp16(exe, main_program)
True final_output = fluid.layers.scatter(init_output, uniq_dst, output) return final_output diff --git a/pgl/layers/conv.py b/pgl/layers/conv.py index 96e1bb990a5b200efd6b360b7d4400cfa71db8dd..bcad46559614193878a5ce24a7bff1b6cdade58d 100644 --- a/pgl/layers/conv.py +++ b/pgl/layers/conv.py @@ -230,7 +230,7 @@ def gin(gw, epsilon.stop_gradient = True msg = gw.send(send_src_copy, nfeat_list=[("h", feature)]) - output = gw.recv(msg, "sum") + (1.0 + epsilon) * feature + output = gw.recv(msg, "sum") + feature * (epsilon + 1.0) output = fluid.layers.fc(output, size=hidden_size, @@ -238,8 +238,18 @@ def gin(gw, param_attr=fluid.ParamAttr(name="%s_w_0" % name), bias_attr=fluid.ParamAttr(name="%s_b_0" % name)) - output = fluid.layers.batch_norm(output) - output = getattr(fluid.layers, activation)(output) + output = fluid.layers.layer_norm( + output, + begin_norm_axis=1, + param_attr=fluid.ParamAttr( + name="norm_scale_%s" % (name), + initializer=fluid.initializer.Constant(1.0)), + bias_attr=fluid.ParamAttr( + name="norm_bias_%s" % (name), + initializer=fluid.initializer.Constant(0.0)), ) + + if activation is not None: + output = getattr(fluid.layers, activation)(output) output = fluid.layers.fc(output, size=hidden_size,