diff --git a/examples/erniesage/config/erniesage_v1_cpu.yaml b/examples/erniesage/config/erniesage_v1_cpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1f7e5eddc0b6bda5f8c3377c7320429d16b0718b
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v1_cpu.yaml
@@ -0,0 +1,56 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "cpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 2
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV1"
+layer_type: "graphsage_sum"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/config/erniesage_v1_gpu.yaml b/examples/erniesage/config/erniesage_v1_gpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7b883fe3fa06332cf196d5142c40acaee8b98259
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v1_gpu.yaml
@@ -0,0 +1,56 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "gpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 32
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV1"
+layer_type: "graphsage_sum"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/config/erniesage_v2_cpu.yaml b/examples/erniesage/config/erniesage_v2_cpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d39e2442a71c9400a29ddd365a7fc3e2ad126731
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v2_cpu.yaml
@@ -0,0 +1,55 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "cpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 2
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV2"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/config/erniesage_v2_gpu.yaml b/examples/erniesage/config/erniesage_v2_gpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a346808c8c1f7a5fe36544c6d2dc06eda98e0ed8
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v2_gpu.yaml
@@ -0,0 +1,55 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "gpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 32
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV2"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/config/erniesage_v3_cpu.yaml b/examples/erniesage/config/erniesage_v3_cpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2172a26133c9718358163f4495133720dbeb9eff
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v3_cpu.yaml
@@ -0,0 +1,55 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "cpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 2
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV3"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/config/erniesage_v3_gpu.yaml b/examples/erniesage/config/erniesage_v3_gpu.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e53ab33c41f8b8760e75d602bf1b8ed9f1735fb8
--- /dev/null
+++ b/examples/erniesage/config/erniesage_v3_gpu.yaml
@@ -0,0 +1,55 @@
+# Global Environment Settings
+#
+# trainer config ------
+learner_type: "gpu"
+optimizer_type: "adam"
+lr: 0.00005
+batch_size: 32
+CPU_NUM: 10
+epoch: 20
+log_per_step: 1
+save_per_step: 100
+output_path: "./output"
+ckpt_path: "./ernie_base_ckpt"
+
+# data config ------
+input_data: "./data.txt"
+graph_path: "./workdir"
+sample_workers: 1
+use_pyreader: true
+input_type: "text"
+
+# model config ------
+samples: [10]
+model_type: "ErnieSageModelV3"
+
+max_seqlen: 40
+
+num_layers: 1
+hidden_size: 128
+final_fc: true
+final_l2_norm: true
+loss_type: "hinge"
+margin: 0.3
+
+# infer config ------
+infer_model: "./output/last"
+infer_batch_size: 128
+
+# ernie config ------
+encoding: "utf8"
+ernie_vocab_file: "./vocab.txt"
+ernie_config:
+    attention_probs_dropout_prob: 0.1
+    hidden_act: "relu"
+    hidden_dropout_prob: 0.1
+    hidden_size: 768
+    initializer_range: 0.02
+    max_position_embeddings: 513
+    num_attention_heads: 12
+    num_hidden_layers: 12
+    sent_type_vocab_size: 4
+    task_type_vocab_size: 3
+    vocab_size: 18000
+    use_task_id: false
+    use_fp16: false
diff --git a/examples/erniesage/dataset/__init__.py b/examples/erniesage/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/examples/erniesage/dataset/base_dataset.py b/examples/erniesage/dataset/base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b29b5761769e9be9e62fb4536e41d43f9c9abb4
--- /dev/null
+++ b/examples/erniesage/dataset/base_dataset.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
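The six YAML configs above differ only in learner_type ("cpu" vs. "gpu"), batch_size (2 vs. 32) and model_type (V1/V2/V3); every other field is shared. A minimal sketch of how such a file is consumed, using the same yaml + EasyDict pattern that infer.py in this diff uses (the chosen file name is just one of the variants above):

    import yaml
    from easydict import EasyDict as edict

    # Attribute access (config.model_type, config.ernie_config.hidden_size)
    # works because EasyDict wraps the parsed dict recursively.
    config = edict(yaml.load(open("./config/erniesage_v2_gpu.yaml"), Loader=yaml.FullLoader))
    assert config.model_type in ("ErnieSageModelV1", "ErnieSageModelV2", "ErnieSageModelV3")
    print(config.learner_type, config.batch_size, config.ernie_config.hidden_size)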
+"""Base DataLoader +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import sys +import six +from io import open +from collections import namedtuple +import numpy as np +import tqdm +import paddle +from pgl.utils import mp_reader +import collections +import time +from pgl.utils.logger import log +import traceback + + +if six.PY3: + import io + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + + +def batch_iter(data, perm, batch_size, fid, num_workers): + """node_batch_iter + """ + size = len(data) + start = 0 + cc = 0 + while start < size: + index = perm[start:start + batch_size] + start += batch_size + cc += 1 + if cc % num_workers != fid: + continue + yield data[index] + + +def scan_batch_iter(data, batch_size, fid, num_workers): + """node_batch_iter + """ + batch = [] + cc = 0 + for line_example in data.scan(): + cc += 1 + if cc % num_workers != fid: + continue + batch.append(line_example) + if len(batch) == batch_size: + yield batch + batch = [] + + if len(batch) > 0: + yield batch + + +class BaseDataGenerator(object): + """Base Data Geneartor""" + + def __init__(self, buf_size, batch_size, num_workers, shuffle=True): + self.num_workers = num_workers + self.batch_size = batch_size + self.line_examples = [] + self.buf_size = buf_size + self.shuffle = shuffle + + def batch_fn(self, batch_examples): + """ batch_fn batch producer""" + raise NotImplementedError("No defined Batch Fn") + + def batch_iter(self, fid, perm): + """ batch iterator""" + if self.shuffle: + for batch in batch_iter(self, perm, self.batch_size, fid, self.num_workers): + yield batch + else: + for batch in scan_batch_iter(self, self.batch_size, fid, self.num_workers): + yield batch + + def __len__(self): + return len(self.line_examples) + + def __getitem__(self, idx): + if isinstance(idx, collections.Iterable): + return [self[bidx] for bidx in idx] + else: + return self.line_examples[idx] + + def generator(self): + """batch dict generator""" + + def worker(filter_id, perm): + """ multiprocess worker""" + + def func_run(): + """ func_run """ + pid = os.getpid() + np.random.seed(pid + int(time.time())) + for batch_examples in self.batch_iter(filter_id, perm): + try: + batch_dict = self.batch_fn(batch_examples) + except Exception as e: + traceback.print_exc() + log.info(traceback.format_exc()) + log.info(str(e)) + continue + + if batch_dict is None: + continue + yield batch_dict + + + + return func_run + + # consume a seed + np.random.rand() + + if self.shuffle: + perm = np.arange(0, len(self)) + np.random.shuffle(perm) + else: + perm = None + + if self.num_workers == 1: + r = paddle.reader.buffered(worker(0, perm), self.buf_size) + else: + worker_pool = [worker(wid, perm) for wid in range(self.num_workers)] + worker = mp_reader.multiprocess_reader( + worker_pool, use_pipe=True, queue_size=1000) + r = paddle.reader.buffered(worker, self.buf_size) + + for batch in r(): + yield batch + + def scan(self): + for line_example in self.line_examples: + yield line_example diff --git a/examples/erniesage/dataset/graph_reader.py b/examples/erniesage/dataset/graph_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..99d029a98a3ac0f482cdf5d4cd6591967ce86495 --- /dev/null +++ b/examples/erniesage/dataset/graph_reader.py @@ -0,0 +1,119 @@ +"""Graph Dataset +""" + 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from __future__ import absolute_import + +import os +import pgl +import sys + +import numpy as np + +from pgl.utils.logger import log +from dataset.base_dataset import BaseDataGenerator +from pgl.sample import alias_sample +from pgl.sample import pinsage_sample +from pgl.sample import graphsage_sample +from pgl.sample import edge_hash + + +class GraphGenerator(BaseDataGenerator): + def __init__(self, graph_wrappers, data, batch_size, samples, + num_workers, feed_name_list, use_pyreader, + phase, graph_data_path, shuffle=True, buf_size=1000): + + super(GraphGenerator, self).__init__( + buf_size=buf_size, + num_workers=num_workers, + batch_size=batch_size, shuffle=shuffle) + # For iteration + self.line_examples = data + + self.graph_wrappers = graph_wrappers + self.samples = samples + self.feed_name_list = feed_name_list + self.use_pyreader = use_pyreader + self.phase = phase + self.load_graph(graph_data_path) + self.num_layers = len(graph_wrappers) + + def load_graph(self, graph_data_path): + self.graph = pgl.graph.MemmapGraph(graph_data_path) + self.alias = np.load(os.path.join(graph_data_path, "alias.npy"), mmap_mode="r") + self.events = np.load(os.path.join(graph_data_path, "events.npy"), mmap_mode="r") + self.term_ids = np.load(os.path.join(graph_data_path, "term_ids.npy"), mmap_mode="r") + + def batch_fn(self, batch_ex): + # batch_ex = [ + # (src, dst, neg), + # (src, dst, neg), + # (src, dst, neg), + # ] + # + batch_src = [] + batch_dst = [] + batch_neg = [] + for batch in batch_ex: + batch_src.append(batch[0]) + batch_dst.append(batch[1]) + if len(batch) == 3: # default neg samples + batch_neg.append(batch[2]) + + if len(batch_src) != self.batch_size: + if self.phase == "train": + return None #Skip + + if len(batch_neg) > 0: + batch_neg = np.unique(np.concatenate(batch_neg)) + batch_src = np.array(batch_src, dtype="int64") + batch_dst = np.array(batch_dst, dtype="int64") + + sampled_batch_neg = alias_sample(batch_dst.shape, self.alias, self.events) + + if len(batch_neg) > 0: + batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0) + else: + batch_neg = sampled_batch_neg + + if self.phase == "train": + ignore_edges = set() + else: + ignore_edges = set() + + nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0)) + subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges) + feed_dict = {} + for i in range(self.num_layers): + feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i])) + + # only reindex from first subgraph + sub_src_idx = subgraphs[0].reindex_from_parrent_nodes(batch_src) + sub_dst_idx = subgraphs[0].reindex_from_parrent_nodes(batch_dst) + sub_neg_idx = subgraphs[0].reindex_from_parrent_nodes(batch_neg) + + feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64") + feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64") + #feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64") + feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]] + return feed_dict + + def __call__(self): + return self.generator() + + def generator(self): + try: + for feed_dict in super(GraphGenerator, self).generator(): + if self.use_pyreader: + yield [feed_dict[name] for name in self.feed_name_list] + else: + yield feed_dict + + except Exception as e: + log.exception(e) + + + diff --git a/examples/erniesage/docs/source/_static/ernie_aggregator.png 
b/examples/erniesage/docs/source/_static/ernie_aggregator.png new file mode 100644 index 0000000000000000000000000000000000000000..206a0673d76e97bcc6a47108df683583e4a0b240 Binary files /dev/null and b/examples/erniesage/docs/source/_static/ernie_aggregator.png differ diff --git a/examples/erniesage/docs/source/_static/text_graph.png b/examples/erniesage/docs/source/_static/text_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..26f89eb124f272acee2b097f39cb310416de45e1 Binary files /dev/null and b/examples/erniesage/docs/source/_static/text_graph.png differ diff --git a/examples/erniesage/infer.py b/examples/erniesage/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..20735ddc487e216c309f9bcfccc8a8ed3a602873 --- /dev/null +++ b/examples/erniesage/infer.py @@ -0,0 +1,187 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals +import argparse +import pickle +import time +import glob +import os +import io +import traceback +import pickle as pkl +role = os.getenv("TRAINING_ROLE", "TRAINER") + +import numpy as np +import yaml +from easydict import EasyDict as edict +import pgl +from pgl.utils.logger import log +from pgl.utils import paddle_helper +import paddle +import paddle.fluid as F + +from models.model_factory import Model +from dataset.graph_reader import GraphGenerator + + +class PredictData(object): + def __init__(self, num_nodes): + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + train_usr = np.arange(trainer_id, num_nodes, trainer_count) + #self.data = (train_usr, train_usr) + self.data = train_usr + + def __getitem__(self, index): + return [self.data[index], self.data[index]] + +def tostr(data_array): + return " ".join(["%.5lf" % d for d in data_array]) + +def run_predict(py_reader, + exe, + program, + model_dict, + log_per_step=1, + args=None): + + if args.input_type == "text": + id2str = np.load(os.path.join(args.graph_path, "id2str.npy"), mmap_mode="r") + + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + if not os.path.exists(args.output_path): + os.mkdir(args.output_path) + + fout = io.open("%s/part-%s" % (args.output_path, trainer_id), "w", encoding="utf8") + batch = 0 + + for batch_feed_dict in py_reader(): + batch += 1 + batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run( + program, + feed=batch_feed_dict, + fetch_list=model_dict.outputs) + + if batch % log_per_step == 0: + log.info("Predict %s finished" % batch) + + for ufs, _, sri in zip(batch_usr_feat, batch_ad_feat, batch_src_real_index): + if args.input_type == "text": + sri = id2str[int(sri)] + line = "{}\t{}\n".format(sri, tostr(ufs)) + fout.write(line) + + fout.close() + +def _warmstart(exe, program, 
path='params'): + def _existed_persitables(var): + #if not isinstance(var, fluid.framework.Parameter): + # return False + if not F.io.is_persistable(var): + return False + param_path = os.path.join(path, var.name) + log.info("Loading parameter: {} persistable: {} exists: {}".format( + param_path, + F.io.is_persistable(var), + os.path.exists(param_path), + )) + return os.path.exists(param_path) + F.io.load_vars( + exe, + path, + main_program=program, + predicate=_existed_persitables + ) + +def main(config): + model = Model.factory(config) + + if config.learner_type == "cpu": + place = F.CPUPlace() + elif config.learner_type == "gpu": + gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = F.CUDAPlace(gpu_id) + else: + raise ValueError + + exe = F.Executor(place) + + val_program = F.default_main_program().clone(for_test=True) + exe.run(F.default_startup_program()) + _warmstart(exe, F.default_startup_program(), path=config.infer_model) + + num_threads = int(os.getenv("CPU_NUM", 1)) + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) + + exec_strategy = F.ExecutionStrategy() + exec_strategy.num_threads = num_threads + build_strategy = F.BuildStrategy() + build_strategy.enable_inplace = True + build_strategy.memory_optimize = True + build_strategy.remove_unnecessary_lock = False + build_strategy.memory_optimize = False + + if num_threads > 1: + build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce + + val_compiled_prog = F.compiler.CompiledProgram( + val_program).with_data_parallel( + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + num_nodes = int(np.load(os.path.join(config.graph_path, "num_nodes.npy"))) + + predict_data = PredictData(num_nodes) + + predict_iter = GraphGenerator( + graph_wrappers=model.graph_wrappers, + batch_size=config.infer_batch_size, + data=predict_data, + samples=config.samples, + num_workers=config.sample_workers, + feed_name_list=[var.name for var in model.feed_list], + use_pyreader=config.use_pyreader, + phase="predict", + graph_data_path=config.graph_path, + shuffle=False) + + if config.learner_type == "cpu": + model.data_loader.decorate_batch_generator( + predict_iter, places=F.cpu_places()) + elif config.learner_type == "gpu": + gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = F.CUDAPlace(gpu_id) + model.data_loader.decorate_batch_generator( + predict_iter, places=place) + else: + raise ValueError + + run_predict(model.data_loader, + program=val_compiled_prog, + exe=exe, + model_dict=model, + args=config) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='main') + parser.add_argument("--conf", type=str, default="./config.yaml") + args = parser.parse_args() + config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader)) + print(config) + main(config) diff --git a/examples/erniesage/job.sh b/examples/erniesage/job.sh new file mode 100644 index 0000000000000000000000000000000000000000..77739df75a3250b1f4d096a50c0841cac5b83a6a --- /dev/null +++ b/examples/erniesage/job.sh @@ -0,0 +1,45 @@ + +unset http_proxy https_proxy +set -x +mode=${1:-local} +config=${2:-"./config.yaml"} + +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + 
vn=""; for (i=0; i 1: + build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce + + log.info("start build compile program...") + compiled_prog = F.compiler.CompiledProgram(tfleet.main_program + ).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + return compiled_prog + + +class CollectiveLearner(Learner): + def __init__(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + cfleet.init(role) + + def optimize(self, loss, optimizer_type, lr): + optimizer = F.optimizer.Adam(learning_rate=lr) + dist_strategy = DistributedStrategy() + optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy) + _, param_grads = optimizer.minimize(loss, F.default_startup_program()) + + def build(self, model, data_gen, config): + self.optimize(model.loss, config.optimizer_type, config.lr) + self.program = cfleet.main_program + gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = F.CUDAPlace(gpu_id) + self.exe = F.Executor(place) + self.exe.run(F.default_startup_program()) + self.warmstart(F.default_startup_program(), config.ckpt_path) + self.fleet = cfleet + model.data_loader.decorate_batch_generator( + data_gen, places=place) + self.config = config + self.model = model diff --git a/examples/erniesage/local_run.sh b/examples/erniesage/local_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b76d1c8288472d5c84abfd8585032c71ace8dab --- /dev/null +++ b/examples/erniesage/local_run.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -x +config=${1:-"./config.yaml"} +unset http_proxy https_proxy + +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i $BASE/pserver.$i.log & + echo $! >> job_id + done + sleep 3s + for((j=0;j<${PADDLE_TRAINERS_NUM};j++)) + do + echo "start ps work: ${j}" + TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \ + echo $! >> job_id + done +} + +collective_local_train(){ + export PATH=./python27-gcc482-gpu/bin/:$PATH + echo `which python` + python -m paddle.distributed.launch train.py --conf $config + python -m paddle.distributed.launch infer.py --conf $config +} + +eval $(parse_yaml $config) +unalias python + +python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \ + -l $max_seqlen --vocab_file $ernie_vocab_file + +if [[ $learner_type == "cpu" ]];then + transpiler_local_train +fi +if [[ $learner_type == "gpu" ]];then + collective_local_train +fi diff --git a/examples/erniesage/models/__init__.py b/examples/erniesage/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/erniesage/models/base.py b/examples/erniesage/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c910c714db4269bc9b3bac391e2adf2699f476c6 --- /dev/null +++ b/examples/erniesage/models/base.py @@ -0,0 +1,202 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time +import glob +import os + +import numpy as np + +import pgl +import paddle.fluid as F +import paddle.fluid.layers as L + +from models import message_passing + +def get_layer(layer_type, gw, feature, hidden_size, act, initializer, learning_rate, name, is_test=False): + return getattr(message_passing, layer_type)(gw, feature, hidden_size, act, initializer, learning_rate, name) + + +class BaseGraphWrapperBuilder(object): + def __init__(self, config): + self.config = config + self.node_feature_info = [] + self.edge_feature_info = [] + + def __call__(self): + place = F.CPUPlace() + graph_wrappers = [] + for i in range(self.config.num_layers): + # all graph have same node_feat_info + graph_wrappers.append( + pgl.graph_wrapper.GraphWrapper( + "layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info)) + return graph_wrappers + + +class GraphsageGraphWrapperBuilder(BaseGraphWrapperBuilder): + def __init__(self, config): + super(GraphsageGraphWrapperBuilder, self).__init__(config) + self.node_feature_info.append(('index', [None], np.dtype('int64'))) + + +class BaseGNNModel(object): + def __init__(self, config): + self.config = config + self.graph_wrapper_builder = self.gen_graph_wrapper_builder(config) + self.net_fn = self.gen_net_fn(config) + self.feed_list_builder = self.gen_feed_list_builder(config) + self.data_loader_builder = self.gen_data_loader_builder(config) + self.loss_fn = self.gen_loss_fn(config) + self.build() + + + def gen_graph_wrapper_builder(self, config): + return GraphsageGraphWrapperBuilder(config) + + def gen_net_fn(self, config): + return BaseNet(config) + + def gen_feed_list_builder(self, config): + return BaseFeedListBuilder(config) + + def gen_data_loader_builder(self, config): + return BaseDataLoaderBuilder(config) + + def gen_loss_fn(self, config): + return BaseLoss(config) + + def build(self): + self.graph_wrappers = self.graph_wrapper_builder() + self.inputs, self.outputs = self.net_fn(self.graph_wrappers) + self.feed_list = self.feed_list_builder(self.inputs, self.graph_wrappers) + self.data_loader = self.data_loader_builder(self.feed_list) + self.loss = self.loss_fn(self.outputs) + +class BaseFeedListBuilder(object): + def __init__(self, config): + self.config = config + + def __call__(self, inputs, graph_wrappers): + feed_list = [] + for i in range(len(graph_wrappers)): + feed_list.extend(graph_wrappers[i].holder_list) + feed_list.extend(inputs) + return feed_list + + +class BaseDataLoaderBuilder(object): + def __init__(self, config): + self.config = config + + def __call__(self, feed_list): + data_loader = F.io.PyReader( + feed_list=feed_list, capacity=20, use_double_buffer=True, iterable=True) + return data_loader + + + +class BaseNet(object): + def __init__(self, config): + self.config = config + + def take_final_feature(self, feature, index, name): + """take final feature""" + feat = L.gather(feature, index, overwrite=False) + + if self.config.final_fc: + feat = L.fc(feat, + self.config.hidden_size, + param_attr=F.ParamAttr(name=name + '_w'), + bias_attr=F.ParamAttr(name=name + '_b')) + + if 
self.config.final_l2_norm:
+            feat = L.l2_normalize(feat, axis=1)
+        return feat
+
+    def build_inputs(self):
+        user_index = L.data(
+            "user_index", shape=[None], dtype="int64", append_batch_size=False)
+        item_index = L.data(
+            "item_index", shape=[None], dtype="int64", append_batch_size=False)
+        return [user_index, item_index]
+
+    def build_embedding(self, graph_wrappers, inputs=None):
+        num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
+        is_sparse = self.config.trainer_type == "Transpiler"
+
+        embed = L.embedding(
+            input=L.reshape(graph_wrappers[0].node_feat['index'], [-1, 1]),
+            size=[num_embed, self.config.hidden_size],
+            is_sparse=is_sparse,
+            param_attr=F.ParamAttr(name="node_embedding", initializer=F.initializer.Uniform(
+                low=-1. / self.config.hidden_size,
+                high=1. / self.config.hidden_size)))
+        return embed
+
+    def gnn_layers(self, graph_wrappers, feature):
+        features = [feature]
+
+        initializer = None
+        fc_lr = self.config.lr / 0.001
+
+        for i in range(self.config.num_layers):
+            if i == self.config.num_layers - 1:
+                act = None
+            else:
+                act = "leaky_relu"
+
+            feature = get_layer(
+                self.config.layer_type,
+                graph_wrappers[i],
+                feature,
+                self.config.hidden_size,
+                act,
+                initializer,
+                learning_rate=fc_lr,
+                name="%s_%s" % (self.config.layer_type, i))
+            features.append(feature)
+        return features
+
+    def __call__(self, graph_wrappers):
+        inputs = self.build_inputs()
+        feature = self.build_embedding(graph_wrappers, inputs)
+        features = self.gnn_layers(graph_wrappers, feature)
+        outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs]
+        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
+        outputs.append(src_real_index)
+        return inputs, outputs
+
+class BaseLoss(object):
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, outputs):
+        user_feat, item_feat = outputs[0], outputs[1]
+        loss_type = self.config.loss_type
+        # Calc Loss
+        if loss_type == "hinge":
+            pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True)  # [B, 1]
+            neg = L.matmul(user_feat, item_feat, transpose_y=True)  # [B, B]
+            loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
+        elif loss_type == "softmax":
+            pass
+            # TODO
+            # pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
+            # neg = L.matmul(user_feat, neg_feat, transpose_y=True) # [B, B]
+            # logits = L.concat([pos, neg], -1) # [B, 1+B]
+            # labels = L.fill_constant_batch_size_like(logits, [-1, 1], "int64", 0)
+            # loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, labels))
+        else:
+            raise ValueError("unknown loss_type: %s" % loss_type)
+        return loss
diff --git a/examples/erniesage/models/ernie.py b/examples/erniesage/models/ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b53ebd439713f7f249c21ad96cafa0b2ae84f06
--- /dev/null
+++ b/examples/erniesage/models/ernie.py
@@ -0,0 +1,40 @@
+"""Ernie
+"""
+import paddle.fluid.layers as L
+
+from models.base import BaseNet, BaseGNNModel
+# Imported under an alias: the BaseGNNModel wrapper defined at the bottom of
+# this file is also named ErnieModel and would otherwise shadow the encoder.
+from models.ernie_model.ernie import ErnieModel as ErnieEncoder
+
+class Ernie(BaseNet):
+
+    def build_inputs(self):
+        inputs = super(Ernie, self).build_inputs()
+        term_ids = L.data(
+            "term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
+        return inputs + [term_ids]
+
+    def build_embedding(self, graph_wrappers, term_ids):
+        term_ids = L.unsqueeze(term_ids, [-1])
+        ernie_config = self.config.ernie_config
+        ernie = ErnieEncoder(
+            src_ids=term_ids,
+            sentence_ids=L.zeros_like(term_ids),
+            task_ids=None,
+            config=ernie_config,
+            use_fp16=False,
+            name="student_")
+        feature = ernie.get_pooled_output()
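+        # get_pooled_output() is the [CLS]-position vector run through a tanh fc
+        # inside the encoder, so `feature` is [num_nodes, 768] with the configs
+        # above; take_final_feature() below projects it to config.hidden_size.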
return feature + + def __call__(self, graph_wrappers): + inputs = self.build_inputs() + feature = self.build_embedding(graph_wrappers, inputs[-1]) + features = [feature] + outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]] + src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0]) + outputs.append(src_real_index) + return inputs, outputs + + +class ErnieModel(BaseGNNModel): + def gen_net_fn(self, config): + return Ernie(config) + + diff --git a/examples/erniesage/models/ernie_model/__init__.py b/examples/erniesage/models/ernie_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/erniesage/models/ernie_model/ernie.py b/examples/erniesage/models/ernie_model/ernie.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba4f9bbd82f3889e66a8ff16aa7f1eee27abc79 --- /dev/null +++ b/examples/erniesage/models/ernie_model/ernie.py @@ -0,0 +1,399 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ernie model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import json +import six +import logging +import paddle.fluid as fluid +import paddle.fluid.layers as L + +from io import open + +from models.ernie_model.transformer_encoder import encoder, pre_process_layer +from models.ernie_model.transformer_encoder import graph_encoder + +log = logging.getLogger(__name__) + + +class ErnieConfig(object): + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path, 'r', encoding='utf8') as json_file: + config_dict = json.load(json_file) + except Exception: + raise IOError("Error in parsing Ernie model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict.get(key, None) + + def print_config(self): + for arg, value in sorted(six.iteritems(self._config_dict)): + log.info('%s: %s' % (arg, value)) + log.info('------------------------------------------------') + + +class ErnieModel(object): + def __init__(self, + src_ids, + sentence_ids, + task_ids=None, + config=None, + weight_sharing=True, + use_fp16=False, + name=""): + + self._set_config(config, name, weight_sharing) + input_mask = self._build_input_mask(src_ids) + position_ids = self._build_position_ids(src_ids) + self._build_model(src_ids, position_ids, sentence_ids, task_ids, + input_mask) + self._debug_summary(input_mask) + + def _debug_summary(self, input_mask): + #histogram + seqlen_before_pad = L.cast( + L.reduce_sum( + input_mask, dim=1), dtype='float32') + seqlen_after_pad = L.reduce_sum( + L.cast( + L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1) + pad_num = seqlen_after_pad - seqlen_before_pad + pad_rate = pad_num / seqlen_after_pad + + def 
_build_position_ids(self, src_ids): + d_shape = L.shape(src_ids) + d_seqlen = d_shape[1] + d_batch = d_shape[0] + position_ids = L.reshape( + L.range( + 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], + inplace=True) + position_ids = L.expand(position_ids, [d_batch, 1, 1]) + position_ids = L.cast(position_ids, 'int64') + position_ids.stop_gradient = True + return position_ids + + def _build_input_mask(self, src_ids): + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.logical_not(L.equal(src_ids, + zero)) # assume pad id == 0 + input_mask = L.cast(input_mask, 'float') + input_mask.stop_gradient = True + return input_mask + + def _set_config(self, config, name, weight_sharing): + self._emb_size = config['hidden_size'] + self._n_layer = config['num_hidden_layers'] + self._n_head = config['num_attention_heads'] + self._voc_size = config['vocab_size'] + self._max_position_seq_len = config['max_position_embeddings'] + if config.get('sent_type_vocab_size'): + self._sent_types = config['sent_type_vocab_size'] + else: + self._sent_types = config['type_vocab_size'] + + self._use_task_id = config['use_task_id'] + if self._use_task_id: + self._task_types = config['task_type_vocab_size'] + self._hidden_act = config['hidden_act'] + self._postprocess_cmd = config.get('postprocess_cmd', 'dan') + self._preprocess_cmd = config.get('preprocess_cmd', '') + self._prepostprocess_dropout = config['hidden_dropout_prob'] + self._attention_dropout = config['attention_probs_dropout_prob'] + self._weight_sharing = weight_sharing + self.name = name + + self._word_emb_name = self.name + "word_embedding" + self._pos_emb_name = self.name + "pos_embedding" + self._sent_emb_name = self.name + "sent_embedding" + self._task_emb_name = self.name + "task_embedding" + self._dtype = "float16" if config['use_fp16'] else "float32" + self._emb_dtype = "float32" + + # Initialize all weigths by truncated normal initializer, and all biases + # will be initialized by constant zero by default. 
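+        # With the configs above (initializer_range: 0.02) this is a truncated
+        # normal with std 0.02; `scale` is the std argument in fluid's API.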
+ self._param_initializer = fluid.initializer.TruncatedNormal( + scale=config['initializer_range']) + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, + input_mask): + + emb_out = self._build_embedding(src_ids, position_ids, sentence_ids, + task_ids) + self.input_mask = input_mask + self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder( + enc_input=emb_out, + input_mask=input_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd=self._preprocess_cmd, + postprocess_cmd=self._postprocess_cmd, + param_initializer=self._param_initializer, + name=self.name + 'encoder') + if self._dtype == "float16": + self._enc_out = fluid.layers.cast( + x=self._enc_out, dtype=self._emb_dtype) + + def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids): + # padding id in vocabulary must be set to 0 + emb_out = fluid.layers.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + + position_emb_out = fluid.layers.embedding( + input=position_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.layers.embedding( + sentence_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._sent_emb_name, initializer=self._param_initializer)) + + self.all_emb = [emb_out, position_emb_out, sent_emb_out] + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + if self._use_task_id: + task_emb_out = fluid.layers.embedding( + task_ids, + size=[self._task_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=self._task_emb_name, + initializer=self._param_initializer)) + + emb_out = emb_out + task_emb_out + + emb_out = pre_process_layer( + emb_out, + 'nd', + self._prepostprocess_dropout, + name=self.name + 'pre_encoder') + + if self._dtype == "float16": + emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype) + return emb_out + + def get_sequence_output(self): + return self._enc_out + + def get_pooled_output(self): + """Get the first feature of each sequence for classification""" + next_sent_feat = self._enc_out[:, 0, :] + #next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name=self.name + "pooled_fc.w_0", + initializer=self._param_initializer), + bias_attr=self.name + "pooled_fc.b_0") + return next_sent_feat + + def get_lm_output(self, mask_label, mask_pos): + """Get the loss & accuracy for pretraining""" + + mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') + + # extract the first token feature in each sentence + self.next_sent_feat = self.get_pooled_output() + reshaped_emb_out = fluid.layers.reshape( + x=self._enc_out, shape=[-1, self._emb_size]) + # extract masked tokens' feature + mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) + + # transform: fc + 
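+        # mask_feat is [num_masked_tokens, emb_size] after the gather above; this
+        # fc keeps that width and applies the encoder's hidden activation.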
mask_trans_feat = fluid.layers.fc( + input=mask_feat, + size=self._emb_size, + act=self._hidden_act, + param_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_fc.w_0', + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0')) + + # transform: layer norm + mask_trans_feat = fluid.layers.layer_norm( + mask_trans_feat, + begin_norm_axis=len(mask_trans_feat.shape) - 1, + param_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_layer_norm_scale', + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + name=self.name + 'mask_lm_trans_layer_norm_bias', + initializer=fluid.initializer.Constant(0.))) + # transform: layer norm + #mask_trans_feat = pre_process_layer( + # mask_trans_feat, 'n', name=self.name + 'mask_lm_trans') + + mask_lm_out_bias_attr = fluid.ParamAttr( + name=self.name + "mask_lm_out_fc.b_0", + initializer=fluid.initializer.Constant(value=0.0)) + if self._weight_sharing: + fc_out = fluid.layers.matmul( + x=mask_trans_feat, + y=fluid.default_main_program().global_block().var( + self._word_emb_name), + transpose_y=True) + fc_out += fluid.layers.create_parameter( + shape=[self._voc_size], + dtype=self._emb_dtype, + attr=mask_lm_out_bias_attr, + is_bias=True) + + else: + fc_out = fluid.layers.fc(input=mask_trans_feat, + size=self._voc_size, + param_attr=fluid.ParamAttr( + name=self.name + "mask_lm_out_fc.w_0", + initializer=self._param_initializer), + bias_attr=mask_lm_out_bias_attr) + + mask_lm_loss = fluid.layers.softmax_with_cross_entropy( + logits=fc_out, label=mask_label) + return mask_lm_loss + + def get_task_output(self, task, task_labels): + task_fc_out = fluid.layers.fc( + input=self.next_sent_feat, + size=task["num_labels"], + param_attr=fluid.ParamAttr( + name=self.name + task["task_name"] + "_fc.w_0", + initializer=self._param_initializer), + bias_attr=self.name + task["task_name"] + "_fc.b_0") + task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( + logits=task_fc_out, label=task_labels, return_softmax=True) + task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) + return task_loss, task_acc + + +class ErnieGraphModel(ErnieModel): + def __init__(self, + src_ids, + task_ids=None, + config=None, + weight_sharing=True, + use_fp16=False, + slot_seqlen=40, + name=""): + self.slot_seqlen = slot_seqlen + self._set_config(config, name, weight_sharing) + input_mask = self._build_input_mask(src_ids) + position_ids = self._build_position_ids(src_ids) + sentence_ids = self._build_sentence_ids(src_ids) + self._build_model(src_ids, position_ids, sentence_ids, task_ids, + input_mask) + self._debug_summary(input_mask) + + def _build_position_ids(self, src_ids): + src_shape = L.shape(src_ids) + src_seqlen = src_shape[1] + src_batch = src_shape[0] + + slot_seqlen = self.slot_seqlen + + num_b = (src_seqlen / slot_seqlen) - 1 + a_position_ids = L.reshape( + L.range( + 0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1], + inplace=True) # [1, slot_seqlen, 1] + a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen * num_b, 1] + + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32") # assume pad id == 0 [B, slot_seqlen, 1] + a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1] + + b_position_ids = L.reshape( + L.range( + slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1], + inplace=True) # [1, slot_seqlen, 1] + b_position_ids = L.expand(b_position_ids, [src_batch, 
num_b, 1]) # [B, slot_seqlen * num_b, 1] + b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1] + + position_ids = L.concat([a_position_ids, b_position_ids], 1) + position_ids = L.cast(position_ids, 'int64') + position_ids.stop_gradient = True + return position_ids + + def _build_sentence_ids(self, src_ids): + src_shape = L.shape(src_ids) + src_seqlen = src_shape[1] + src_batch = src_shape[0] + + slot_seqlen = self.slot_seqlen + + zeros = L.zeros([src_batch, slot_seqlen, 1], "int64") + ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64") + + sentence_ids = L.concat([zeros, ones], 1) + sentence_ids.stop_gradient = True + return sentence_ids + + def _build_model(self, src_ids, position_ids, sentence_ids, task_ids, + input_mask): + + emb_out = self._build_embedding(src_ids, position_ids, sentence_ids, + task_ids) + self.input_mask = input_mask + self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder( + enc_input=emb_out, + input_mask=input_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd=self._preprocess_cmd, + postprocess_cmd=self._postprocess_cmd, + param_initializer=self._param_initializer, + slot_seqlen=self.slot_seqlen, + name=self.name + 'encoder') + if self._dtype == "float16": + self._enc_out = fluid.layers.cast( + x=self._enc_out, dtype=self._emb_dtype) diff --git a/examples/erniesage/models/ernie_model/transformer_encoder.py b/examples/erniesage/models/ernie_model/transformer_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..eef78f95595d90a6589ee10ad35cfaf0c88f485a --- /dev/null +++ b/examples/erniesage/models/ernie_model/transformer_encoder.py @@ -0,0 +1,518 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import numpy as np +from contextlib import contextmanager + +import paddle.fluid as fluid +import paddle.fluid.layers as L +import paddle.fluid.layers as layers +#import propeller.paddle as propeller +#from propeller import log + +#determin this at the begining +to_3d = lambda a: a # will change later +to_2d = lambda a: a + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + param_initializer=None, + name='multi_head_att'): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. 
+ """ + keys = queries if keys is None else keys + values = keys if values is None else values + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + num_flatten_dims=len(queries.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_query_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_query_fc.b_0') + k = layers.fc(input=keys, + size=d_key * n_head, + num_flatten_dims=len(keys.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_key_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_key_fc.b_0') + v = layers.fc(input=values, + size=d_value * n_head, + num_flatten_dims=len(values.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_value_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_value_fc.b_0') + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + #trans_x.desc.set_shape((-1, 1, n_head, d_value)) + return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_key**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.matmul(weights, v) + #return out, product + return out, weights + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + q = to_3d(q) + k = to_3d(k) + v = to_3d(v) + + if cache is not None: # use cache and concat time steps + # Since the inplace reshape in __split_heads changes the shape of k and + # v, which is the cache input for next time step, reshape the cache + # input from the previous time step first. 
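+    # Incremental-decoding path: cached k/v from earlier steps are concatenated
+    # along the time axis before heads are split again. The encoders in this
+    # example always call with cache=None, so this branch is never taken here.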
+ k = cache["k"] = layers.concat( + [layers.reshape( + cache["k"], shape=[0, 0, d_model]), k], axis=1) + v = cache["v"] = layers.concat( + [layers.reshape( + cache["v"], shape=[0, 0, d_model]), v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads, ctx_multiheads_attn = scaled_dot_product_attention( + q, k, v, attn_bias, d_key, dropout_rate) + + out = __combine_heads(ctx_multiheads) + + out = to_2d(out) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + num_flatten_dims=len(out.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_output_fc.w_0', + initializer=param_initializer), + bias_attr=name + '_output_fc.b_0') + return proj_out, ctx_multiheads_attn + + +def positionwise_feed_forward(x, + d_inner_hid, + d_hid, + dropout_rate, + hidden_act, + param_initializer=None, + name='ffn'): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=len(x.shape) - 1, + act=hidden_act, + param_attr=fluid.ParamAttr( + name=name + '_fc_0.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_0.b_0') + if dropout_rate: + hidden = layers.dropout( + hidden, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=len(hidden.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_fc_1.w_0', + initializer=param_initializer), + bias_attr=name + '_fc_1.b_0') + return out + + +def pre_post_process_layer(prev_out, + out, + process_cmd, + dropout_rate=0., + name=''): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out_dtype = out.dtype + if out_dtype == fluid.core.VarDesc.VarType.FP16: + out = layers.cast(x=out, dtype="float32") + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.ParamAttr( + name=name + '_layer_norm_scale', + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + name=name + '_layer_norm_bias', + initializer=fluid.initializer.Constant(0.))) + if out_dtype == fluid.core.VarDesc.VarType.FP16: + out = layers.cast(x=out, dtype="float16") + elif cmd == "d": # add dropout + if dropout_rate: + out = layers.dropout( + out, + dropout_prob=dropout_rate, + dropout_implementation="upscale_in_train", + is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """The encoder layers that can be stacked to form a deep encoder. 
+ This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + #L.Print(L.reduce_mean(enc_input), message='1') + attn_output, ctx_multiheads_attn = multi_head_attention( + pre_process_layer( + enc_input, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_att'), + None, + None, + attn_bias, + d_key, + d_value, + d_model, + n_head, + attention_dropout, + param_initializer=param_initializer, + name=name + '_multi_head_att') + #L.Print(L.reduce_mean(attn_output), message='1') + attn_output = post_process_layer( + enc_input, + attn_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_att') + + #L.Print(L.reduce_mean(attn_output), message='2') + ffd_output = positionwise_feed_forward( + pre_process_layer( + attn_output, + preprocess_cmd, + prepostprocess_dropout, + name=name + '_pre_ffn'), + d_inner_hid, + d_model, + relu_dropout, + hidden_act, + param_initializer=param_initializer, + name=name + '_ffn') + #L.Print(L.reduce_mean(ffd_output), message='3') + ret = post_process_layer( + attn_output, + ffd_output, + postprocess_cmd, + prepostprocess_dropout, + name=name + '_post_ffn') + #L.Print(L.reduce_mean(ret), message='4') + return ret, ctx_multiheads_attn, ffd_output + + +def build_pad_idx(input_mask): + pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool')) + return pad_idx + + +def build_attn_bias(input_mask, n_head, dtype): + attn_bias = L.matmul( + input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + attn_bias = (1. - attn_bias) * -10000. + attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq] + if attn_bias.dtype != dtype: + attn_bias = L.cast(attn_bias, dtype) + return attn_bias + + +def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen): + + input_shape = L.shape(input_mask) + input_batch = input_shape[0] + input_seqlen = input_shape[1] + num_slot = input_seqlen / slot_seqlen + num_b = num_slot - 1 + ones = L.ones([num_b], dtype="float32") # [num_b] + diag_ones = L.diag(ones) # [num_b, num_b] + diag_ones = L.unsqueeze(diag_ones, [1, -1]) # [num_b, 1, num_b, 1] + diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen]) # [num_b, seqlen, num_b, seqlen] + diag_ones = L.reshape(diag_ones, [1, num_b*slot_seqlen, num_b*slot_seqlen]) # [1, num_b*seqlen, num_b*seqlen] + + graph_attn_bias = L.concat([L.ones([1, num_b*slot_seqlen, slot_seqlen], dtype="float32"), diag_ones], 2) + graph_attn_bias = L.concat([L.ones([1, slot_seqlen, num_slot*slot_seqlen], dtype="float32"), graph_attn_bias], 1) # [1, seq, seq] + + pad_attn_bias = L.matmul( + input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + attn_bias = graph_attn_bias * pad_attn_bias + + attn_bias = (1. - attn_bias) * -10000. + attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq] + if attn_bias.dtype != dtype: + attn_bias = L.cast(attn_bias, dtype) + return attn_bias + + +def encoder(enc_input, + input_mask, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. 
+ """ + + #global to_2d, to_3d #, batch, seqlen, dynamic_dim + d_shape = L.shape(input_mask) + pad_idx = build_pad_idx(input_mask) + attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype) + + # d_batch = d_shape[0] + # d_seqlen = d_shape[1] + # pad_idx = L.where( + # L.cast(L.reshape(input_mask, [d_batch, d_seqlen]), 'bool')) + + # attn_bias = L.matmul( + # input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + # attn_bias = (1. - attn_bias) * -10000. + # attn_bias = L.stack([attn_bias] * n_head, 1) + # if attn_bias.dtype != enc_input.dtype: + # attn_bias = L.cast(attn_bias, enc_input.dtype) + + # def to_2d(t_3d): + # t_2d = L.gather_nd(t_3d, pad_idx) + # return t_2d + + # def to_3d(t_2d): + # t_3d = L.scatter_nd( + # pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model]) + # return t_3d + + enc_input = to_2d(enc_input) + all_hidden = [] + all_attn = [] + all_ffn = [] + for i in range(n_layer): + enc_output, ctx_multiheads_attn, ffn_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + all_hidden.append(enc_output) + all_attn.append(ctx_multiheads_attn) + all_ffn.append(ffn_output) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, + preprocess_cmd, + prepostprocess_dropout, + name="post_encoder") + enc_output = to_3d(enc_output) + #enc_output.desc.set_shape((-1, 1, final_dim)) + return enc_output, all_hidden, all_attn, all_ffn + +def graph_encoder(enc_input, + input_mask, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd="n", + postprocess_cmd="da", + param_initializer=None, + slot_seqlen=40, + name=''): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + + #global to_2d, to_3d #, batch, seqlen, dynamic_dim + d_shape = L.shape(input_mask) + pad_idx = build_pad_idx(input_mask) + attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen) + #attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype) + + # d_batch = d_shape[0] + # d_seqlen = d_shape[1] + # pad_idx = L.where( + # L.cast(L.reshape(input_mask, [d_batch, d_seqlen]), 'bool')) + + # attn_bias = L.matmul( + # input_mask, input_mask, transpose_y=True) # [batch, seq, seq] + # attn_bias = (1. - attn_bias) * -10000. 
+ # attn_bias = L.stack([attn_bias] * n_head, 1) + # if attn_bias.dtype != enc_input.dtype: + # attn_bias = L.cast(attn_bias, enc_input.dtype) + + # def to_2d(t_3d): + # t_2d = L.gather_nd(t_3d, pad_idx) + # return t_2d + + # def to_3d(t_2d): + # t_3d = L.scatter_nd( + # pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model]) + # return t_3d + + enc_input = to_2d(enc_input) + all_hidden = [] + all_attn = [] + all_ffn = [] + for i in range(n_layer): + enc_output, ctx_multiheads_attn, ffn_output = encoder_layer( + enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + preprocess_cmd, + postprocess_cmd, + param_initializer=param_initializer, + name=name + '_layer_' + str(i)) + all_hidden.append(enc_output) + all_attn.append(ctx_multiheads_attn) + all_ffn.append(ffn_output) + enc_input = enc_output + enc_output = pre_process_layer( + enc_output, + preprocess_cmd, + prepostprocess_dropout, + name="post_encoder") + enc_output = to_3d(enc_output) + #enc_output.desc.set_shape((-1, 1, final_dim)) + return enc_output, all_hidden, all_attn, all_ffn diff --git a/examples/erniesage/models/erniesage_v1.py b/examples/erniesage/models/erniesage_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..696231a785ffecb8098c3379c7a7b0f4ee935e33 --- /dev/null +++ b/examples/erniesage/models/erniesage_v1.py @@ -0,0 +1,42 @@ +import pgl +import paddle.fluid as F +import paddle.fluid.layers as L +from models.base import BaseNet, BaseGNNModel +from models.ernie_model.ernie import ErnieModel +from models.ernie_model.ernie import ErnieGraphModel +from models.ernie_model.ernie import ErnieConfig + +class ErnieSageV1(BaseNet): + + def build_inputs(self): + inputs = super(ErnieSageV1, self).build_inputs() + term_ids = L.data( + "term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False) + return inputs + [term_ids] + + def build_embedding(self, graph_wrappers, term_ids): + term_ids = L.unsqueeze(term_ids, [-1]) + ernie_config = self.config.ernie_config + ernie = ErnieModel( + src_ids=term_ids, + sentence_ids=L.zeros_like(term_ids), + task_ids=None, + config=ernie_config, + use_fp16=False, + name="student_") + feature = ernie.get_pooled_output() + return feature + + def __call__(self, graph_wrappers): + inputs = self.build_inputs() + feature = self.build_embedding(graph_wrappers, inputs[-1]) + features = self.gnn_layers(graph_wrappers, feature) + outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]] + src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0]) + outputs.append(src_real_index) + return inputs, outputs + + +class ErnieSageModelV1(BaseGNNModel): + def gen_net_fn(self, config): + return ErnieSageV1(config) diff --git a/examples/erniesage/models/erniesage_v2.py b/examples/erniesage/models/erniesage_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e5c03c1fcc0076a8566035c5d523b2bfbf76eb7c --- /dev/null +++ b/examples/erniesage/models/erniesage_v2.py @@ -0,0 +1,102 @@ +import pgl +import paddle.fluid as F +import paddle.fluid.layers as L +from models.base import BaseNet, BaseGNNModel +from models.ernie_model.ernie import ErnieModel +from models.ernie_model.ernie import ErnieGraphModel +from models.ernie_model.ernie import ErnieConfig + + +class ErnieSageV2(BaseNet): + + def build_inputs(self): + inputs = super(ErnieSageV2, self).build_inputs() + term_ids = L.data( + "term_ids", shape=[None, 
self.config.max_seqlen], dtype="int64", append_batch_size=False) + return inputs + [term_ids] + + def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name): + def ernie_send(src_feat, dst_feat, edge_feat): + """doc""" + cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1) + src_ids = L.concat([cls, src_feat["term_ids"]], 1) + dst_ids = dst_feat["term_ids"] + + sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1) + term_ids = L.concat([src_ids, dst_ids], 1) + + term_ids.stop_gradient = True + sent_ids.stop_gradient = True + ernie = ErnieModel( + term_ids, sent_ids, + config=self.config.ernie_config) + feature = ernie.get_pooled_output() + return feature + + def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name): + feature = L.unsqueeze(feature, [-1]) + msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)]) + neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum")) + + term_ids = feature + cls = L.fill_constant_batch_size_like(term_ids, [-1, 1, 1], "int64", 1) + term_ids = L.concat([cls, term_ids], 1) + term_ids.stop_gradient = True + ernie = ErnieModel( + term_ids, L.zeros_like(term_ids), + config=self.config.ernie_config) + self_feature = ernie.get_pooled_output() + + self_feature = L.fc(self_feature, + hidden_size, + act=act, + param_attr=F.ParamAttr(name=name + "_l", + learning_rate=learning_rate), + ) + neigh_feature = L.fc(neigh_feature, + hidden_size, + act=act, + param_attr=F.ParamAttr(name=name + "_r", + learning_rate=learning_rate), + ) + output = L.concat([self_feature, neigh_feature], axis=1) + output = L.l2_normalize(output, axis=1) + return output + return erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name) + + def gnn_layers(self, graph_wrappers, feature): + features = [feature] + + initializer = None + fc_lr = self.config.lr / 0.001 + + for i in range(self.config.num_layers): + if i == self.config.num_layers - 1: + act = None + else: + act = "leaky_relu" + + feature = self.gnn_layer( + graph_wrappers[i], + feature, + self.config.hidden_size, + act, + initializer, + learning_rate=fc_lr, + name="%s_%s" % ("erniesage_v2", i)) + features.append(feature) + return features + + def __call__(self, graph_wrappers): + inputs = self.build_inputs() + feature = inputs[-1] + features = self.gnn_layers(graph_wrappers, feature) + outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]] + src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0]) + outputs.append(src_real_index) + return inputs, outputs + + +class ErnieSageModelV2(BaseGNNModel): + def gen_net_fn(self, config): + return ErnieSageV2(config) diff --git a/examples/erniesage/models/erniesage_v3.py b/examples/erniesage/models/erniesage_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd9968a49213c4bc5fbb0deb5316e44bba30a12 --- /dev/null +++ b/examples/erniesage/models/erniesage_v3.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pgl
+import paddle.fluid as F
+import paddle.fluid.layers as L
+
+from models.base import BaseNet, BaseGNNModel
+from models.ernie_model.ernie import ErnieModel
+from models.ernie_model.ernie import ErnieGraphModel
+from models.ernie_model.ernie import ErnieConfig
+from models.message_passing import copy_send
+
+
+class ErnieSageV3(BaseNet):
+    def __init__(self, config):
+        super(ErnieSageV3, self).__init__(config)
+        self.config.layer_type = "ernie_recv_sum"
+
+    def build_inputs(self):
+        inputs = super(ErnieSageV3, self).build_inputs()
+        term_ids = L.data(
+            "term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
+        return inputs + [term_ids]
+
+    def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
+        def ernie_recv(feat):
+            """Pad each node's neighbour token ids into one fixed-size block."""
+            # TODO: make this configurable; 400 assumes maxlen (10) * max_seqlen (40).
+            pad_value = L.zeros([1], "int64")
+            out, _ = L.sequence_pad(feat, pad_value=pad_value, maxlen=10)
+            out = L.reshape(out, [0, 400])
+            return out
+
+        def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
+            msg = gw.send(copy_send, nfeat_list=[("h", feature)])
+            neigh_feature = gw.recv(msg, ernie_recv)
+            neigh_feature = L.cast(L.unsqueeze(neigh_feature, [-1]), "int64")
+
+            feature = L.unsqueeze(feature, [-1])
+            cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
+            term_ids = L.concat([cls, feature[:, :-1], neigh_feature], 1)
+            term_ids.stop_gradient = True
+            return term_ids
+        return erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
+
+    def gnn_layers(self, graph_wrappers, feature):
+        features = [feature]
+
+        initializer = None
+        fc_lr = self.config.lr / 0.001
+
+        for i in range(self.config.num_layers):
+            if i == self.config.num_layers - 1:
+                act = None
+            else:
+                act = "leaky_relu"
+
+            feature = self.gnn_layer(
+                graph_wrappers[i],
+                feature,
+                self.config.hidden_size,
+                act,
+                initializer,
+                learning_rate=fc_lr,
+                name="%s_%s" % (self.config.layer_type, i))
+            features.append(feature)
+        return features
+
+    def take_final_feature(self, feature, index, name):
+        """take final feature"""
+        feat = L.gather(feature, index, overwrite=False)
+
+        ernie_config = self.config.ernie_config
+        ernie = ErnieGraphModel(
+            src_ids=feat,
+            config=ernie_config,
+            slot_seqlen=self.config.max_seqlen,
+            name="student_")
+        feat = ernie.get_pooled_output()
+        fc_lr = self.config.lr / 0.001
+        feat = L.fc(feat,
+                    self.config.hidden_size,
+                    act="relu",
+                    param_attr=F.ParamAttr(name=name + "_l",
+                                           learning_rate=fc_lr),
+                    )
+        feat = L.l2_normalize(feat, axis=1)
+
+        if self.config.final_fc:
+            feat = L.fc(feat,
+                        self.config.hidden_size,
+                        param_attr=F.ParamAttr(name=name + '_w'),
+                        bias_attr=F.ParamAttr(name=name + '_b'))
+
+        if self.config.final_l2_norm:
+            feat = L.l2_normalize(feat, axis=1)
+        return feat
+
+    def __call__(self, graph_wrappers):
+        inputs = self.build_inputs()
+        feature = inputs[-1]
+        features = self.gnn_layers(graph_wrappers, feature)
+        outputs = [self.take_final_feature(features[-1], i, "final_fc")
+                   for i in inputs[:-1]]
+        src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
+        outputs.append(src_real_index)
+        return inputs, outputs
+
+
+class ErnieSageModelV3(BaseGNNModel):
+    def gen_net_fn(self, config):
+        return ErnieSageV3(config)
diff --git a/examples/erniesage/models/message_passing.py b/examples/erniesage/models/message_passing.py
new file mode 100644
index 0000000000000000000000000000000000000000..4567bd694b123841c6b71b61a88e8dcbae8957b7
--- /dev/null
+++ b/examples/erniesage/models/message_passing.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as L
+
+
+def copy_send(src_feat, dst_feat, edge_feat):
+    """Send the source node feature as the message."""
+    return src_feat["h"]
+
+def weighted_copy_send(src_feat, dst_feat, edge_feat):
+    """Send the source node feature scaled by the edge weight."""
+    return src_feat["h"] * edge_feat["weight"]
+
+def mean_recv(feat):
+    """Aggregate received messages by mean pooling."""
+    return fluid.layers.sequence_pool(feat, pool_type="average")
+
+
+def sum_recv(feat):
+    """Aggregate received messages by sum pooling."""
+    return fluid.layers.sequence_pool(feat, pool_type="sum")
+
+
+def max_recv(feat):
+    """Aggregate received messages by max pooling."""
+    return fluid.layers.sequence_pool(feat, pool_type="max")
+
+
+def lstm_recv(feat):
+    """Aggregate received messages with an LSTM and take the last step."""
+    hidden_dim = 128
+    forward, _ = fluid.layers.dynamic_lstm(
+        input=feat, size=hidden_dim * 4, use_peepholes=False)
+    output = fluid.layers.sequence_last_step(forward)
+    return output
+
+
+def graphsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
+    """GraphSAGE layer with sum aggregation."""
+    msg = gw.send(copy_send, nfeat_list=[("h", feature)])
+    neigh_feature = gw.recv(msg, sum_recv)
+    self_feature = feature
+    self_feature = fluid.layers.fc(self_feature,
+                                   hidden_size,
+                                   act=act,
+                                   param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
+                                                              learning_rate=learning_rate),
+                                   )
+    neigh_feature = fluid.layers.fc(neigh_feature,
+                                    hidden_size,
+                                    act=act,
+                                    param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
+                                                               learning_rate=learning_rate),
+                                    )
+    output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+    output = fluid.layers.l2_normalize(output, axis=1)
+    return output
+
+
+def graphsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
+    """GraphSAGE layer with mean aggregation."""
+    msg = gw.send(copy_send, nfeat_list=[("h", feature)])
+    neigh_feature = gw.recv(msg, mean_recv)
+    self_feature = feature
+    self_feature = fluid.layers.fc(self_feature,
+                                   hidden_size,
+                                   act=act,
+                                   param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
+                                                              learning_rate=learning_rate),
+                                   )
+    neigh_feature = fluid.layers.fc(neigh_feature,
+                                    hidden_size,
+                                    act=act,
+                                    param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
+                                                               learning_rate=learning_rate),
+                                    )
+    output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+    output = fluid.layers.l2_normalize(output, axis=1)
+    return output
+
+
+def pinsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
+    """PinSAGE layer with edge-weighted mean aggregation."""
+    msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
+    neigh_feature = gw.recv(msg, mean_recv)
+    self_feature = feature
+    self_feature = fluid.layers.fc(self_feature,
+                                   hidden_size,
+                                   act=act,
+                                   param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
+                                                              learning_rate=learning_rate),
+                                   )
+    neigh_feature = fluid.layers.fc(neigh_feature,
+                                    hidden_size,
+                                    act=act,
+                                    param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
+                                                               learning_rate=learning_rate),
+                                    )
+    output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+    output = fluid.layers.l2_normalize(output, axis=1)
+    return output
+
+
+def pinsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
+    """PinSAGE layer with edge-weighted sum aggregation."""
+    msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
+    neigh_feature = gw.recv(msg, sum_recv)
+    self_feature = feature
+    self_feature = fluid.layers.fc(self_feature,
+                                   hidden_size,
+                                   act=act,
+                                   param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
+                                                              learning_rate=learning_rate),
+                                   )
+    neigh_feature = fluid.layers.fc(neigh_feature,
+                                    hidden_size,
+                                    act=act,
+                                    param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
+                                                               learning_rate=learning_rate),
+                                    )
+    output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
+    output = fluid.layers.l2_normalize(output, axis=1)
+    return output
diff --git a/examples/erniesage/models/model_factory.py b/examples/erniesage/models/model_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f69bb1f6932a219f4a41faee9cf5bf6c3f947a8
--- /dev/null
+++ b/examples/erniesage/models/model_factory.py
@@ -0,0 +1,24 @@
+from models.base import BaseGNNModel
+from models.ernie import ErnieModel
+from models.erniesage_v1 import ErnieSageModelV1
+from models.erniesage_v2 import ErnieSageModelV2
+from models.erniesage_v3 import ErnieSageModelV3
+
+class Model(object):
+    @classmethod
+    def factory(cls, config):
+        name = config.model_type
+        if name == "BaseGNNModel":
+            return BaseGNNModel(config)
+        if name == "ErnieModel":
+            return ErnieModel(config)
+        if name == "ErnieSageModelV1":
+            return ErnieSageModelV1(config)
+        if name == "ErnieSageModelV2":
+            return ErnieSageModelV2(config)
+        if name == "ErnieSageModelV3":
+            return ErnieSageModelV3(config)
+        else:
+            raise ValueError("Unknown model_type: %s" % name)
+
+
diff --git a/examples/erniesage/preprocessing/dump_graph.py b/examples/erniesage/preprocessing/dump_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..06281456c29a11704e921a99cdd80d3dabfa0c3c
--- /dev/null
+++ b/examples/erniesage/preprocessing/dump_graph.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+########################################################################
+#
+# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
+#
+# File: dump_graph.py
+# Author: suweiyue(suweiyue@baidu.com)
+# Date: 2020/03/01 22:17:13
+#
+########################################################################
+"""
+    Dump the input edge file into a PGL graph, an alias-sample table and node term-id features.
+"""
+from __future__ import division
+from __future__ import absolute_import
+from __future__ import print_function
+#from __future__ import unicode_literals
+
+import io
+import os
+import sys
+import argparse
+import logging
+import multiprocessing
+from functools import partial
+from io import open
+
+import numpy as np
+import tqdm
+import pgl
+from pgl.graph_kernel import alias_sample_build_table
+from pgl.utils.logger import log
+
+from tokenization import FullTokenizer
+
+
+def term2id(string, tokenizer, max_seqlen):
+    string = string.split("\t")[1]
+    tokens = tokenizer.tokenize(string)
+    ids = tokenizer.convert_tokens_to_ids(tokens)
+    ids = ids[:max_seqlen-1]
+    ids = ids + [2]  # ids + [sep]
+    ids = ids + [0] * (max_seqlen - len(ids))
+    return ids
+
+
+def dump_graph(args):
+    if not os.path.exists(args.outpath):
+        os.makedirs(args.outpath)
+    neg_samples = []
+    str2id = dict()
+    term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding)
+    terms = []
+    count = 0
+
+    with io.open(args.inpath, encoding=args.encoding) as f:
+        edges = []
+        for idx, line in enumerate(f):
+            if idx % 100000 == 0:
+                log.info("%s read %s lines" % (args.inpath, idx))
+            slots = []
+            for col_idx, col in enumerate(line.strip("\n").split("\t")):
+                s = col[:args.max_seqlen]
+                if s not in str2id:
+                    str2id[s] = count
+                    count += 1
+                    term_file.write(str(col_idx) + "\t" + col + "\n")
+
+                slots.append(str2id[s])
+
+            src = slots[0]
+            dst = slots[1]
+            neg_samples.append(slots[2:])
+            edges.append((src, dst))
+            edges.append((dst, src))
+
+    term_file.close()
+    edges = np.array(edges, dtype="int64")
+    num_nodes = len(str2id)
+    str2id.clear()
+    log.info("building graph...")
+    graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
+    indegree = graph.indegree()
+    graph.outdegree()
+    graph.dump(args.outpath)
+
+    # dump alias sample table
+    sqrt_indegree = np.sqrt(indegree)
+    distribution = 1. * sqrt_indegree / sqrt_indegree.sum()
+    alias, events = alias_sample_build_table(distribution)
+    np.save(os.path.join(args.outpath, "alias.npy"), alias)
+    np.save(os.path.join(args.outpath, "events.npy"), events)
+    np.save(os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples))
+    log.info("Build graph done.")
+
+def dump_id2str_map(args):
+    log.info("Dump id2str map starting...")
+    id2str = np.array([line.strip("\n") for line in open(os.path.join(args.outpath, "terms.txt"), "r", encoding=args.encoding)])
+    np.save(os.path.join(args.outpath, "id2str.npy"), id2str)
+    log.info("Dump id2str map done.")
+
+def dump_node_feat(args):
+    log.info("Dump node feat starting...")
+    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
+    pool = multiprocessing.Pool()
+    tokenizer = FullTokenizer(args.vocab_file)
+    term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
+    np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
+    log.info("Dump node feat done.")
+    pool.terminate()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='main')
+    parser.add_argument("-i", "--inpath", type=str, default=None)
+    parser.add_argument("-l", "--max_seqlen", type=int, default=30)
+    parser.add_argument("--vocab_file", type=str, default="./vocab.txt")
+    parser.add_argument("--encoding", type=str, default="utf8")
+    parser.add_argument("-o", "--outpath", type=str, default=None)
+    args = parser.parse_args()
+    dump_graph(args)
+    dump_id2str_map(args)
+    dump_node_feat(args)
diff --git a/examples/erniesage/preprocessing/tokenization.py b/examples/erniesage/preprocessing/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..975bb26a531e655bcfa4744f8ebc81fc01c68d9c
--- /dev/null
+++ b/examples/erniesage/preprocessing/tokenization.py
@@ -0,0 +1,461 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
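
Note: `dump_node_feat` above maps every node's text through `term2id`, using the BERT-style `FullTokenizer` defined in tokenization.py below. A minimal sketch of the id layout `term2id` produces — the toy tokenizer and id values here are illustrative, not real vocabulary entries:

    # Illustrative stand-in for term2id(): truncate, append [SEP] (id 2), right-pad with 0.
    def toy_term2id(line, tokenize, vocab, max_seqlen):
        ids = [vocab[t] for t in tokenize(line.split("\t")[1])]
        ids = ids[:max_seqlen - 1] + [2]
        return ids + [0] * (max_seqlen - len(ids))

    vocab = {"hello": 7, "world": 9}
    print(toy_term2id("0\thello world", lambda s: s.split(), vocab, max_seqlen=5))
    # -> [7, 9, 2, 0, 0]
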
+"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six +import sentencepiece as sp + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + fin = open(vocab_file, 'rb') + for num, line in enumerate(fin): + items = convert_to_unicode(line.strip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = items[1] if len(items) == 2 else num + token = token.strip() + vocab[token] = int(index) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"): + output = [] + for token in tokens: + if token in vocab: + output.append(vocab[token]) + else: + output.append(vocab[unk_token]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class CharTokenizer(object): + """Runs end-to-end tokenziation.""" + + def 
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        split_tokens = []
+        for token in text.lower().split(" "):
+            for sub_token in self.tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+        Args:
+            do_lower_case: Whether to lower case the input.
+        """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
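+        # For instance, ord(u"中") == 0x4E2D falls inside the first range below
+        # and is treated as CJK, while Hiragana u"か" (ord 0x304B) matches none
+        # of the ranges and is left to the normal tokenization path.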
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class SentencepieceTokenizer(object):
+    """Runs SentencePiece tokenization."""
+
+    def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.do_lower_case = do_lower_case
+        self.tokenizer = sp.SentencePieceProcessor()
+        self.tokenizer.Load(vocab_file + ".model")
+        self.sp_unk_token = "<unk>"
+        self.unk_token = unk_token
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+        text = text.lower() if self.do_lower_case else text
+        text = convert_to_unicode(text.replace("\1", " "))
+        tokens = self.tokenizer.EncodeAsPieces(text)
+
+        output_tokens = []
+        for token in tokens:
+            if token == self.sp_unk_token:
+                token = self.unk_token
+
+            if token in self.vocab:
+                output_tokens.append(token)
+            else:
+                output_tokens.append(self.unk_token)
+
+        return output_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class WordsegTokenizer(object):
+    """Runs Wordseg tokenization."""
+
+    def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]",
+                 split_token="\1"):
+        self.vocab = load_vocab(vocab_file)
+        self.inv_vocab = {v: k for k, v in self.vocab.items()}
+        self.tokenizer = sp.SentencePieceProcessor()
+        self.tokenizer.Load(vocab_file + ".model")
+
+        self.do_lower_case = do_lower_case
+        self.unk_token = unk_token
+        self.split_token = split_token
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+        text = text.lower() if self.do_lower_case else text
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in text.split(self.split_token):
+            if token in self.vocab:
+                output_tokens.append(token)
+            else:
+                sp_tokens = self.tokenizer.EncodeAsPieces(token)
+                for sp_token in sp_tokens:
+                    if sp_token in self.vocab:
+                        output_tokens.append(sp_token)
+        return output_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab(self.vocab, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab(self.inv_vocab, ids)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
diff --git a/examples/erniesage/train.py b/examples/erniesage/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..22bc5371ee7bbaf997e18f77180bedba18c30f69
--- /dev/null
+++ b/examples/erniesage/train.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
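
As a quick illustration of the greedy longest-match-first behaviour documented in `WordpieceTokenizer.tokenize` above — the three-entry vocab here is hypothetical, not from a real vocab.txt:

    from tokenization import WordpieceTokenizer  # the module above

    vocab = {"un": 0, "##aff": 1, "##able": 2}
    tokenizer = WordpieceTokenizer(vocab=vocab)
    print(tokenizer.tokenize("unaffable"))     # -> ['un', '##aff', '##able']
    print(tokenizer.tokenize("unaffordable"))  # -> ['[UNK]']: no full greedy cover,
                                               # so the whole word falls back to unk_token
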
+import os
+import argparse
+import traceback
+
+import yaml
+import numpy as np
+from easydict import EasyDict as edict
+from pgl.utils.logger import log
+from pgl.utils import paddle_helper
+
+from learner import Learner
+from models.model_factory import Model
+from dataset.graph_reader import GraphGenerator
+
+
+class TrainData(object):
+    def __init__(self, graph_path):
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        log.info("trainer_id: %s, trainer_count: %s." % (trainer_id, trainer_count))
+
+        edges = np.load(os.path.join(graph_path, "edges.npy"), allow_pickle=True)
+        # edges is bidirectional.
+        train_usr = edges[trainer_id::trainer_count, 0]
+        train_ad = edges[trainer_id::trainer_count, 1]
+        returns = {
+            "train_data": [train_usr, train_ad]
+        }
+
+        if os.path.exists(os.path.join(graph_path, "neg_samples.npy")):
+            neg_samples = np.load(os.path.join(graph_path, "neg_samples.npy"), allow_pickle=True)
+            if neg_samples.size != 0:
+                train_negs = neg_samples[trainer_id::trainer_count]
+                returns["train_data"].append(train_negs)
+        log.info("Load train_data done.")
+        self.data = returns
+
+    def __getitem__(self, index):
+        return [data[index] for data in self.data["train_data"]]
+
+    def __len__(self):
+        return len(self.data["train_data"][0])
+
+
+def main(config):
+    # Select Model
+    model = Model.factory(config)
+
+    # Build Train Edges
+    data = TrainData(config.graph_path)
+
+    # Build Train Data
+    train_iter = GraphGenerator(
+        graph_wrappers=model.graph_wrappers,
+        batch_size=config.batch_size,
+        data=data,
+        samples=config.samples,
+        num_workers=config.sample_workers,
+        feed_name_list=[var.name for var in model.feed_list],
+        use_pyreader=config.use_pyreader,
+        phase="train",
+        graph_data_path=config.graph_path,
+        shuffle=True)
+
+    log.info("build graph reader done.")
+
+    learner = Learner.factory(config.learner_type)
+    learner.build(model, train_iter, config)
+
+    learner.start()
+    learner.stop()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='main')
+    parser.add_argument("--conf", type=str, default="./config.yaml")
+    args = parser.parse_args()
+    config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
+    print(config)
+    main(config)
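
For reference, the sharding in `TrainData.__init__` above simply stripes the edge array across trainers. A small numpy sketch of the same indexing, with toy edges and two hypothetical trainers:

    import numpy as np

    # Toy stand-in for edges.npy: six directed (src, dst) edges.
    edges = np.array([[0, 1], [1, 0], [2, 3], [3, 2], [4, 5], [5, 4]])

    trainer_count = 2
    for trainer_id in range(trainer_count):
        # Every trainer_count-th row, starting at this trainer's offset.
        train_usr = edges[trainer_id::trainer_count, 0]
        train_ad = edges[trainer_id::trainer_count, 1]
        print(trainer_id, train_usr.tolist(), train_ad.tolist())
    # 0 [0, 2, 4] [1, 3, 5]
    # 1 [1, 3, 5] [0, 2, 4]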