Commit cc0a1a85 authored by Y Yelrose

Merge branch 'PaddlePaddle-develop'

# Distributed metapath2vec, metapath2vec++, multi-metapath2vec++ in PGL
[metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is an algorithmic framework for representation learning in heterogeneous networks, which contain multiple types of nodes and links. Given a heterogeneous graph, the metapath2vec algorithm first generates meta-path-based random walks and then uses a skip-gram model to learn node representations. Based on PGL, we reproduce the metapath2vec algorithm in distributed mode.
### Datasets
DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T), i.e. 37791 nodes in total.
You can download the datasets from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding).
We use the ```DBLP``` dataset as an example. After downloading it, place the files in, say, ```./data/DBLP/```.
### Dependencies
- paddlepaddle>=1.6
- pgl>=1.0.0
### How to run
Before training, run the command below to preprocess the data.
```sh
python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed
```
@@ -30,11 +30,21 @@ python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_pa
### Model Selection
There are actually three models in this example: ```metapath2vec```, ```metapath2vec++``` and ```multi_metapath2vec++```. You can select among them by modifying ```config.yaml```.
To run the ```metapath2vec++``` model, simply set the hyperparameter **neg_sample_type** to **m2v_plus**; the ```metapath2vec++``` model will then be selected.
```multi-metapath2vec++``` means that instead of using a single metapath, you can use several metapaths at the same time to train the model. For example, you might want to use ```c2p-p2a-a2p-p2c``` and ```p2a-a2p``` simultaneously. To do so, set the hyperparameters below in ```config.yaml``` (a loading sketch follows the list):
- **neg_sample_type**: "m2v_plus"
- **walk_mode**: "multi_m2v"
- **meta_path**: "c2p-p2a-a2p-p2c;p2a-a2p"
- **first_node_type**: "c;p"
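For reference, here is a minimal sketch of reading these settings back, assuming the config is loaded with PyYAML into an ```EasyDict``` as the scripts elsewhere in this commit do:

```python
import yaml
from easydict import EasyDict as edict

# Hypothetical usage sketch: load config.yaml with PyYAML + EasyDict.
config = edict(yaml.load(open("config.yaml"), Loader=yaml.FullLoader))

# "multi_m2v" walks several metapaths in parallel; paths are separated
# by ";" and each path needs its own starting node type.
assert config.walk_mode == "multi_m2v"
meta_paths = config.meta_path.split(";")              # ["c2p-p2a-a2p-p2c", "p2a-a2p"]
first_node_types = config.first_node_type.split(";")  # ["c", "p"]
```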
### Hyperparameters
All the hyperparameters are saved in the ```config.yaml``` file, so before training you can edit ```config.yaml``` to adjust them as you like.
Some important hyperparameters in ```config.yaml```:
- **edge_path**: the directory of the graph data that you want to load
- **lr**: learning rate
- **neg_num**: number of negative samples.
......
@@ -31,7 +31,7 @@ is_distributed: False
# training config
epochs: 10
optimizer: "sgd"
lr: 0.1
warm_start_from_dir: null
walkpath_files: "None"
train_files: "None"
......
@@ -87,9 +87,12 @@ class NodeGenerator(object):
idx = cc % num_n_type
n_type = n_type_list[idx]
try:
    nodes = next(node_generators[n_type])
except StopIteration as e:
    log.info("node type of %s iteration finished in one epoch" %
             (n_type))
    node_generators[n_type] = \
        self.graph.node_batch_iter(self.batch_size, n_type=n_type)
    break
yield (nodes, idx)
cc += 1
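For clarity, here is a self-contained sketch of the same pattern as the hunk above: cycle through one batch generator per node type and restart a generator once it raises ```StopIteration```. The helper name is hypothetical, and ```graph.node_batch_iter``` stands in for the PGL call used above:

```python
def round_robin_node_batches(graph, n_type_list, batch_size):
    """Yield (nodes, type_index) batches, alternating over node types."""
    node_generators = {
        n_type: graph.node_batch_iter(batch_size, n_type=n_type)
        for n_type in n_type_list
    }
    cc = 0
    while True:
        idx = cc % len(n_type_list)
        n_type = n_type_list[idx]
        try:
            nodes = next(node_generators[n_type])
        except StopIteration:
            # this node type finished one epoch: restart its iterator
            node_generators[n_type] = graph.node_batch_iter(
                batch_size, n_type=n_type)
            break  # as in the hunk above, end the pass once a type is exhausted
        yield nodes, idx
        cc += 1
```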
......
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
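This config is consumed by the Python entry points later in this commit. A minimal sketch, assuming the same ```yaml.load``` + ```EasyDict``` pattern as ```infer.py``` below, of how the nested ```ernie_config``` block becomes attribute-accessible:

```python
import yaml
from easydict import EasyDict as edict

config = edict(yaml.load(open("./config.yaml"), Loader=yaml.FullLoader))
print(config.model_type)                # "ErnieSageModelV1"
print(config.ernie_config.hidden_size)  # 768, the ERNIE encoder width
```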
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
    attention_probs_dropout_prob: 0.1
    hidden_act: "relu"
    hidden_dropout_prob: 0.1
    hidden_size: 768
    initializer_range: 0.02
    max_position_embeddings: 513
    num_attention_heads: 12
    num_hidden_layers: 12
    sent_type_vocab_size: 4
    task_type_vocab_size: 3
    vocab_size: 18000
    use_task_id: false
    use_fp16: false
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base DataLoader
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import six
from io import open
from collections import namedtuple
import numpy as np
import tqdm
import paddle
from pgl.utils import mp_reader
import collections
import time
from pgl.utils.logger import log
import traceback
if six.PY3:
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
def batch_iter(data, perm, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
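# Note on the two iterators above: the `cc % num_workers != fid` test shards
# batches deterministically across workers, so worker `fid` keeps every
# num_workers-th batch and the workers jointly cover the data exactly once
# per epoch (assuming every worker sees the same `perm`).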
class BaseDataGenerator(object):
"""Base Data Geneartor"""
def __init__(self, buf_size, batch_size, num_workers, shuffle=True):
self.num_workers = num_workers
self.batch_size = batch_size
self.line_examples = []
self.buf_size = buf_size
self.shuffle = shuffle
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
raise NotImplementedError("No defined Batch Fn")
def batch_iter(self, fid, perm):
""" batch iterator"""
if self.shuffle:
for batch in batch_iter(self, perm, self.batch_size, fid, self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid, self.num_workers):
yield batch
def __len__(self):
return len(self.line_examples)
def __getitem__(self, idx):
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.line_examples[idx]
def generator(self):
"""batch dict generator"""
def worker(filter_id, perm):
""" multiprocess worker"""
def func_run():
""" func_run """
pid = os.getpid()
np.random.seed(pid + int(time.time()))
for batch_examples in self.batch_iter(filter_id, perm):
try:
batch_dict = self.batch_fn(batch_examples)
except Exception as e:
traceback.print_exc()
log.info(traceback.format_exc())
log.info(str(e))
continue
if batch_dict is None:
continue
yield batch_dict
return func_run
# consume a seed
np.random.rand()
if self.shuffle:
perm = np.arange(0, len(self))
np.random.shuffle(perm)
else:
perm = None
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0, perm), self.buf_size)
else:
worker_pool = [worker(wid, perm) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
for line_example in self.line_examples:
yield line_example
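```BaseDataGenerator``` is meant to be subclassed; ```GraphGenerator``` below is the real example in this commit. As a minimal illustration only (the class and field names here are hypothetical), a subclass fills ```line_examples``` and implements ```batch_fn```:

```python
import numpy as np

class PairDataGenerator(BaseDataGenerator):
    def __init__(self, pairs, batch_size, num_workers):
        super(PairDataGenerator, self).__init__(
            buf_size=1000, batch_size=batch_size,
            num_workers=num_workers, shuffle=True)
        self.line_examples = pairs  # e.g. [(src, dst), ...]

    def batch_fn(self, batch_examples):
        # turn a list of raw examples into a feed dict
        return {
            "src": np.array([e[0] for e in batch_examples], dtype="int64"),
            "dst": np.array([e[1] for e in batch_examples], dtype="int64"),
        }
```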
"""Graph Dataset
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import pgl
import sys
import numpy as np
from pgl.utils.logger import log
from dataset.base_dataset import BaseDataGenerator
from pgl.sample import alias_sample
from pgl.sample import pinsage_sample
from pgl.sample import graphsage_sample
from pgl.sample import edge_hash
class GraphGenerator(BaseDataGenerator):
def __init__(self, graph_wrappers, data, batch_size, samples,
num_workers, feed_name_list, use_pyreader,
phase, graph_data_path, shuffle=True, buf_size=1000):
super(GraphGenerator, self).__init__(
buf_size=buf_size,
num_workers=num_workers,
batch_size=batch_size, shuffle=shuffle)
# For iteration
self.line_examples = data
self.graph_wrappers = graph_wrappers
self.samples = samples
self.feed_name_list = feed_name_list
self.use_pyreader = use_pyreader
self.phase = phase
self.load_graph(graph_data_path)
self.num_layers = len(graph_wrappers)
def load_graph(self, graph_data_path):
self.graph = pgl.graph.MemmapGraph(graph_data_path)
self.alias = np.load(os.path.join(graph_data_path, "alias.npy"), mmap_mode="r")
self.events = np.load(os.path.join(graph_data_path, "events.npy"), mmap_mode="r")
self.term_ids = np.load(os.path.join(graph_data_path, "term_ids.npy"), mmap_mode="r")
def batch_fn(self, batch_ex):
# batch_ex = [
# (src, dst, neg),
# (src, dst, neg),
# (src, dst, neg),
# ]
#
batch_src = []
batch_dst = []
batch_neg = []
for batch in batch_ex:
batch_src.append(batch[0])
batch_dst.append(batch[1])
if len(batch) == 3: # default neg samples
batch_neg.append(batch[2])
if len(batch_src) != self.batch_size:
if self.phase == "train":
return None #Skip
if len(batch_neg) > 0:
batch_neg = np.unique(np.concatenate(batch_neg))
batch_src = np.array(batch_src, dtype="int64")
batch_dst = np.array(batch_dst, dtype="int64")
sampled_batch_neg = alias_sample(batch_dst.shape, self.alias, self.events)
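# alias_sample above draws len(batch_dst) negative nodes, one per positive
# pair, in O(1) time each from the alias/events tables memory-mapped in
# load_graph; user-supplied negatives, if any, are concatenated with the
# sampled ones just below.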
if len(batch_neg) > 0:
batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
else:
batch_neg = sampled_batch_neg
if self.phase == "train":
ignore_edges = set()
else:
ignore_edges = set()
nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges)
feed_dict = {}
for i in range(self.num_layers):
feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))
# only reindex from first subgraph
sub_src_idx = subgraphs[0].reindex_from_parrent_nodes(batch_src)
sub_dst_idx = subgraphs[0].reindex_from_parrent_nodes(batch_dst)
sub_neg_idx = subgraphs[0].reindex_from_parrent_nodes(batch_neg)
feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
#feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]]
return feed_dict
def __call__(self):
return self.generator()
def generator(self):
try:
for feed_dict in super(GraphGenerator, self).generator():
if self.use_pyreader:
yield [feed_dict[name] for name in self.feed_name_list]
else:
yield feed_dict
except Exception as e:
log.exception(e)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import pickle
import time
import glob
import os
import io
import traceback
import pickle as pkl
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
import yaml
from easydict import EasyDict as edict
import pgl
from pgl.utils.logger import log
from pgl.utils import paddle_helper
import paddle
import paddle.fluid as F
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class PredictData(object):
def __init__(self, num_nodes):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
train_usr = np.arange(trainer_id, num_nodes, trainer_count)
#self.data = (train_usr, train_usr)
self.data = train_usr
def __getitem__(self, index):
return [self.data[index], self.data[index]]
def tostr(data_array):
return " ".join(["%.5lf" % d for d in data_array])
def run_predict(py_reader,
exe,
program,
model_dict,
log_per_step=1,
args=None):
if args.input_type == "text":
id2str = np.load(os.path.join(args.graph_path, "id2str.npy"), mmap_mode="r")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
if not os.path.exists(args.output_path):
os.mkdir(args.output_path)
fout = io.open("%s/part-%s" % (args.output_path, trainer_id), "w", encoding="utf8")
batch = 0
for batch_feed_dict in py_reader():
batch += 1
batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run(
program,
feed=batch_feed_dict,
fetch_list=model_dict.outputs)
if batch % log_per_step == 0:
log.info("Predict %s finished" % batch)
for ufs, _, sri in zip(batch_usr_feat, batch_ad_feat, batch_src_real_index):
if args.input_type == "text":
sri = id2str[int(sri)]
line = "{}\t{}\n".format(sri, tostr(ufs))
fout.write(line)
fout.close()
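# The helper below restores parameters selectively: its predicate accepts a
# persistable variable only when a file with the variable's name exists
# under `path`, so a partial checkpoint (e.g. ERNIE weights alone) can be
# warm-started without errors about missing variables.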
def _warmstart(exe, program, path='params'):
def _existed_persitables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
exe,
path,
main_program=program,
predicate=_existed_persitables
)
def main(config):
model = Model.factory(config)
if config.learner_type == "cpu":
place = F.CPUPlace()
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
else:
raise ValueError
exe = F.Executor(place)
val_program = F.default_main_program().clone(for_test=True)
exe.run(F.default_startup_program())
_warmstart(exe, F.default_startup_program(), path=config.infer_model)
num_threads = int(os.getenv("CPU_NUM", 1))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
build_strategy = F.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.remove_unnecessary_lock = False
build_strategy.memory_optimize = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
val_compiled_prog = F.compiler.CompiledProgram(
val_program).with_data_parallel(
build_strategy=build_strategy,
exec_strategy=exec_strategy)
num_nodes = int(np.load(os.path.join(config.graph_path, "num_nodes.npy")))
predict_data = PredictData(num_nodes)
predict_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.infer_batch_size,
data=predict_data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="predict",
graph_data_path=config.graph_path,
shuffle=False)
if config.learner_type == "cpu":
model.data_loader.decorate_batch_generator(
predict_iter, places=F.cpu_places())
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
model.data_loader.decorate_batch_generator(
predict_iter, places=place)
else:
raise ValueError
run_predict(model.data_loader,
program=val_compiled_prog,
exe=exe,
model_dict=model,
args=config)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
unset http_proxy https_proxy
set -x
mode=${1:-local}
config=${2:-"./config.yaml"}
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
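# parse_yaml above flattens the YAML config into shell variable assignments,
# joining nested keys with "_" (e.g. `learner_type: "cpu"` becomes
# learner_type="cpu"), and `eval $(parse_yaml $config)` below imports those
# assignments into this shell.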
eval $(parse_yaml $config)
export CPU_NUM=$CPU_NUM
export FLAGS_rpc_deadline=3000000
export FLAGS_rpc_retry_times=1000
if [[ $async_mode == "True" ]];then
echo "async_mode is True"
else
export FLAGS_communicator_send_queue_size=1
export FLAGS_communicator_min_send_grad_num_before_recv=0
export FLAGS_communicator_max_merge_var_num=1 # important!
export FLAGS_communicator_merge_sparse_grad=0
fi
export FLAGS_communicator_recv_wait_times=5000000
mkdir -p output
python ./train.py --conf $config
if [[ $TRAINING_ROLE == "TRAINER" ]];then
python ./infer.py --conf $config
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
from pgl.utils.logger import log
import paddle.fluid as F
import paddle.fluid.layers as L
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import StrategyFactory
from paddle.fluid.incubate.fleet.collective import DistributedStrategy
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.collective import fleet as cfleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as tfleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from tensorboardX import SummaryWriter
class Learner(object):
@classmethod
def factory(cls, name):
if name == "cpu":
return TranspilerLearner()
elif name == "gpu":
return CollectiveLearner()
else:
raise ValueError
def build(self, model, data_gen, config):
raise NotImplementedError
def warmstart(self, program, path='./checkpoints'):
def _existed_persitables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
self.exe,
path,
main_program=program,
predicate=_existed_persitables
)
def start(self):
batch = 0
start = time.time()
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
if trainer_id == 0:
writer = SummaryWriter(os.path.join(self.config.output_path, "train_history"))
for epoch_idx in range(self.config.epoch):
for idx, batch_feed_dict in enumerate(self.model.data_loader()):
try:
cpu_time = time.time()
batch += 1
batch_loss = self.exe.run(
self.program,
feed=batch_feed_dict,
fetch_list=[self.model.loss])
end = time.time()
if trainer_id == 0:
writer.add_scalar("loss", np.mean(batch_loss), batch)
if batch % self.config.log_per_step == 0:
log.info(
"Epoch %s Batch %s %s-Loss %s \t Speed(per batch) %.5lf/%.5lf sec"
% (epoch_idx, batch, "train", np.mean(batch_loss), (end - start) /batch, (end - cpu_time)))
writer.flush()
if batch % self.config.save_per_step == 0:
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, str(batch)))
except Exception as e:
log.info("Pyreader train error")
log.exception(e)
log.info("epcoh %s done." % epoch_idx)
def stop(self):
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, "last"))
class TranspilerLearner(Learner):
def __init__(self):
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
paddle_role = role_maker.Role.WORKER
place = F.CPUPlace()
if training_role == "PSERVER":
paddle_role = role_maker.Role.SERVER
# set the fleet runtime environment according to the configuration
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = eplist # ip:port,ip:port...
worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
role = role_maker.UserDefinedRoleMaker(
current_id=trainer_id,
role=paddle_role,
worker_num=worker_num,
server_endpoints=pserver_endpoints)
tfleet.init(role)
tfleet.save_on_pserver = True
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.init_and_run_ps_worker(config.ckpt_path)
self.program = self.complie_program(model.loss)
self.fleet = tfleet
model.data_loader.decorate_batch_generator(
data_gen, places=F.cpu_places())
self.config = config
self.model = model
def optimize(self, loss, optimizer_type, lr):
strategy = DistributeTranspilerConfig()
strategy.sync_mode = False
log.info('learning rate:%f' % lr)
if optimizer_type == "sgd":
optimizer = F.optimizer.SGD(learning_rate=lr)
elif optimizer_type == "adam":
# Don't slice the tensor, to ensure convergence
optimizer = F.optimizer.Adam(learning_rate=lr, lazy_mode=True)
else:
raise ValueError("Unknown Optimizer %s" % optimizer_type)
# create the DistributeTranspiler configuration
optimizer = tfleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
def init_and_run_ps_worker(self, ckpt_path):
# init and run server or worker
self.exe = F.Executor(F.CPUPlace())
if tfleet.is_server():
tfleet.init_server()
self.warmstart(tfleet.startup_program, path=ckpt_path)
tfleet.run_server()
exit()
if tfleet.is_worker():
log.info("start init worker done")
tfleet.init_worker()
self.exe.run(tfleet.startup_program)
def complie_program(self, loss):
num_threads = int(os.getenv("CPU_NUM", 1))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
exec_strategy.use_thread_barrier = False
build_strategy = F.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True
build_strategy.remove_unnecessary_lock = False
build_strategy.memory_optimize = False
build_strategy.async_mode = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
log.info("start build compile program...")
compiled_prog = F.compiler.CompiledProgram(tfleet.main_program
).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_prog
class CollectiveLearner(Learner):
def __init__(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
cfleet.init(role)
def optimize(self, loss, optimizer_type, lr):
optimizer = F.optimizer.Adam(learning_rate=lr)
dist_strategy = DistributedStrategy()
optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
_, param_grads = optimizer.minimize(loss, F.default_startup_program())
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.program = cfleet.main_program
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
self.exe = F.Executor(place)
self.exe.run(F.default_startup_program())
self.warmstart(F.default_startup_program(), config.ckpt_path)
self.fleet = cfleet
model.data_loader.decorate_batch_generator(
data_gen, places=place)
self.config = config
self.model = model
#!/bin/bash
set -x
config=${1:-"./config.yaml"}
unset http_proxy https_proxy
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
transpiler_local_train(){
export PADDLE_TRAINERS_NUM=1
export PADDLE_PSERVERS_NUM=1
export PADDLE_PORT=6206
export PADDLE_PSERVERS="127.0.0.1"
export BASE="./local_dir"
echo `which python`
if [ -d ${BASE} ]; then
rm -rf ${BASE}
fi
mkdir ${BASE}
rm job_id
for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
do
echo "start ps server: ${i}"
TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh local $config \
&> $BASE/pserver.$i.log &
echo $! >> job_id
done
sleep 3s
for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
do
echo "start ps work: ${j}"
TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \
echo $! >> job_id
done
}
collective_local_train(){
export PATH=./python27-gcc482-gpu/bin/:$PATH
echo `which python`
python -m paddle.distributed.launch train.py --conf $config
python -m paddle.distributed.launch infer.py --conf $config
}
eval $(parse_yaml $config)
unalias python
python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \
-l $max_seqlen --vocab_file $ernie_vocab_file
if [[ $learner_type == "cpu" ]];then
transpiler_local_train
fi
if [[ $learner_type == "gpu" ]];then
collective_local_train
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import glob
import os
import numpy as np
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models import message_passing
def get_layer(layer_type, gw, feature, hidden_size, act, initializer, learning_rate, name, is_test=False):
return getattr(message_passing, layer_type)(gw, feature, hidden_size, act, initializer, learning_rate, name)
class BaseGraphWrapperBuilder(object):
def __init__(self, config):
self.config = config
self.node_feature_info = []
self.edge_feature_info = []
def __call__(self):
place = F.CPUPlace()
graph_wrappers = []
for i in range(self.config.num_layers):
# all graphs share the same node_feat_info
graph_wrappers.append(
pgl.graph_wrapper.GraphWrapper(
"layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
return graph_wrappers
class GraphsageGraphWrapperBuilder(BaseGraphWrapperBuilder):
def __init__(self, config):
super(GraphsageGraphWrapperBuilder, self).__init__(config)
self.node_feature_info.append(('index', [None], np.dtype('int64')))
class BaseGNNModel(object):
def __init__(self, config):
self.config = config
self.graph_wrapper_builder = self.gen_graph_wrapper_builder(config)
self.net_fn = self.gen_net_fn(config)
self.feed_list_builder = self.gen_feed_list_builder(config)
self.data_loader_builder = self.gen_data_loader_builder(config)
self.loss_fn = self.gen_loss_fn(config)
self.build()
def gen_graph_wrapper_builder(self, config):
return GraphsageGraphWrapperBuilder(config)
def gen_net_fn(self, config):
return BaseNet(config)
def gen_feed_list_builder(self, config):
return BaseFeedListBuilder(config)
def gen_data_loader_builder(self, config):
return BaseDataLoaderBuilder(config)
def gen_loss_fn(self, config):
return BaseLoss(config)
def build(self):
self.graph_wrappers = self.graph_wrapper_builder()
self.inputs, self.outputs = self.net_fn(self.graph_wrappers)
self.feed_list = self.feed_list_builder(self.inputs, self.graph_wrappers)
self.data_loader = self.data_loader_builder(self.feed_list)
self.loss = self.loss_fn(self.outputs)
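# build() wires the factories together in a fixed order: graph wrappers ->
# network inputs/outputs -> feed list -> PyReader data loader -> loss, so
# concrete models (e.g. the Ernie subclasses below) only override the
# gen_*_fn hooks rather than build() itself.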
class BaseFeedListBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, inputs, graph_wrappers):
feed_list = []
for i in range(len(graph_wrappers)):
feed_list.extend(graph_wrappers[i].holder_list)
feed_list.extend(inputs)
return feed_list
class BaseDataLoaderBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, feed_list):
data_loader = F.io.PyReader(
feed_list=feed_list, capacity=20, use_double_buffer=True, iterable=True)
return data_loader
class BaseNet(object):
def __init__(self, config):
self.config = config
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def build_inputs(self):
user_index = L.data(
"user_index", shape=[None], dtype="int64", append_batch_size=False)
item_index = L.data(
"item_index", shape=[None], dtype="int64", append_batch_size=False)
return [user_index, item_index]
def build_embedding(self, graph_wrappers, inputs=None):
num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
is_sparse = self.config.trainer_type == "Transpiler"
embed = L.embedding(
input=L.reshape(graph_wrappers[0].node_feat['index'], [-1, 1]),
size=[num_embed, self.config.hidden_size],
is_sparse=is_sparse,
param_attr=F.ParamAttr(name="node_embedding", initializer=F.initializer.Uniform(
low=-1. / self.config.hidden_size,
high=1. / self.config.hidden_size)))
return embed
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = get_layer(
self.config.layer_type,
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs)
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class BaseLoss(object):
def __init__(self, config):
self.config = config
def __call__(self, outputs):
user_feat, item_feat = outputs[0], outputs[1]
loss_type = self.config.loss_type
# Calc Loss
if self.config.loss_type == "hinge":
pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
neg = L.matmul(user_feat, item_feat, transpose_y=True) # [B, B]
loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
elif self.config.loss_type == "softmax":
pass
# TODO
# pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
# neg = L.matmul(user_feat, neg_feat, transpose_y=True) # [B, B]
# logits = L.concat([pos, neg], -1) # [B, 1+B]
# labels = L.fill_constant_batch_size_like(logits, [-1, 1], "int64", 0)
# loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, labels))
else:
raise ValueError
return loss
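# In the hinge branch above, pos holds the per-row positive scores
# u_i . v_i with shape [B, 1] and neg the full score matrix u_i . v_j with
# shape [B, B], so every other item in the batch acts as a negative; a
# column is penalized whenever it scores within `margin` of the row's
# positive (the diagonal contributes a constant `margin` term).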
"""Ernie
"""
from models.base import BaseNet, BaseGNNModel
class Ernie(BaseNet):
def build_inputs(self):
inputs = super(Ernie, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
ernie = ErnieModel(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = [feature]
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieModel(BaseGNNModel):
def gen_net_fn(self, config):
return Ernie(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import json
import six
import logging
import paddle.fluid as fluid
import paddle.fluid.layers as L
from io import open
from models.ernie_model.transformer_encoder import encoder, pre_process_layer
from models.ernie_model.transformer_encoder import graph_encoder
log = logging.getLogger(__name__)
class ErnieConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path, 'r', encoding='utf8') as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict.get(key, None)
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class ErnieModel(object):
def __init__(self,
src_ids,
sentence_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
name=""):
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _debug_summary(self, input_mask):
#histogram
seqlen_before_pad = L.cast(
L.reduce_sum(
input_mask, dim=1), dtype='float32')
seqlen_after_pad = L.reduce_sum(
L.cast(
L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1)
pad_num = seqlen_after_pad - seqlen_before_pad
pad_rate = pad_num / seqlen_after_pad
def _build_position_ids(self, src_ids):
d_shape = L.shape(src_ids)
d_seqlen = d_shape[1]
d_batch = d_shape[0]
position_ids = L.reshape(
L.range(
0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1],
inplace=True)
position_ids = L.expand(position_ids, [d_batch, 1, 1])
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_input_mask(self, src_ids):
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.logical_not(L.equal(src_ids,
zero)) # assume pad id == 0
input_mask = L.cast(input_mask, 'float')
input_mask.stop_gradient = True
return input_mask
def _set_config(self, config, name, weight_sharing):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config.get('sent_type_vocab_size'):
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._use_task_id = config['use_task_id']
if self._use_task_id:
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._postprocess_cmd = config.get('postprocess_cmd', 'dan')
self._preprocess_cmd = config.get('preprocess_cmd', '')
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self.name = name
self._word_emb_name = self.name + "word_embedding"
self._pos_emb_name = self.name + "pos_embedding"
self._sent_emb_name = self.name + "sent_embedding"
self._task_emb_name = self.name + "task_embedding"
self._dtype = "float16" if config['use_fp16'] else "float32"
self._emb_dtype = "float32"
# Initialize all weights with a truncated normal initializer; all biases
# will be initialized to constant zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
self.all_emb = [emb_out, position_emb_out, sent_emb_out]
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
if self._use_task_id:
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out,
'nd',
self._prepostprocess_dropout,
name=self.name + 'pre_encoder')
if self._dtype == "float16":
emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
return emb_out
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = self._enc_out[:, 0, :]
#next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=self.name + "pooled_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + "pooled_fc.b_0")
return next_sent_feat
def get_lm_output(self, mask_label, mask_pos):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
self.next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = fluid.layers.layer_norm(
mask_trans_feat,
begin_norm_axis=len(mask_trans_feat.shape) - 1,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
# transform: layer norm
#mask_trans_feat = pre_process_layer(
# mask_trans_feat, 'n', name=self.name + 'mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._emb_dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
return mask_lm_loss
def get_task_output(self, task, task_labels):
task_fc_out = fluid.layers.fc(
input=self.next_sent_feat,
size=task["num_labels"],
param_attr=fluid.ParamAttr(
name=self.name + task["task_name"] + "_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + task["task_name"] + "_fc.b_0")
task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
logits=task_fc_out, label=task_labels, return_softmax=True)
task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
return task_loss, task_acc
class ErnieGraphModel(ErnieModel):
def __init__(self,
src_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
slot_seqlen=40,
name=""):
self.slot_seqlen = slot_seqlen
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
sentence_ids = self._build_sentence_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _build_position_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
num_b = (src_seqlen / slot_seqlen) - 1
a_position_ids = L.reshape(
L.range(
0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen * num_b, 1]
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32") # assume pad id == 0 [B, slot_seqlen, 1]
a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1]
b_position_ids = L.reshape(
L.range(
slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1]) # [B, slot_seqlen * num_b, 1]
b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1]
position_ids = L.concat([a_position_ids, b_position_ids], 1)
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_sentence_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
zeros = L.zeros([src_batch, slot_seqlen, 1], "int64")
ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64")
sentence_ids = L.concat([zeros, ones], 1)
sentence_ids.stop_gradient = True
return sentence_ids
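# The sentence ids built above assign segment 0 to the first slot_seqlen
# positions (the first text slot) and segment 1 to all remaining positions,
# following ERNIE's two-segment input convention.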
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
slot_seqlen=self.slot_seqlen,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import numpy as np
from contextlib import contextmanager
import paddle.fluid as fluid
import paddle.fluid.layers as L
import paddle.fluid.layers as layers
#import propeller.paddle as propeller
#from propeller import log
# determine this at the beginning
to_3d = lambda a: a # will change later
to_2d = lambda a: a
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logits before
computing the softmax activation to mask certain selected positions so
that they will not be considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=len(queries.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=len(keys.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=len(values.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of input tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
#trans_x.desc.set_shape((-1, 1, n_head, d_value))
return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
#return out, product
return out, weights
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
q = to_3d(q)
k = to_3d(k)
v = to_3d(v)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads, ctx_multiheads_attn = scaled_dot_product_attention(
q, k, v, attn_bias, d_key, dropout_rate)
out = __combine_heads(ctx_multiheads)
out = to_2d(out)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out, ctx_multiheads_attn
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=len(x.shape) - 1,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=len(hidden.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out,
out,
process_cmd,
dropout_rate=0.,
name=''):
"""
Optionally add residual connection, layer normalization and dropout to
the out tensor according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of multi-head (self) attention followed by a
position-wise feed-forward network; both components are wrapped with
pre/post_process_layer to add residual connections, layer normalization
and dropout.
"""
attn_output, ctx_multiheads_attn = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
ret = post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
return ret, ctx_multiheads_attn, ffd_output
def build_pad_idx(input_mask):
pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool'))
return pad_idx
def build_attn_bias(input_mask, n_head, dtype):
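"""Turn a [batch, seq, 1] padding mask into an additive attention bias of
shape [batch, n_head, seq, seq]: positions where either token is padding
receive a -10000 bias so softmax gives them ~zero weight."""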
attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
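"""Build a slot-wise attention bias: the sequence is split into slots of
length slot_seqlen; tokens in the first slot may attend to every position,
while tokens in each later slot may attend only to the first slot and to
their own slot. Combined with the padding mask, masked positions receive
a -10000 bias before softmax."""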
input_shape = L.shape(input_mask)
input_batch = input_shape[0]
input_seqlen = input_shape[1]
num_slot = input_seqlen / slot_seqlen
num_b = num_slot - 1
ones = L.ones([num_b], dtype="float32") # [num_b]
diag_ones = L.diag(ones) # [num_b, num_b]
diag_ones = L.unsqueeze(diag_ones, [1, -1]) # [num_b, 1, num_b, 1]
diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen]) # [num_b, seqlen, num_b, seqlen]
diag_ones = L.reshape(diag_ones, [1, num_b*slot_seqlen, num_b*slot_seqlen]) # [1, num_b*seqlen, num_b*seqlen]
graph_attn_bias = L.concat([L.ones([1, num_b*slot_seqlen, slot_seqlen], dtype="float32"), diag_ones], 2)
graph_attn_bias = L.concat([L.ones([1, slot_seqlen, num_slot*slot_seqlen], dtype="float32"), graph_attn_bias], 1) # [1, seq, seq]
pad_attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = graph_attn_bias * pad_attn_bias
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
def encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype)
# to_2d gathers the non-padding positions (selected by pad_idx) into a
# 2-D tensor; to_3d scatters them back to [batch, seq, d_model].
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
#enc_output.desc.set_shape((-1, 1, final_dim))
return enc_output, all_hidden, all_attn, all_ffn
def graph_encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
slot_seqlen=40,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen)
# to_2d/to_3d: the same gather/scatter over non-padding positions as in
# encoder() above.
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
#enc_output.desc.set_shape((-1, 1, final_dim))
return enc_output, all_hidden, all_attn, all_ffn
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
class ErnieSageV1(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV1, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
ernie = ErnieModel(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV1(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV1(config)
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
class ErnieSageV2(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV2, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_send(src_feat, dst_feat, edge_feat):
"""doc"""
cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1)
src_ids = L.concat([cls, src_feat["term_ids"]], 1)
dst_ids = dst_feat["term_ids"]
sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
term_ids = L.concat([src_ids, dst_ids], 1)
term_ids.stop_gradient = True
sent_ids.stop_gradient = True
ernie = ErnieModel(
term_ids, sent_ids,
config=self.config.ernie_config)
feature = ernie.get_pooled_output()
return feature
def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
feature = L.unsqueeze(feature, [-1])
msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)])
neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))
term_ids = feature
cls = L.fill_constant_batch_size_like(term_ids, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, term_ids], 1)
term_ids.stop_gradient = True
ernie = ErnieModel(
term_ids, L.zeros_like(term_ids),
config=self.config.ernie_config)
self_feature = ernie.get_pooled_output()
self_feature = L.fc(self_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=learning_rate),
)
neigh_feature = L.fc(neigh_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_r",
learning_rate=learning_rate),
)
output = L.concat([self_feature, neigh_feature], axis=1)
output = L.l2_normalize(output, axis=1)
return output
return erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % ("erniesage_v2", i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV2(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV2(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
from models.message_passing import copy_send
class ErnieSageV3(BaseNet):
def __init__(self, config):
super(ErnieSageV3, self).__init__(config)
self.config.layer_type = "ernie_recv_sum"
def build_inputs(self):
inputs = super(ErnieSageV3, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_recv(feat):
"""doc"""
# TODO maxlen 400
#pad_value = L.cast(L.assign(input=np.array([0], dtype=np.int32)), "int64")
pad_value = L.zeros([1], "int64")
out, _ = L.sequence_pad(feat, pad_value=pad_value, maxlen=10)
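# out: [num_nodes, 10, max_seqlen]; the reshape below assumes
# max_seqlen == 40, flattening the 10 padded neighbors into 400 ids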
out = L.reshape(out, [0, 400])
return out
def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, ernie_recv)
neigh_feature = L.cast(L.unsqueeze(neigh_feature, [-1]), "int64")
feature = L.unsqueeze(feature, [-1])
cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, feature[:, :-1], neigh_feature], 1)
term_ids.stop_gradient = True
return term_ids
return erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
ernie_config = self.config.ernie_config
ernie = ErnieGraphModel(
src_ids=feat,
config=ernie_config,
slot_seqlen=self.config.max_seqlen,
name="student_")
feat = ernie.get_pooled_output()
fc_lr = self.config.lr / 0.001
feat= L.fc(feat,
self.config.hidden_size,
act="relu",
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=fc_lr),
)
feat = L.l2_normalize(feat, axis=1)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV3(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV3(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as L
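# PGL message passing: gw.send(msg_fn, ...) evaluates the message function
# on every edge, and gw.recv(msg, reduce_fn) aggregates each destination
# node's incoming messages (here via sequence_pool or an LSTM).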
def copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"]
def weighted_copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"] * edge_feat["weight"]
def mean_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="average")
def sum_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
def max_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="max")
def lstm_recv(feat):
"""doc"""
hidden_dim = 128
forward, _ = fluid.layers.dynamic_lstm(
input=feat, size=hidden_dim * 4, use_peepholes=False)
output = fluid.layers.sequence_last_step(forward)
return output
def graphsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def graphsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
from models.base import BaseGNNModel
from models.ernie import ErnieModel
from models.erniesage_v1 import ErnieSageModelV1
from models.erniesage_v2 import ErnieSageModelV2
from models.erniesage_v3 import ErnieSageModelV3
class Model(object):
@classmethod
def factory(cls, config):
name = config.model_type
if name == "BaseGNNModel":
return BaseGNNModel(config)
if name == "ErnieModel":
return ErnieModel(config)
if name == "ErnieSageModelV1":
return ErnieSageModelV1(config)
if name == "ErnieSageModelV2":
return ErnieSageModelV2(config)
if name == "ErnieSageModelV3":
return ErnieSageModelV3(config)
raise ValueError("unknown model_type: %s" % name)
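The factory dispatches on the `model_type` field of the YAML config that the training script below loads. A minimal sketch of such a config, with illustrative values only (the field names are the `config.*` attributes read by the models and the training script; the valid `learner_type` values depend on the Learner implementation, which is not shown here):
```yaml
model_type: "ErnieSageModelV2"  # class chosen by Model.factory
graph_path: "./workdir/graph"   # output directory of dump_graph.py below
batch_size: 32
samples: [10]                   # neighbors sampled per GNN layer
sample_workers: 2
use_pyreader: true
learner_type: "cpu"             # assumption: depends on Learner.factory
hidden_size: 128
num_layers: 1
max_seqlen: 40
lr: 0.00005
```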
#!/usr/bin/env python
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
# File: dump_graph.py
# Author: suweiyue(suweiyue@baidu.com)
# Date: 2020/03/01 22:17:13
#
########################################################################
"""
Dump a text graph into PGL format: graph structure, alias sampling tables, and tokenized node features.
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
#from __future__ import unicode_literals
import io
import os
import sys
import argparse
import logging
import multiprocessing
from functools import partial
from io import open
import numpy as np
import tqdm
import pgl
from pgl.graph_kernel import alias_sample_build_table
from pgl.utils.logger import log
from tokenization import FullTokenizer
def term2id(string, tokenizer, max_seqlen):
string = string.split("\t")[1]
tokens = tokenizer.tokenize(string)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids = ids[:max_seqlen-1]
ids = ids + [2] # ids + [sep]
ids = ids + [0] * (max_seqlen - len(ids))
return ids
def dump_graph(args):
if not os.path.exists(args.outpath):
os.makedirs(args.outpath)
neg_samples = []
str2id = dict()
term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding)
terms = []
count = 0
with io.open(args.inpath, encoding=args.encoding) as f:
edges = []
for idx, line in enumerate(f):
if idx % 100000 == 0:
log.info("%s readed %s lines" % (args.inpath, idx))
slots = []
for col_idx, col in enumerate(line.strip("\n").split("\t")):
s = col[:args.max_seqlen]
if s not in str2id:
str2id[s] = count
count += 1
term_file.write(str(col_idx) + "\t" + col + "\n")
slots.append(str2id[s])
src = slots[0]
dst = slots[1]
neg_samples.append(slots[2:])
edges.append((src, dst))
edges.append((dst, src))
term_file.close()
edges = np.array(edges, dtype="int64")
num_nodes = len(str2id)
str2id.clear()
log.info("building graph...")
graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
indegree = graph.indegree()
graph.outdegree()
graph.dump(args.outpath)
# dump alias sample table
sqrt_indegree = np.sqrt(indegree)
distribution = 1. * sqrt_indegree / sqrt_indegree.sum()
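# build alias tables for O(1) sampling from this sqrt-degree-weighted distribution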
alias, events = alias_sample_build_table(distribution)
np.save(os.path.join(args.outpath, "alias.npy"), alias)
np.save(os.path.join(args.outpath, "events.npy"), events)
np.save(os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples))
log.info("End Build Graph")
def dump_id2str_map(args):
log.info("Dump id2str map starting...")
id2str = np.array([line.strip("\n") for line in open(os.path.join(args.outpath, "terms.txt"), "r", encoding=args.encoding)])
np.save(os.path.join(args.outpath, "id2str.npy"), id2str)
log.info("Dump id2str map done.")
def dump_node_feat(args):
log.info("Dump node feat starting...")
id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
pool = multiprocessing.Pool()
tokenizer = FullTokenizer(args.vocab_file)
term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
log.info("Dump node feat done.")
pool.terminate()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("-i", "--inpath", type=str, default=None)
parser.add_argument("-l", "--max_seqlen", type=int, default=30)
parser.add_argument("--vocab_file", type=str, default="./vocab.txt")
parser.add_argument("--encoding", type=str, default="utf8")
parser.add_argument("-o", "--outpath", type=str, default=None)
args = parser.parse_args()
dump_graph(args)
dump_id2str_map(args)
dump_node_feat(args)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import sentencepiece as sp
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file, 'rb')
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"):
output = []
for token in tokens:
if token in vocab:
output.append(vocab[token])
else:
output.append(vocab[unk_token])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class SentencepieceTokenizer(object):
"""Runs SentencePiece tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.do_lower_case = do_lower_case
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.sp_unk_token = "<unk>"
self.unk_token = unk_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text.replace("\1", " "))
tokens = self.tokenizer.EncodeAsPieces(text)
output_tokens = []
for token in tokens:
if token == self.sp_unk_token:
token = self.unk_token
if token in self.vocab:
output_tokens.append(token)
else:
output_tokens.append(self.unk_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordsegTokenizer(object):
"""Runs Wordseg tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]",
split_token="\1"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.do_lower_case = do_lower_case
self.unk_token = unk_token
self.split_token = split_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text)
output_tokens = []
for token in text.split(self.split_token):
if token in self.vocab:
output_tokens.append(token)
else:
sp_tokens = self.tokenizer.EncodeAsPieces(token)
for sp_token in sp_tokens:
if sp_token in self.vocab:
output_tokens.append(sp_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import traceback
import yaml
import numpy as np
from easydict import EasyDict as edict
from pgl.utils.logger import log
from pgl.utils import paddle_helper
from learner import Learner
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class TrainData(object):
def __init__(self, graph_path):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
log.info("trainer_id: %s, trainer_count: %s." % (trainer_id, trainer_count))
edges = np.load(os.path.join(graph_path, "edges.npy"), allow_pickle=True)
# edges is bidirectional.
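# shard edges across trainers: trainer i takes every trainer_count-th
# edge starting at offset i, so trainers train on disjoint slices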
train_usr = edges[trainer_id::trainer_count, 0]
train_ad = edges[trainer_id::trainer_count, 1]
returns = {
"train_data": [train_usr, train_ad]
}
if os.path.exists(os.path.join(graph_path, "neg_samples.npy")):
neg_samples = np.load(os.path.join(graph_path, "neg_samples.npy"), allow_pickle=True)
if neg_samples.size != 0:
train_negs = neg_samples[trainer_id::trainer_count]
returns["train_data"].append(train_negs)
log.info("Load train_data done.")
self.data = returns
def __getitem__(self, index):
return [ data[index] for data in self.data["train_data"]]
def __len__(self):
return len(self.data["train_data"][0])
def main(config):
# Select Model
model = Model.factory(config)
# Build Train Edges
data = TrainData(config.graph_path)
# Build Train Data
train_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.batch_size,
data=data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="train",
graph_data_path=config.graph_path,
shuffle=True)
log.info("build graph reader done.")
learner = Learner.factory(config.learner_type)
learner.build(model, train_iter, config)
learner.start()
learner.stop()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the dataset for GIN model.
"""
import os
import sys
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pgl
from pgl.utils.logger import log
def fold10_split(dataset, fold_idx=0, seed=0, shuffle=True):
"""10 fold splitter"""
assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
labels = []
for i in range(len(dataset)):
g, c = dataset[i]
labels.append(c)
idx_list = []
for idx in skf.split(np.zeros(len(labels)), labels):
idx_list.append(idx)
train_idx, valid_idx = idx_list[fold_idx]
log.info("train_set : test_set == %d : %d" %
(len(train_idx), len(valid_idx)))
return Subset(dataset, train_idx), Subset(dataset, valid_idx)
def random_split(dataset, split_ratio=0.7, seed=0, shuffle=True):
"""random splitter"""
np.random.seed(seed)
indices = list(range(len(dataset)))
np.random.shuffle(indices)
split = int(split_ratio * len(dataset))
train_idx, valid_idx = indices[:split], indices[split:]
log.info("train_set : test_set == %d : %d" %
(len(train_idx), len(valid_idx)))
return Subset(dataset, train_idx), Subset(dataset, valid_idx)
class BaseDataset(object):
"""BaseDataset"""
def __init__(self):
pass
def __getitem__(self, idx):
"""getitem"""
raise NotImplementedError
def __len__(self):
"""len"""
raise NotImplementedError
class Subset(BaseDataset):
"""
Subset of a dataset at specified indices.
"""
def __init__(self, dataset, indices):
self.dataset = dataset
self.indices = indices
def __getitem__(self, idx):
"""getitem"""
return self.dataset[self.indices[idx]]
def __len__(self):
"""len"""
return len(self.indices)
class GINDataset(BaseDataset):
"""Dataset for Graph Isomorphism Network (GIN)
Adapted from https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip.
"""
def __init__(self,
data_path,
dataset_name,
self_loop,
degree_as_nlabel=False):
self.data_path = data_path
self.dataset_name = dataset_name
self.self_loop = self_loop
self.degree_as_nlabel = degree_as_nlabel
self.graph_list = []
self.glabel_list = []
# relabel
self.glabel_dict = {}
self.nlabel_dict = {}
self.elabel_dict = {}
self.ndegree_dict = {}
# global num
self.num_graph = 0 # total graphs number
self.n = 0 # total nodes number
self.m = 0 # total edges number
# global num of classes
self.gclasses = 0
self.nclasses = 0
self.eclasses = 0
self.dim_nfeats = 0
# flags
self.degree_as_nlabel = degree_as_nlabel
self.nattrs_flag = False
self.nlabels_flag = False
self._load_data()
def __len__(self):
"""return the number of graphs"""
return len(self.graph_list)
def __getitem__(self, idx):
"""getitem"""
return self.graph_list[idx], self.glabel_list[idx]
def _load_data(self):
"""Loads dataset
"""
filename = os.path.join(self.data_path, self.dataset_name,
"%s.txt" % self.dataset_name)
log.info("loading data from %s" % filename)
with open(filename, 'r') as reader:
# first line --> N, means total number of graphs
self.num_graph = int(reader.readline().strip())
for i in range(self.num_graph):
if (i + 1) % int(self.num_graph / 10) == 0:
log.info("processing graph %s" % (i + 1))
graph = dict()
# second line --> [num_node, label]
# means [node number of a graph, class label of a graph]
grow = reader.readline().strip().split()
n_nodes, glabel = [int(w) for w in grow]
# relabel graphs
if glabel not in self.glabel_dict:
mapped = len(self.glabel_dict)
self.glabel_dict[glabel] = mapped
graph['num_nodes'] = n_nodes
self.glabel_list.append(self.glabel_dict[glabel])
nlabels = []
node_features = []
num_edges = 0
edges = []
for j in range(graph['num_nodes']):
slots = reader.readline().strip().split()
# handle edges and node feature(if has)
tmp = int(slots[
1]) + 2 # tmp == 2 + num_edges of current node
if tmp == len(slots):
# no node feature
nrow = [int(w) for w in slots]
nfeat = None
elif tmp < len(slots):
nrow = [int(w) for w in slots[:tmp]]
nfeat = [float(w) for w in slots[tmp:]]
node_features.append(nfeat)
else:
raise Exception('edge number is not correct!')
# relabel nodes if the graph has node labels
# if it doesn't have node labels, then every nrow[0] == 0
if not nrow[0] in self.nlabel_dict:
mapped = len(self.nlabel_dict)
self.nlabel_dict[nrow[0]] = mapped
nlabels.append(self.nlabel_dict[nrow[0]])
num_edges += nrow[1]
edges.extend([(j, u) for u in nrow[2:]])
if self.self_loop:
num_edges += 1
edges.append((j, j))
if node_features != []:
node_features = np.stack(node_features)
graph['attr'] = node_features
self.nattrs_flag = True
else:
node_features = None
graph['attr'] = node_features
graph['nlabel'] = np.array(
nlabels, dtype="int64").reshape(-1, 1)
if len(self.nlabel_dict) > 1:
self.nlabels_flag = True
graph['edges'] = edges
assert num_edges == len(edges)
g = pgl.graph.Graph(
num_nodes=graph['num_nodes'],
edges=graph['edges'],
node_feat={
'nlabel': graph['nlabel'],
'attr': graph['attr']
})
self.graph_list.append(g)
# update statistics of graphs
self.n += graph['num_nodes']
self.m += num_edges
# if no attr
if not self.nattrs_flag:
log.info('there are no node features in this dataset!')
label2idx = {}
# generate node attr by node degree
if self.degree_as_nlabel:
log.info('generate node features by node degree...')
nlabel_set = set([])
for g in self.graph_list:
g.node_feat['nlabel'] = g.indegree()
# extracting unique node labels
nlabel_set = nlabel_set.union(set(g.node_feat['nlabel']))
g.node_feat['nlabel'] = g.node_feat['nlabel'].reshape(-1,
1)
nlabel_set = list(nlabel_set)
# in case the labels/degrees are not continuous numbers
self.ndegree_dict = {
nlabel_set[i]: i
for i in range(len(nlabel_set))
}
label2idx = self.ndegree_dict
# generate node attr by node label
else:
log.info('generate node features by node label...')
label2idx = self.nlabel_dict
for g in self.graph_list:
attr = np.zeros((g.num_nodes, len(label2idx)))
idx = [
label2idx[tag]
for tag in g.node_feat['nlabel'].reshape(-1, )
]
attr[np.arange(g.num_nodes), idx] = 1  # one-hot of each node's label
g.node_feat['attr'] = attr.astype("float32")
# after load, get the #classes and #dim
self.gclasses = len(self.glabel_dict)
self.nclasses = len(self.nlabel_dict)
self.eclasses = len(self.elabel_dict)
self.dim_nfeats = len(self.graph_list[0].node_feat['attr'][0])
message = "finished loading data\n"
message += """
num_graph: %d
num_graph_class: %d
total_num_nodes: %d
node Classes: %d
node_features_dim: %d
num_edges: %d
edge_classes: %d
Avg. of #Nodes: %.2f
Avg. of #Edges: %.2f
Graph Relabeled: %s
Node Relabeled: %s
Degree Relabeled(If degree_as_nlabel=True): %s""" % (
self.num_graph,
self.gclasses,
self.n,
self.nclasses,
self.dim_nfeats,
self.m,
self.eclasses,
self.n / self.num_graph,
self.m / self.num_graph,
self.glabel_dict,
self.nlabel_dict,
self.ndegree_dict, )
log.info(message)
if __name__ == "__main__":
gindataset = GINDataset(
"./dataset/", "MUTAG", self_loop=True, degree_as_nlabel=False)
# Graph Isomorphism Network (GIN)
[Graph Isomorphism Network \(GIN\)](https://arxiv.org/pdf/1810.00826.pdf) is a simple graph neural network designed to be as expressive as the Weisfeiler-Lehman graph isomorphism test. Based on PGL, we reproduce the GIN model.
### Datasets
The dataset can be downloaded from [here](https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip).
After downloading and uncompressing the data, a directory named `./dataset/` can be found in the current directory. Note that the current directory is the root directory of the GIN model.
### Dependencies
- paddlepaddle >= 1.6
- pgl 1.0.2
### How to run
For example, to train the GIN model on the MUTAG dataset with a GPU:
```
python main.py --use_cuda --dataset_name MUTAG --data_path ./dataset
```
### Hyperparameters
- data\_path: the root path of your dataset
- dataset\_name: the name of the dataset
- fold\_idx: which fold of the split dataset to hold out; we use 10-fold cross-validation
- train\_eps: whether the $\epsilon$ parameter is learnable (see the example below).
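For instance, to evaluate on the fourth fold with a learnable $\epsilon$ (both flags are defined in the argument parser of `main.py` above):
```
python main.py --use_cuda --dataset_name MUTAG --data_path ./dataset --fold_idx 3 --train_eps
```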
### Experiment results (Accuracy)
| |MUTAG | COLLAB | IMDBBINARY | IMDBMULTI |
|--|-------------|----------|------------|-----------------|
|PGL result | 90.8 | 78.6 | 76.8 | 50.8 |
|paper result |90.0 | 80.0 | 75.1 | 52.3 |
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the graph dataloader.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import os
import sys
import time
import argparse
import numpy as np
import collections
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils import mp_reader
from pgl.utils.logger import log
def batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
perm = np.arange(size)
np.random.shuffle(perm)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""scan_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
class GraphDataloader(object):
"""Graph Dataloader
"""
def __init__(
self,
dataset,
batch_size,
seed=0,
num_workers=1,
buf_size=1000,
shuffle=True, ):
self.shuffle = shuffle
self.seed = seed
self.num_workers = num_workers
self.buf_size = buf_size
self.batch_size = batch_size
self.dataset = dataset
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
graphs = [b[0] for b in batch_examples]
labels = [b[1] for b in batch_examples]
join_graph = pgl.graph.MultiGraph(graphs)
labels = np.array(labels, dtype="int64").reshape(-1, 1)
return join_graph, labels
def batch_iter(self, fid):
"""batch_iter"""
if self.shuffle:
for batch in batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
def __len__(self):
"""__len__"""
return len(self.dataset)
def __getitem__(self, idx):
"""__getitem__"""
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.dataset[idx]
def __iter__(self):
"""__iter__"""
def worker(filter_id):
def func_run():
for batch_examples in self.batch_iter(filter_id):
batch_dict = self.batch_fn(batch_examples)
yield batch_dict
return func_run
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0), self.buf_size)
else:
worker_pool = [worker(wid) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
"""scan"""
for example in self.dataset:
yield example
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the training process of GIN model.
"""
import os
import sys
import time
import argparse
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils.logger import log
from Dataset import GINDataset, fold10_split, random_split
from dataloader import GraphDataloader
from model import GINModel
def main(args):
"""main function"""
dataset = GINDataset(
args.data_path,
args.dataset_name,
self_loop=not args.train_eps,
degree_as_nlabel=True)
train_dataset, test_dataset = fold10_split(
dataset, fold_idx=args.fold_idx, seed=args.seed)
train_loader = GraphDataloader(train_dataset, batch_size=args.batch_size)
test_loader = GraphDataloader(
test_dataset, batch_size=args.batch_size, shuffle=False)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"gw", place=place, node_feat=dataset[0][0].node_feat_info())
model = GINModel(args, gw, dataset.gclasses)
model.forward()
infer_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
epoch_step = int(len(train_dataset) / args.batch_size) + 1
boundaries = [
i
for i in range(50 * epoch_step, args.epochs * epoch_step,
epoch_step * 50)
]
values = [args.lr * 0.5**i for i in range(0, len(boundaries) + 1)]
lr = fl.piecewise_decay(boundaries=boundaries, values=values)
train_op = fluid.optimizer.Adam(lr).minimize(model.loss)
exe = fluid.Executor(place)
exe.run(startup_program)
# train and evaluate
global_step = 0
for epoch in range(1, args.epochs + 1):
for idx, batch_data in enumerate(train_loader):
g, labels = batch_data
feed_dict = gw.to_feed(g)
feed_dict['labels'] = labels
ret_loss, ret_lr, ret_acc = exe.run(
train_program,
feed=feed_dict,
fetch_list=[model.loss, lr, model.acc])
global_step += 1
if global_step % 10 == 0:
message = "epoch %d | step %d | " % (epoch, global_step)
message += "lr %.6f | loss %.6f | acc %.4f" % (
ret_lr, ret_loss, ret_acc)
log.info(message)
# evaluate
result = evaluate(exe, infer_program, model, gw, test_loader)
message = "evaluating result"
for key, value in result.items():
message += " | %s %.6f" % (key, value)
log.info(message)
def evaluate(exe, prog, model, gw, loader):
"""evaluate"""
total_loss = []
total_acc = []
for idx, batch_data in enumerate(loader):
g, labels = batch_data
feed_dict = gw.to_feed(g)
feed_dict['labels'] = labels
ret_loss, ret_acc = exe.run(prog,
feed=feed_dict,
fetch_list=[model.loss, model.acc])
total_loss.append(ret_loss)
total_acc.append(ret_acc)
total_loss = np.mean(total_loss)
total_acc = np.mean(total_acc)
return {"loss": total_loss, "acc": total_acc}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default='./dataset')
parser.add_argument('--dataset_name', type=str, default='MUTAG')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--fold_idx', type=int, default=0)
parser.add_argument('--output_path', type=str, default='./outputs/')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--num_layers', type=int, default=5)
parser.add_argument('--num_mlp_layers', type=int, default=2)
parser.add_argument('--hidden_size', type=int, default=64)
parser.add_argument(
'--pool_type',
type=str,
default="sum",
choices=["sum", "average", "max"])
parser.add_argument('--train_eps', action='store_true')
parser.add_argument('--epochs', type=int, default=350)
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--dropout_prob', type=float, default=0.5)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
log.info(args)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file implement the GIN model.
"""
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.layers.conv import gin
class GINModel(object):
"""GINModel"""
def __init__(self, args, gw, num_class):
self.args = args
self.num_layers = self.args.num_layers
self.hidden_size = self.args.hidden_size
self.train_eps = self.args.train_eps
self.pool_type = self.args.pool_type
self.dropout_prob = self.args.dropout_prob
self.num_class = num_class
self.gw = gw
self.labels = fl.data(name="labels", shape=[None, 1], dtype="int64")
def forward(self):
"""forward"""
features_list = [self.gw.node_feat["attr"]]
for i in range(self.num_layers):
h = gin(self.gw,
features_list[i],
hidden_size=self.hidden_size,
activation="relu",
name="gin_%s" % (i),
init_eps=0.0,
train_eps=self.train_eps)
h = fl.layer_norm(
h,
begin_norm_axis=1,
param_attr=fluid.ParamAttr(
name="norm_scale_%s" % (i),
initializer=fluid.initializer.Constant(1.0)),
bias_attr=fluid.ParamAttr(
name="norm_bias_%s" % (i),
initializer=fluid.initializer.Constant(0.0)), )
h = fl.relu(h)
features_list.append(h)
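        # readout: sum per-layer predictions computed on the graph-pooled features of every layer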
output = 0
for i, h in enumerate(features_list):
pooled_h = pgl.layers.graph_pooling(self.gw, h, self.pool_type)
drop_h = fl.dropout(
pooled_h,
self.dropout_prob,
dropout_implementation="upscale_in_train")
output += fl.fc(drop_h,
size=self.num_class,
act=None,
param_attr=fluid.ParamAttr(name="final_fc_%s" %
(i)))
# calculate loss
self.loss = fl.softmax_with_cross_entropy(output, self.labels)
self.loss = fl.reduce_mean(self.loss)
self.acc = fl.accuracy(fl.softmax(output), self.labels)
# PGL - Knowledge Graph Embedding
## Introduction
This package is mainly for computing node and relation embeddings of knowledge graphs efficiently.
It reproduces the following knowledge embedding models:
- TransE
- TransR
- RotatE
## Dataset
The WN18 and FB15k datasets were originally published by the TransE paper and can be downloaded [here](https://everest.hds.utc.fr/doku.php?id=en:transe).
## Dependencies
If you want to use PGL-KGE in paddle, please install the following packages.
- paddlepaddle>=1.7
- pgl
## Experiment results
FB15k dataset
| Models |Mean Rank| Mrr | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter|
|----------|-------|-------|--------|--------|---------|---------|---------|
| TransE| 214 | -- | -- | -- | 0.491 | 118 | 0.668|
| TransR| 202 | -- | -- | -- | 0.502 | 115 | 0.683|
| RotatE| 156| -- | -- | -- | 0.498 | 52 | 0.710|
WN18 dataset
| Models |Mean Rank| Mrr | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter|
|----------|-------|-------|--------|--------|---------|---------|---------|
| TransE| 257 | -- | -- | -- | 0.800 | 245 | 0.915|
| TransR| 255 | -- | -- | -- | 0.8012| 243 | 0.9371|
| RotatE| 188 | -- | -- | -- | 0.8325| 176 | 0.9601|
## References
[1]. TransE https://ieeexplore.ieee.org/abstract/document/8047276
[2]. TransR http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523
[3]. RotatE https://arxiv.org/abs/1902.10197
#CUDA_VISIBLE_DEVICES=2 \
#FLAGS_fraction_of_gpu_memory_to_use=0.01 \
#python main.py \
# --use_cuda \
# --model TransE \
# --optimizer adam \
# --batch_size=512 \
# --learning_rate=0.001 \
# --epoch 100 \
# --evaluate_per_iteration 20 \
# --sample_workers 4 \
# --margin 4 \
## #--only_evaluate
#CUDA_VISIBLE_DEVICES=2 \
#FLAGS_fraction_of_gpu_memory_to_use=0.01 \
#python main.py \
# --use_cuda \
# --model RotatE \
# --data_dir ./data/WN18 \
# --optimizer adam \
# --batch_size=512 \
# --learning_rate=0.001 \
# --epoch 100 \
# --evaluate_per_iteration 100 \
# --sample_workers 10 \
# --margin 6 \
# --neg_times 10
CUDA_VISIBLE_DEVICES=2 \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 8 \
--neg_times 10 \
--neg_mode True
# PGL - Knowledge Graph Embedding
This package is mainly for computing node and relation embeddings of knowledge graphs efficiently.
It reproduces the following knowledge embedding models, whose score functions are sketched after the list:
- TransE
- TransR
- RotatE
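For reference, the score (distance) functions these models assign to a triple (h, r, t) are sketched below in their standard form from the original papers; lower is better, and the exact norms and regularization used in this package may differ:

```latex
f_{\mathrm{TransE}}(h,r,t) = \lVert \mathbf{h} + \mathbf{r} - \mathbf{t} \rVert
f_{\mathrm{TransR}}(h,r,t) = \lVert \mathbf{M}_r\mathbf{h} + \mathbf{r} - \mathbf{M}_r\mathbf{t} \rVert
f_{\mathrm{RotatE}}(h,r,t) = \lVert \mathbf{h} \circ \mathbf{r} - \mathbf{t} \rVert, \quad \mathbf{h},\mathbf{r},\mathbf{t} \in \mathbb{C}^d,\ \lvert r_i \rvert = 1
```

where \(\mathbf{M}_r\) is a relation-specific projection matrix and \(\circ\) is the element-wise (complex) product.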
### Dataset
The WN18 and FB15k datasets were originally published by the TransE paper and can be downloaded [here](https://everest.hds.utc.fr/doku.php?id=en:transe).
FB15k: [https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8](https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8)
WN18: [https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD](https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD)
### Dependencies
If you want to use PGL-KGE in paddle, please install the following packages.
- paddlepaddle>=1.7
- pgl
### Hyperparameters
- use\_cuda: use CUDA to train.
- model: the PGL-KGE model name; `TransE`, `TransR` and `RotatE` are currently available.
- data\_dir: the path of the dataset.
- optimizer: optimizer used to train the model.
- batch\_size: batch size.
- learning\_rate: learning rate.
- epoch: number of epochs to run.
- evaluate\_per\_iteration: evaluate every this many epochs.
- sample\_workers: number of sampling workers used to prepare data.
- margin: the margin hyper-parameter used in the loss of some models.

For more hyper-parameter usage, please refer to `main.py`. We also provide a `run.sh` script to reproduce the performance results (please download the datasets into `./data` and specify the data\_dir parameter).
### How to run
For example, to train a TransR model on the WN18 dataset with a GPU
(please download the WN18 dataset to the `./data` folder first):
```
python main.py --use_cuda --model TransR --data_dir ./data/WN18
```
We also provide a `run.sh` script to reproduce the following performance results.
### Experiment results
Here we report the experiment results on the FB15k and WN18 datasets. The evaluation criteria are MR (mean rank), Mrr (mean reciprocal rank) and Hits@N (the proportion of correct entities ranked in the top N). The suffix `@f` denotes the filtered setting, in which triples already present in the graph are excluded when ranking.
FB15k dataset
| Models | MR | Mrr | Hits@1 | Hits@3 | Hits@10| MR@f |Mrr@f|Hit1@f|Hit3@f|Hits10@f|
|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------|
| TransE | 215 | 0.205 | 0.093 | 0.234 | 0.446 | 74 |0.379| 0.235| 0.453| 0.647 |
| TransR | 304 | 0.193 | 0.092 | 0.211 | 0.418 | 156 |0.366| 0.232| 0.435| 0.623 |
| RotatE | 157 | 0.270 | 0.162 | 0.303 | 0.501 | 53 |0.478| 0.354| 0.547| 0.710 |
WN18 dataset
| Models | MR | Mrr | Hits@1 | Hits@3 | Hits@10| MR@f |Mrr@f|Hit1@f|Hit3@f|Hits10@f|
|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------|
| TransE | 219 | 0.338 | 0.082 | 0.523 | 0.800 | 208 |0.463| 0.135| 0.771| 0.932 |
| TransR | 321 | 0.370 | 0.096 | 0.591 | 0.810 | 309 |0.513| 0.158| 0.941| 0.941 |
| RotatE | 167 | 0.623 | 0.476 | 0.688 | 0.830 | 155 |0.915| 0.884| 0.941| 0.957 |
## References
[1]. [TransE: Translating embeddings for modeling multi-relational data.](https://ieeexplore.ieee.org/abstract/document/8047276)
[2]. [TransR: Learning entity and relation embeddings for knowledge graph completion.](http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523)
[3]. [RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space.](https://arxiv.org/abs/1902.10197)
@@ -19,10 +19,11 @@ import os
 import numpy as np
 from collections import defaultdict
 from pgl.utils.logger import log
-from pybloom import BloomFilter
+#from pybloom import BloomFilter


-class KBloader:
+class KGLoader:
     """
     load the FB15K
     """
@@ -65,8 +66,9 @@ class KBloader:
     def training_data_no_filter(self, train_triple_positive):
         """faster, no filter for exists triples"""
-        size = len(train_triple_positive)
-        train_triple_negative = train_triple_positive + 0
+        size = len(train_triple_positive) * self._neg_times
+        train_triple_negative = train_triple_positive.repeat(
+            self._neg_times, axis=0)
         replace_head_probability = 0.5 * np.ones(size)
         replace_entity_id = np.random.randint(self.entity_total, size=size)
         random_num = np.random.random(size=size)
@@ -122,7 +124,6 @@ class KBloader:
         """
         n = len(self._triple_train)
         rand_idx = np.random.permutation(n)
-        rand_idx = rand_idx % n
         n_triple = len(rand_idx)
         start = 0
         while start < n_triple:
...
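The rewritten `training_data_no_filter` above replaces the old one-negative-per-positive copy with a vectorized repeat-and-corrupt step. Below is a minimal numpy sketch of the idea (the helper name `sample_negatives` is made up for illustration; the real method also assembles the positive batch and, unlike `training_data_map`, skips filtering corrupted triples that happen to exist in the graph):

```python
import numpy as np

def sample_negatives(pos_triples, entity_total, neg_times, seed=0):
    """Repeat each positive (head, relation, tail) neg_times and corrupt head or tail."""
    rng = np.random.RandomState(seed)
    neg = pos_triples.repeat(neg_times, axis=0)          # [N * neg_times, 3]
    size = len(neg)
    rand_entity = rng.randint(entity_total, size=size)   # replacement entity ids
    replace_head = rng.random_sample(size) < 0.5         # corrupt head with prob 0.5
    neg[replace_head, 0] = rand_entity[replace_head]     # column 0 holds the head id
    neg[~replace_head, 2] = rand_entity[~replace_head]   # column 2 holds the tail id
    return neg

# two positives, three negatives each -> six corrupted triples
pos = np.array([[0, 1, 2], [3, 1, 4]])
print(sample_negatives(pos, entity_total=10, neg_times=3))
```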
@@ -99,8 +99,10 @@ class Evaluate:
                 feed=batch_feed_dict)
             yield batch_feed_dict["test_triple"], head_score, tail_score
             n_used_eval_triple += 1
-            print('[{:.3f}s] #evaluation triple: {}/{}'.format(
-                timeit.default_timer() - start, n_used_eval_triple, 5000))
+            if n_used_eval_triple % 500 == 0:
+                print('[{:.3f}s] #evaluation triple: {}/{}'.format(
+                    timeit.default_timer(
+                    ) - start, n_used_eval_triple, self.reader.test_num))
         res_reader = mp_reader_mapper(
             reader=iterator,
...
@@ -16,10 +16,13 @@ The script to run these models.
 """
 import argparse
 import timeit
+import os
+
+import numpy as np
 import paddle.fluid as fluid
-from data_loader import KBloader
+from data_loader import KGLoader
 from evalutate import Evaluate
 from model import model_dict
+from model.utils import load_var
 from mp_mapper import mp_reader_mapper
 from pgl.utils.logger import log
@@ -49,6 +52,7 @@ def run_round(batch_iter,
     run_time = 0
     data_time = 0
     t2 = timeit.default_timer()
+    start_epoch_time = timeit.default_timer()
     for batch_feed_dict in batch_iter():
         batch += 1
         t1 = timeit.default_timer()
@@ -62,8 +66,11 @@ def run_round(batch_iter,
         if batch % log_per_step == 0:
             tmp_epoch += 1
             if prefix == "train":
-                log.info("Epoch %s Ava Loss %s" %
-                         (epoch + tmp_epoch, tmp_loss / batch))
+                log.info("Epoch %s (%.7f sec) Train Loss: %.7f" %
+                         (epoch + tmp_epoch,
+                          timeit.default_timer() - start_epoch_time,
+                          tmp_loss[0] / batch))
+                start_epoch_time = timeit.default_timer()
             else:
                 log.info("Batch %s" % batch)
             batch = 0
@@ -84,7 +91,7 @@ def train(args):
     :param args: all args.
     :return: None
     """
-    kgreader = KBloader(
+    kgreader = KGLoader(
         batch_size=args.batch_size,
         data_dir=args.data_dir,
         neg_mode=args.neg_mode,
@@ -117,8 +124,8 @@ def train(args):
         reader = mp_reader_mapper(
             data_repeat,
-            func=kgreader.training_data_map,
-            #func=kgreader.training_data_no_filter,
+            func=kgreader.training_data_no_filter
+            if args.nofilter else kgreader.training_data_map,
             num_works=args.sample_workers)
         return reader
@@ -148,6 +155,20 @@ def train(args):
     exe = fluid.Executor(places[0])
     exe.run(model.startup_program)
     exe.run(fluid.default_startup_program())
+    if args.pretrain and model.model_name in ["TransR", "transr"]:
+        pretrain_ent = os.path.join(args.checkpoint,
+                                    model.ent_name.replace("TransR", "TransE"))
+        pretrain_rel = os.path.join(args.checkpoint,
+                                    model.rel_name.replace("TransR", "TransE"))
+        if os.path.exists(pretrain_ent):
+            print("loading pretrain!")
+            #var = fluid.global_scope().find_var(model.ent_name)
+            load_var(exe, model.train_program, model.ent_name, pretrain_ent)
+            #var = fluid.global_scope().find_var(model.rel_name)
+            load_var(exe, model.train_program, model.rel_name, pretrain_rel)
+        else:
+            raise ValueError("pretrain file {} not exists!".format(
+                pretrain_ent))
     prog = fluid.CompiledProgram(model.train_program).with_data_parallel(
         loss_name=model.train_fetch_vars[0].name)
@@ -182,9 +203,9 @@ def train(args):
             log_per_step=kgreader.train_num // args.batch_size,
             epoch=epoch * args.evaluate_per_iteration)
         log.info("epoch\t%s" % ((1 + epoch) * args.evaluate_per_iteration))
-        if True:
-            fluid.io.save_params(
-                exe, dirname=args.checkpoint, main_program=model.train_program)
+        fluid.io.save_params(
+            exe, dirname=args.checkpoint, main_program=model.train_program)
+        if not args.noeval:
             eva = Evaluate(kgreader)
             eva.launch_evaluation(
                 exe=exe,
@@ -273,6 +294,22 @@ def main():
     parser.add_argument(
         '--neg_mode', type=bool, help='return neg mode flag', default=False)
+    parser.add_argument(
+        '--nofilter',
+        type=bool,
+        help='don\'t filter invalid examples',
+        default=False)
+    parser.add_argument(
+        '--pretrain',
+        type=bool,
+        help='pretrain for TransR model',
+        default=False)
+    parser.add_argument(
+        '--noeval',
+        type=bool,
+        help='whether to evaluate the result',
+        default=False)
     args = parser.parse_args()
     log.info(args)
     train(args)
...
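One caveat with the new `--nofilter`, `--pretrain` and `--noeval` flags: argparse's `type=bool` simply calls `bool()` on the raw string, so any non-empty value parses as `True`, even the literal string `False`. Passing `--pretrain True` (as the `run.sh` below does) works, but so would `--pretrain False`; the only way to get `False` is to omit the flag. A quick demonstration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--nofilter", type=bool, default=False)
print(parser.parse_args([]).nofilter)                       # False (default used)
print(parser.parse_args(["--nofilter", "False"]).nofilter)  # True: bool("False") is truthy
```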
@@ -13,9 +13,9 @@
 # limitations under the License.
 """
 RotatE:
-"Learning entity and relation embeddings for knowledge graph completion."
-Lin, Yankai, et al.
-https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9571/9523
+"RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space."
+Sun, Zhiqing, et al.
+https://arxiv.org/abs/1902.10197
 """
 import paddle.fluid as fluid
 from .Model import Model
...
@@ -34,6 +34,7 @@ class TransE(Model):
                  learning_rate,
                  args,
                  optimizer="adam"):
+        self._neg_times = args.neg_times
         super(TransE, self).__init__(
             model_name="TransE",
             data_reader=data_reader,
@@ -84,6 +85,9 @@ class TransE(Model):
             fluid.layers.abs(pos_score), 1, keep_dim=False)
         neg = fluid.layers.reduce_sum(
             fluid.layers.abs(neg_score), 1, keep_dim=False)
+        neg = fluid.layers.reshape(
+            neg, shape=[-1, self._neg_times], inplace=True)
+
         loss = fluid.layers.reduce_mean(
             fluid.layers.relu(pos - neg + self._margin))
         return [loss]
...
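With the reshape above, each positive triple is compared against its own `neg_times` corrupted copies. A numpy sketch of the resulting margin ranking loss (the helper name is made up; `pos` and `neg_flat` hold the L1 distances computed by the model, and the broadcasting here mirrors what the fluid graph is expected to do):

```python
import numpy as np

def margin_rank_loss(pos, neg_flat, neg_times, margin):
    """pos: [B] distances of positives; neg_flat: [B * neg_times] distances of negatives."""
    neg = neg_flat.reshape(-1, neg_times)                 # [B, neg_times]
    # hinge: a negative should score at least `margin` worse than its positive
    return np.mean(np.maximum(pos[:, None] - neg + margin, 0.0))

pos = np.array([1.0, 2.0])
neg = np.array([3.0, 0.5, 2.5, 4.0, 1.0, 2.0])            # three negatives per positive
print(margin_rank_loss(pos, neg, neg_times=3, margin=1.0))
```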
@@ -36,6 +36,7 @@ class TransR(Model):
                  args,
                  optimizer="adam"):
         """init"""
+        self._neg_times = args.neg_times
         super(TransR, self).__init__(
             model_name="TransR",
             data_reader=data_reader,
@@ -60,19 +61,19 @@ class TransR(Model):
             dtype="float32",
             name=self.rel_name,
             default_initializer=fluid.initializer.Xavier())
+        init_values = np.tile(
+            np.identity(
+                self._hidden_size, dtype="float32").reshape(-1),
+            (self._relation_total, 1))
         transfer_matrix = fluid.layers.create_parameter(
             shape=[
                 self._relation_total, self._hidden_size * self._hidden_size
             ],
             dtype="float32",
-            name=self._prefix + "transfer_matrix", )
-        # Here is a trick, must init with identity matrix to get good hit@10 performance.
-        fluid.layers.assign(
-            np.tile(
-                np.identity(
-                    self._hidden_size, dtype="float32").reshape(-1),
-                (self._relation_total, 1)),
-            transfer_matrix)
+            name=self._prefix + "transfer_matrix",
+            default_initializer=fluid.initializer.NumpyArrayInitializer(
+                init_values))
         return entity_embedding, relation_embedding, transfer_matrix

     def score_with_l2_normalize(self, head, rel, tail):
@@ -111,7 +112,7 @@ class TransR(Model):
         pos_head_trans = self.matmul_with_expend_dims(pos_head, rel_matrix)
         pos_tail_trans = self.matmul_with_expend_dims(pos_tail, rel_matrix)

-        trans_neg = False
+        trans_neg = True
         if trans_neg:
             rel_matrix_neg = fluid.layers.reshape(
                 lookup_table(self.train_neg_input[:, 1], transfer_matrix),
@@ -133,6 +134,9 @@ class TransR(Model):
             fluid.layers.abs(pos_score), -1, keep_dim=False)
         neg = fluid.layers.reduce_sum(
             fluid.layers.abs(neg_score), -1, keep_dim=False)
+        neg = fluid.layers.reshape(
+            neg, shape=[-1, self._neg_times], inplace=True)
+
         loss = fluid.layers.reduce_mean(
             fluid.layers.relu(pos - neg + self._margin))
         return [loss]
...
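The `NumpyArrayInitializer` change keeps the trick the deleted comment described: every relation's transfer matrix starts as the identity, so at initialization the TransR projection is a no-op and the model scores triples exactly like TransE, which is what makes warm-starting from pretrained TransE embeddings effective. A quick numpy check of the initializer's layout (the sizes here are made up):

```python
import numpy as np

hidden_size, relation_total = 4, 3
init_values = np.tile(
    np.identity(hidden_size, dtype="float32").reshape(-1),
    (relation_total, 1))                                  # [relation_total, hidden_size ** 2]

m_r = init_values[0].reshape(hidden_size, hidden_size)    # transfer matrix of relation 0
h = np.random.rand(hidden_size).astype("float32")
assert np.allclose(m_r @ h, h)  # identity projection: TransR behaves like TransE at init
```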
@@ -56,3 +56,64 @@ def lookup_table_gather(index, input):
     :return:
     """
     return fluid.layers.gather(index=index, input=input, overwrite=False)
+
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, fluid.Variable)
+    if var.desc.type() == fluid.core.VarDesc.VarType.LOD_TENSOR:
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
+    else:
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            persistable=True)
+
+
+def load_var(executor, main_program=None, var=None, filename=None):
+    """
+    Load a single persistable variable into a certain program.
+    :param executor: executor
+    :param main_program: the program to load into
+    :param var: the variable name in main_program
+    :param filename: the file to load the variable from
+    :return: None
+    """
+    load_prog = fluid.Program()
+    load_block = load_prog.global_block()
+
+    if main_program is None:
+        main_program = fluid.default_main_program()
+    if not isinstance(main_program, fluid.Program):
+        raise TypeError("program should be as Program type or None")
+
+    vars = list(filter(None, main_program.list_vars()))
+
+    # save origin param shape
+    orig_para_shape = {}
+    load_var_map = {}
+    for each_var in vars:
+        if each_var.name != var:
+            continue
+        assert isinstance(each_var, fluid.Variable)
+        if each_var.type == fluid.core.VarDesc.VarType.RAW:
+            continue
+
+        if isinstance(each_var, fluid.framework.Parameter):
+            orig_para_shape[each_var.name] = tuple(each_var.desc.get_shape())
+        new_var = _clone_var_in_block_(load_block, each_var)
+        if filename is not None:
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [new_var]},
+                attrs={'file_path': filename})
+
+    executor.run(load_prog)
@@ -65,12 +65,16 @@ def mp_reader_mapper(reader, func, num_works=4):
         all_process.append(p)

     data_iter = reader()
+    if not hasattr(data_iter, "__next__"):
+        __next__ = data_iter.next
+    else:
+        __next__ = data_iter.__next__

     def next_data():
         """next_data"""
         _next = None
         try:
-            _next = data_iter.next()
+            _next = __next__()
         except StopIteration:
             # log.debug(traceback.format_exc())
             pass
...
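The hunk above is a Python 2/3 compatibility fix: under Python 2 an iterator advances via `.next()`, under Python 3 via `.__next__()`. A minimal sketch of the same pattern (the helper name is made up):

```python
def get_advance(it):
    """Return the advance function of an iterator under both Python 2 and 3."""
    return it.next if not hasattr(it, "__next__") else it.__next__

step = get_advance(iter([1, 2, 3]))
print(step(), step())  # 1 2
```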
device=3
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=1024 \
--learning_rate=0.001 \
--epoch 200 \
--evaluate_per_iteration 200 \
--sample_workers 1 \
--margin 1.0 \
--nofilter True \
--neg_times 10 \
--neg_mode True
#--only_evaluate
# TransE FB15k
# -----Raw-Average-Results
# MeanRank: 214.94, MRR: 0.2051, Hits@1: 0.0929, Hits@3: 0.2343, Hits@10: 0.4458
# -----Filter-Average-Results
# MeanRank: 74.41, MRR: 0.3793, Hits@1: 0.2351, Hits@3: 0.4538, Hits@10: 0.6570
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=1024 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 1 \
--margin 4 \
--nofilter True \
--neg_times 10 \
--neg_mode True
# TransE WN18
# -----Raw-Average-Results
# MeanRank: 219.08, MRR: 0.3383, Hits@1: 0.0821, Hits@3: 0.5233, Hits@10: 0.7997
# -----Filter-Average-Results
# MeanRank: 207.72, MRR: 0.4631, Hits@1: 0.1349, Hits@3: 0.7708, Hits@10: 0.9315
# for pretrain
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 30 \
--evaluate_per_iteration 30 \
--sample_workers 1 \
--margin 2.0 \
--nofilter True \
--noeval True \
--neg_times 10 \
--neg_mode True && \
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransR \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 200 \
--evaluate_per_iteration 200 \
--sample_workers 1 \
--margin 2.0 \
--pretrain True \
--nofilter True \
--neg_times 10 \
--neg_mode True
# FB15k TransR 200, pretrain 20
# -----Raw-Average-Results
# MeanRank: 303.81, MRR: 0.1931, Hits@1: 0.0920, Hits@3: 0.2109, Hits@10: 0.4181
# -----Filter-Average-Results
# MeanRank: 156.30, MRR: 0.3663, Hits@1: 0.2318, Hits@3: 0.4352, Hits@10: 0.6231
# for pretrain
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 30 \
--evaluate_per_iteration 30 \
--sample_workers 1 \
--margin 4.0 \
--nofilter True \
--noeval True \
--neg_times 10 \
--neg_mode True && \
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransR \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 1 \
--margin 4.0 \
--pretrain True \
--nofilter True \
--neg_times 10 \
--neg_mode True
# TransR WN18 100, pretrain 30
# -----Raw-Average-Results
# MeanRank: 321.41, MRR: 0.3706, Hits@1: 0.0955, Hits@3: 0.5906, Hits@10: 0.8099
# -----Filter-Average-Results
# MeanRank: 309.15, MRR: 0.5126, Hits@1: 0.1584, Hits@3: 0.8601, Hits@10: 0.9409
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 8 \
--neg_times 10 \
--neg_mode True
# RotatE FB15k
# -----Raw-Average-Results
# MeanRank: 156.85, MRR: 0.2699, Hits@1: 0.1615, Hits@3: 0.3031, Hits@10: 0.5006
# -----Filter-Average-Results
# MeanRank: 53.35, MRR: 0.4776, Hits@1: 0.3537, Hits@3: 0.5473, Hits@10: 0.7062
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 6 \
--neg_times 10 \
--neg_mode True
# RotatE WN18
# -----Raw-Average-Results
# MeanRank: 167.27, MRR: 0.6025, Hits@1: 0.4764, Hits@3: 0.6880, Hits@10: 0.8298
# -----Filter-Average-Results
# MeanRank: 155.23, MRR: 0.9145, Hits@1: 0.8843, Hits@3: 0.9412, Hits@10: 0.9570
# Graph Property Prediction for Open Graph Benchmark (OGB)
[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we tackle the graph property prediction task based on PGL.
### Requirements
- paddlepaddle >= 1.7.1
- pgl 1.0.2
- ogb
NOTE: To install a version of ogb that fits this project, run the commands below:
```
git clone https://github.com/snap-stanford/ogb.git
cd ogb
git checkout 482c40bc9f31fe25f9df5aa11c8fb657bd2b1621
python setup.py install
```
### How to run
For example, use a GPU to train models on the ogbg-molhiv and ogbg-molpcba datasets.
```
CUDA_VISIBLE_DEVICES=1 python -u main.py --config hiv_config.yaml --use_cuda
CUDA_VISIBLE_DEVICES=2 python -u main.py --config pcba_config.yaml --use_cuda
```
If you want to use a CPU to train the model, the environment variable `CPU_NUM` should be set to a value in the range of 1 to N, where N is the total number of CPUs on your machine.
```
CPU_NUM=1 python -u main.py --config hiv_config.yaml
CPU_NUM=1 python -u main.py --config pcba_config.yaml
```
### Experiment results
| model | hiv (rocauc)| pcba (prcauc)|
|-------|-------------|--------------|
| GIN |0.7719 (0.0079) | 0.2232 (0.0018) |
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import time
import argparse
from utils.args import ArgumentGroup
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--use_cuda', action='store_true')
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("./save_dir", str, "./checkpoints", "Path to save checkpoints.")
model_g.add_arg("hidden_size", int, 128, "hidden size.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
train_g.add_arg("init_loss_scaling", float, 102400,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save")
train_g.add_arg("metric", str, "simple_accuracy", "metric")
train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.")
train_g.add_arg("decr_every_n_nan_or_inf", int, 2,
"Decreases loss scaling every n accumulated steps with nan or inf gradients.")
train_g.add_arg("incr_ratio", float, 2.0,
"The multiplier to use when increasing the loss scaling.")
train_g.add_arg("decr_ratio", float, 0.8,
"The less-than-one-multiplier to use when decreasing.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
log_g.add_arg("log_dir", str, './logs/', "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("tokenizer", str, "FullTokenizer",
"ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer")
data_g.add_arg("train_set", str, None, "Path to training data.")
data_g.add_arg("test_set", str, None, "Path to test data.")
data_g.add_arg("dev_set", str, None, "Path to validation data.")
data_g.add_arg("aug1_type", str, "scheme1", "augment type")
data_g.add_arg("aug2_type", str, "scheme1", "augment type")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.")
data_g.add_arg("random_seed", int, None, "Random seed.")
data_g.add_arg("buf_size", int, 1000, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("for_cn", bool, True, "model train for cn or for other langs.")
run_type_g.add_arg("num_workers", int, 1, "use multiprocess to generate graph")
run_type_g.add_arg("output_dir", str, None, "path to save model")
run_type_g.add_arg("config", str, None, "configure yaml file")
run_type_g.add_arg("n", str, None, "task name")
run_type_g.add_arg("task_name", str, None, "task name")
run_type_g.add_arg("pretrain", bool, False, "Whether do pretrian")
run_type_g.add_arg("pretrain_name", str, None, "pretrain task name")
run_type_g.add_arg("pretrain_config", str, None, "pretrain config.yaml file")
run_type_g.add_arg("pretrain_model_step", str, None, "pretrain model step")
run_type_g.add_arg("model_type", str, "BaseLineModel", "pretrain model step")
run_type_g.add_arg("num_class", int, 1, "number class")
run_type_g.add_arg("dataset_name", str, None, "finetune dataset name")
run_type_g.add_arg("eval_metrics", str, None, "evaluate metrics")
run_type_g.add_arg("task_type", str, None, "regression or classification")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.