Commit cc0a1a85 authored by Y Yelrose

Merge branch 'PaddlePaddle-develop'

# Distributed metapath2vec, metapath2vec++, multi-metapath2vec++ in PGL
[metapath2vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf) is an algorithmic framework for representation learning in heterogeneous networks, which contain multiple types of nodes and links. Given a heterogeneous graph, the metapath2vec algorithm first generates meta-path-based random walks and then trains a skipgram model on them. Based on PGL, we reproduce the metapath2vec algorithm in distributed mode.
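For intuition, a meta-path such as ```a2p-p2a``` (author -> paper -> author) constrains each step of the walk to a given edge type. Below is a minimal standalone sketch of such a walk; the ```neighbors``` lookup is a hypothetical stand-in for a real heterogeneous graph, not the PGL API.
```python
import random

def metapath_walk(neighbors, start_node, meta_path, walk_len):
    """One meta-path-guided random walk.

    neighbors: dict mapping (node, edge_type) -> list of reachable nodes.
    meta_path: list of edge types, cycled over, e.g. ["a2p", "p2a"].
    """
    walk = [start_node]
    cur = start_node
    for i in range(walk_len - 1):
        edge_type = meta_path[i % len(meta_path)]
        candidates = neighbors.get((cur, edge_type), [])
        if not candidates:  # dead end: no neighbor of the required type
            break
        cur = random.choice(candidates)
        walk.append(cur)
    return walk

# toy graph: author a1 wrote p1; p1 has authors a1 and a2
neighbors = {("a1", "a2p"): ["p1"], ("p1", "p2a"): ["a1", "a2"]}
print(metapath_walk(neighbors, "a1", ["a2p", "p2a"], walk_len=5))
```
The generated walks are then fed to the skipgram model exactly as word sequences would be.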
### Datasets
DBLP: The dataset contains 14376 papers (P), 20 conferences (C), 14475 authors (A), and 8920 terms (T), giving 37791 nodes in total.
You can download the dataset from [here](https://github.com/librahu/HIN-Datasets-for-Recommendation-and-Network-Embedding).
We use the ```DBLP``` dataset as an example. After downloading the dataset, put the files, let's say, in ```./data/DBLP/```.
### Dependencies
- paddlepaddle>=1.6
- pgl>=1.0.0
### How to run
Before training, run the command below to preprocess the data.
```sh
python data_process.py --data_path ./data/DBLP --output_path ./data/data_processed
...@@ -30,11 +30,21 @@ python multi_class.py --dataset ./data/data_processed/author_label.txt --ckpt_pa
```
### Model Selection
There are three models in this example: ```metapath2vec```, ```metapath2vec++``` and ```multi_metapath2vec++```. You can select among them by modifying ```config.yaml```.
To run the ```metapath2vec++``` model, simply set the hyperparameter **neg_sample_type** to **m2v_plus**, and ```metapath2vec++``` will be selected.
```multi-metapath2vec++``` means that instead of a single metapath, several metapaths are used at the same time to train the model. For example, to use ```c2p-p2a-a2p-p2c``` and ```p2a-a2p``` simultaneously, set the hyperparameters below in ```config.yaml``` (a consolidated sketch follows the list).
- **neg_sample_type**: "m2v_plus"
- **walk_mode**: "multi_m2v"
- **meta_path**: "c2p-p2a-a2p-p2c;p2a-a2p"
- **first_node_type**: "c;p"
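Putting the settings together, the relevant section of ```config.yaml``` would look like the sketch below (other keys omitted):
```yaml
# multi-metapath2vec++ settings (sketch; remaining keys unchanged)
neg_sample_type: "m2v_plus"
walk_mode: "multi_m2v"
meta_path: "c2p-p2a-a2p-p2c;p2a-a2p"
first_node_type: "c;p"
```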
### Hyperparameters
All the hyperparameters are saved in the ```config.yaml``` file, so before training you can open ```config.yaml``` and modify them as you like.
Some important hyperparameters in ```config.yaml```:
- **edge_path**: the directory of the graph data that you want to load
- **lr**: learning rate
- **neg_num**: number of negative samples.
......
...@@ -31,7 +31,7 @@ is_distributed: False
# training config
epochs: 10
optimizer: "sgd"
lr: 0.1
warm_start_from_dir: null
walkpath_files: "None"
train_files: "None"
......
...@@ -87,9 +87,12 @@ class NodeGenerator(object):
idx = cc % num_n_type
n_type = n_type_list[idx]
try:
nodes = next(node_generators[n_type])
except StopIteration as e:
log.info("node type of %s iteration finished in one epoch" %
(n_type))
node_generators[n_type] = \
self.graph.node_batch_iter(self.batch_size, n_type=n_type)
break
yield (nodes, idx)
cc += 1
......
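The change above restarts an exhausted per-type iterator so the next epoch can resume from a fresh pass over that node type. A minimal standalone sketch of the same round-robin pattern (hypothetical names, not the PGL API):
```python
def round_robin_batches(make_iter, n_type_list):
    """Cycle over node types; rebuild a type's iterator once it is exhausted."""
    generators = {t: make_iter(t) for t in n_type_list}
    cc = 0
    while True:
        idx = cc % len(n_type_list)
        try:
            batch = next(generators[n_type_list[idx]])
        except StopIteration:
            # one epoch of this node type finished; rebuild for the next epoch
            generators[n_type_list[idx]] = make_iter(n_type_list[idx])
            break
        yield batch, idx
        cc += 1

batches = list(round_robin_batches(lambda t: iter([t + "1", t + "2"]), ["a", "p"]))
print(batches)  # [('a1', 0), ('p1', 1), ('a2', 0), ('p2', 1)]
```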
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base DataLoader
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import six
from io import open
from collections import namedtuple
import numpy as np
import tqdm
import paddle
from pgl.utils import mp_reader
import collections
import time
from pgl.utils.logger import log
import traceback
if six.PY3:
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
def batch_iter(data, perm, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
class BaseDataGenerator(object):
"""Base Data Geneartor"""
def __init__(self, buf_size, batch_size, num_workers, shuffle=True):
self.num_workers = num_workers
self.batch_size = batch_size
self.line_examples = []
self.buf_size = buf_size
self.shuffle = shuffle
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
raise NotImplementedError("No defined Batch Fn")
def batch_iter(self, fid, perm):
""" batch iterator"""
if self.shuffle:
for batch in batch_iter(self, perm, self.batch_size, fid, self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid, self.num_workers):
yield batch
def __len__(self):
return len(self.line_examples)
def __getitem__(self, idx):
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.line_examples[idx]
def generator(self):
"""batch dict generator"""
def worker(filter_id, perm):
""" multiprocess worker"""
def func_run():
""" func_run """
pid = os.getpid()
np.random.seed(pid + int(time.time()))
for batch_examples in self.batch_iter(filter_id, perm):
try:
batch_dict = self.batch_fn(batch_examples)
except Exception as e:
traceback.print_exc()
log.info(traceback.format_exc())
log.info(str(e))
continue
if batch_dict is None:
continue
yield batch_dict
return func_run
# consume a seed
np.random.rand()
if self.shuffle:
perm = np.arange(0, len(self))
np.random.shuffle(perm)
else:
perm = None
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0, perm), self.buf_size)
else:
worker_pool = [worker(wid, perm) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
for line_example in self.line_examples:
yield line_example
"""Graph Dataset
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import pgl
import sys
import numpy as np
from pgl.utils.logger import log
from dataset.base_dataset import BaseDataGenerator
from pgl.sample import alias_sample
from pgl.sample import pinsage_sample
from pgl.sample import graphsage_sample
from pgl.sample import edge_hash
class GraphGenerator(BaseDataGenerator):
def __init__(self, graph_wrappers, data, batch_size, samples,
num_workers, feed_name_list, use_pyreader,
phase, graph_data_path, shuffle=True, buf_size=1000):
super(GraphGenerator, self).__init__(
buf_size=buf_size,
num_workers=num_workers,
batch_size=batch_size, shuffle=shuffle)
# For iteration
self.line_examples = data
self.graph_wrappers = graph_wrappers
self.samples = samples
self.feed_name_list = feed_name_list
self.use_pyreader = use_pyreader
self.phase = phase
self.load_graph(graph_data_path)
self.num_layers = len(graph_wrappers)
def load_graph(self, graph_data_path):
self.graph = pgl.graph.MemmapGraph(graph_data_path)
self.alias = np.load(os.path.join(graph_data_path, "alias.npy"), mmap_mode="r")
self.events = np.load(os.path.join(graph_data_path, "events.npy"), mmap_mode="r")
self.term_ids = np.load(os.path.join(graph_data_path, "term_ids.npy"), mmap_mode="r")
def batch_fn(self, batch_ex):
# batch_ex = [
# (src, dst, neg),
# (src, dst, neg),
# (src, dst, neg),
# ]
#
batch_src = []
batch_dst = []
batch_neg = []
for batch in batch_ex:
batch_src.append(batch[0])
batch_dst.append(batch[1])
if len(batch) == 3: # default neg samples
batch_neg.append(batch[2])
if len(batch_src) != self.batch_size:
if self.phase == "train":
return None #Skip
if len(batch_neg) > 0:
batch_neg = np.unique(np.concatenate(batch_neg))
batch_src = np.array(batch_src, dtype="int64")
batch_dst = np.array(batch_dst, dtype="int64")
sampled_batch_neg = alias_sample(batch_dst.shape, self.alias, self.events)
if len(batch_neg) > 0:
batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
else:
batch_neg = sampled_batch_neg
# no edges need to be ignored in either phase for now
ignore_edges = set()
nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
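# sample fixed-size multi-hop neighborhoods (hop sizes given by self.samples) around the batch nodes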
subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges)
feed_dict = {}
for i in range(self.num_layers):
feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))
# only reindex from first subgraph
sub_src_idx = subgraphs[0].reindex_from_parrent_nodes(batch_src)
sub_dst_idx = subgraphs[0].reindex_from_parrent_nodes(batch_dst)
sub_neg_idx = subgraphs[0].reindex_from_parrent_nodes(batch_neg)
feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
#feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]]
return feed_dict
def __call__(self):
return self.generator()
def generator(self):
try:
for feed_dict in super(GraphGenerator, self).generator():
if self.use_pyreader:
yield [feed_dict[name] for name in self.feed_name_list]
else:
yield feed_dict
except Exception as e:
log.exception(e)
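For reference, the ```alias_sample``` call in ```batch_fn``` draws negatives in O(1) per sample from precomputed tables. A minimal numpy sketch of the draw step, assuming the usual alias-method convention that ```alias``` holds per-bucket acceptance probabilities and ```events``` the fallback bucket indices:
```python
import numpy as np

def alias_draw(shape, alias, events):
    """Alias-method sampling given precomputed probability/alias tables."""
    # pick a bucket uniformly, then accept it or fall back to its alias
    idx = np.random.randint(0, len(alias), size=shape)
    coin = np.random.rand(*shape)
    return np.where(coin < alias[idx], idx, events[idx])

# toy tables for a 3-outcome distribution
alias = np.array([1.0, 0.6, 0.4])
events = np.array([0, 0, 1])
print(alias_draw((5,), alias, events))
```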
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import pickle
import time
import glob
import os
import io
import traceback
import pickle as pkl
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
import yaml
from easydict import EasyDict as edict
import pgl
from pgl.utils.logger import log
from pgl.utils import paddle_helper
import paddle
import paddle.fluid as F
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class PredictData(object):
def __init__(self, num_nodes):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
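# shard nodes across trainers by striding: trainer t takes nodes t, t+T, t+2T, ...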
train_usr = np.arange(trainer_id, num_nodes, trainer_count)
#self.data = (train_usr, train_usr)
self.data = train_usr
def __getitem__(self, index):
return [self.data[index], self.data[index]]
def tostr(data_array):
return " ".join(["%.5lf" % d for d in data_array])
def run_predict(py_reader,
exe,
program,
model_dict,
log_per_step=1,
args=None):
if args.input_type == "text":
id2str = np.load(os.path.join(args.graph_path, "id2str.npy"), mmap_mode="r")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
if not os.path.exists(args.output_path):
os.mkdir(args.output_path)
fout = io.open("%s/part-%s" % (args.output_path, trainer_id), "w", encoding="utf8")
batch = 0
for batch_feed_dict in py_reader():
batch += 1
batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run(
program,
feed=batch_feed_dict,
fetch_list=model_dict.outputs)
if batch % log_per_step == 0:
log.info("Predict %s finished" % batch)
for ufs, _, sri in zip(batch_usr_feat, batch_ad_feat, batch_src_real_index):
if args.input_type == "text":
sri = id2str[int(sri)]
line = "{}\t{}\n".format(sri, tostr(ufs))
fout.write(line)
fout.close()
def _warmstart(exe, program, path='params'):
def _existed_persitables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
exe,
path,
main_program=program,
predicate=_existed_persitables
)
def main(config):
model = Model.factory(config)
if config.learner_type == "cpu":
place = F.CPUPlace()
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
else:
raise ValueError
exe = F.Executor(place)
val_program = F.default_main_program().clone(for_test=True)
exe.run(F.default_startup_program())
_warmstart(exe, F.default_startup_program(), path=config.infer_model)
num_threads = int(os.getenv("CPU_NUM", 1))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
build_strategy = F.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.remove_unnecessary_lock = False
build_strategy.memory_optimize = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
val_compiled_prog = F.compiler.CompiledProgram(
val_program).with_data_parallel(
build_strategy=build_strategy,
exec_strategy=exec_strategy)
num_nodes = int(np.load(os.path.join(config.graph_path, "num_nodes.npy")))
predict_data = PredictData(num_nodes)
predict_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.infer_batch_size,
data=predict_data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="predict",
graph_data_path=config.graph_path,
shuffle=False)
if config.learner_type == "cpu":
model.data_loader.decorate_batch_generator(
predict_iter, places=F.cpu_places())
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
model.data_loader.decorate_batch_generator(
predict_iter, places=place)
else:
raise ValueError
run_predict(model.data_loader,
program=val_compiled_prog,
exe=exe,
model_dict=model,
args=config)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
unset http_proxy https_proxy
set -x
mode=${1:-local}
config=${2:-"./config.yaml"}
function parse_yaml {
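# flatten YAML keys into shell variables for eval, e.g. "lr: 0.1" becomes lr="0.1"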
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
eval $(parse_yaml $config)
export CPU_NUM=$CPU_NUM
export FLAGS_rpc_deadline=3000000
export FLAGS_rpc_retry_times=1000
if [[ $async_mode == "True" ]];then
echo "async_mode is True"
else
export FLAGS_communicator_send_queue_size=1
export FLAGS_communicator_min_send_grad_num_before_recv=0
export FLAGS_communicator_max_merge_var_num=1 # important!
export FLAGS_communicator_merge_sparse_grad=0
fi
export FLAGS_communicator_recv_wait_times=5000000
mkdir -p output
python ./train.py --conf $config
if [[ $TRAINING_ROLE == "TRAINER" ]];then
python ./infer.py --conf $config
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
from pgl.utils.logger import log
import paddle.fluid as F
import paddle.fluid.layers as L
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import StrategyFactory
from paddle.fluid.incubate.fleet.collective import DistributedStrategy
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.collective import fleet as cfleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as tfleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from tensorboardX import SummaryWriter
class Learner(object):
@classmethod
def factory(cls, name):
if name == "cpu":
return TranspilerLearner()
elif name == "gpu":
return CollectiveLearner()
else:
raise ValueError
def build(self, model, data_gen, config):
raise NotImplementedError
def warmstart(self, program, path='./checkpoints'):
def _existed_persitables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
self.exe,
path,
main_program=program,
predicate=_existed_persitables
)
def start(self):
batch = 0
start = time.time()
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
if trainer_id == 0:
writer = SummaryWriter(os.path.join(self.config.output_path, "train_history"))
for epoch_idx in range(self.config.epoch):
for idx, batch_feed_dict in enumerate(self.model.data_loader()):
try:
cpu_time = time.time()
batch += 1
batch_loss = self.exe.run(
self.program,
feed=batch_feed_dict,
fetch_list=[self.model.loss])
end = time.time()
if trainer_id == 0:
writer.add_scalar("loss", np.mean(batch_loss), batch)
if batch % self.config.log_per_step == 0:
log.info(
"Epoch %s Batch %s %s-Loss %s \t Speed(per batch) %.5lf/%.5lf sec"
% (epoch_idx, batch, "train", np.mean(batch_loss), (end - start) /batch, (end - cpu_time)))
writer.flush()
if batch % self.config.save_per_step == 0:
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, str(batch)))
except Exception as e:
log.info("Pyreader train error")
log.exception(e)
log.info("epcoh %s done." % epoch_idx)
def stop(self):
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, "last"))
class TranspilerLearner(Learner):
def __init__(self):
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
paddle_role = role_maker.Role.WORKER
place = F.CPUPlace()
if training_role == "PSERVER":
paddle_role = role_maker.Role.SERVER
# set the fleet runtime environment according to configure
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = eplist # ip:port,ip:port...
worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
role = role_maker.UserDefinedRoleMaker(
current_id=trainer_id,
role=paddle_role,
worker_num=worker_num,
server_endpoints=pserver_endpoints)
tfleet.init(role)
tfleet.save_on_pserver = True
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.init_and_run_ps_worker(config.ckpt_path)
self.program = self.complie_program(model.loss)
self.fleet = tfleet
model.data_loader.decorate_batch_generator(
data_gen, places=F.cpu_places())
self.config = config
self.model = model
def optimize(self, loss, optimizer_type, lr):
strategy = DistributeTranspilerConfig()
strategy.sync_mode = False
log.info('learning rate:%f' % lr)
if optimizer_type == "sgd":
optimizer = F.optimizer.SGD(learning_rate=lr)
elif optimizer_type == "adam":
# Don't slice the tensor, to ensure convergence
optimizer = F.optimizer.Adam(learning_rate=lr, lazy_mode=True)
else:
raise ValueError("Unknown Optimizer %s" % optimizer_type)
#create the DistributeTranspiler configure
optimizer = tfleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
def init_and_run_ps_worker(self, ckpt_path):
# init and run server or worker
self.exe = F.Executor(F.CPUPlace())
if tfleet.is_server():
tfleet.init_server()
self.warmstart(tfleet.startup_program, path=ckpt_path)
tfleet.run_server()
exit()
if tfleet.is_worker():
log.info("start init worker done")
tfleet.init_worker()
self.exe.run(tfleet.startup_program)
def complie_program(self, loss):
num_threads = int(os.getenv("CPU_NUM", 1))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
exec_strategy.use_thread_barrier = False
build_strategy = F.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.remove_unnecessary_lock = False
build_strategy.memory_optimize = False
build_strategy.async_mode = False
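# with multiple CPU threads, aggregate gradients with the Reduce strategy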
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
log.info("start build compile program...")
compiled_prog = F.compiler.CompiledProgram(tfleet.main_program
).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_prog
class CollectiveLearner(Learner):
def __init__(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
cfleet.init(role)
def optimize(self, loss, optimizer_type, lr):
optimizer = F.optimizer.Adam(learning_rate=lr)
dist_strategy = DistributedStrategy()
optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
_, param_grads = optimizer.minimize(loss, F.default_startup_program())
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.program = cfleet.main_program
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
self.exe = F.Executor(place)
self.exe.run(F.default_startup_program())
self.warmstart(F.default_startup_program(), config.ckpt_path)
self.fleet = cfleet
model.data_loader.decorate_batch_generator(
data_gen, places=place)
self.config = config
self.model = model
#!/bin/bash
set -x
config=${1:-"./config.yaml"}
unset http_proxy https_proxy
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
transpiler_local_train(){
export PADDLE_TRAINERS_NUM=1
export PADDLE_PSERVERS_NUM=1
export PADDLE_PORT=6206
export PADDLE_PSERVERS="127.0.0.1"
export BASE="./local_dir"
echo `which python`
if [ -d ${BASE} ]; then
rm -rf ${BASE}
fi
mkdir ${BASE}
rm job_id
for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
do
echo "start ps server: ${i}"
TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh local $config \
&> $BASE/pserver.$i.log &
echo $! >> job_id
done
sleep 3s
for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
do
echo "start ps work: ${j}"
TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \
&> $BASE/worker.$j.log &  # assumed log path, mirroring the pserver loop above
echo $! >> job_id
done
}
collective_local_train(){
export PATH=./python27-gcc482-gpu/bin/:$PATH
echo `which python`
python -m paddle.distributed.launch train.py --conf $config
python -m paddle.distributed.launch infer.py --conf $config
}
eval $(parse_yaml $config)
unalias python
python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \
-l $max_seqlen --vocab_file $ernie_vocab_file
if [[ $learner_type == "cpu" ]];then
transpiler_local_train
fi
if [[ $learner_type == "gpu" ]];then
collective_local_train
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import glob
import os
import numpy as np
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models import message_passing
def get_layer(layer_type, gw, feature, hidden_size, act, initializer, learning_rate, name, is_test=False):
return getattr(message_passing, layer_type)(gw, feature, hidden_size, act, initializer, learning_rate, name)
class BaseGraphWrapperBuilder(object):
def __init__(self, config):
self.config = config
self.node_feature_info = []
self.edge_feature_info = []
def __call__(self):
place = F.CPUPlace()
graph_wrappers = []
for i in range(self.config.num_layers):
# all graphs share the same node_feat_info
graph_wrappers.append(
pgl.graph_wrapper.GraphWrapper(
"layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
return graph_wrappers
class GraphsageGraphWrapperBuilder(BaseGraphWrapperBuilder):
def __init__(self, config):
super(GraphsageGraphWrapperBuilder, self).__init__(config)
self.node_feature_info.append(('index', [None], np.dtype('int64')))
class BaseGNNModel(object):
def __init__(self, config):
self.config = config
self.graph_wrapper_builder = self.gen_graph_wrapper_builder(config)
self.net_fn = self.gen_net_fn(config)
self.feed_list_builder = self.gen_feed_list_builder(config)
self.data_loader_builder = self.gen_data_loader_builder(config)
self.loss_fn = self.gen_loss_fn(config)
self.build()
def gen_graph_wrapper_builder(self, config):
return GraphsageGraphWrapperBuilder(config)
def gen_net_fn(self, config):
return BaseNet(config)
def gen_feed_list_builder(self, config):
return BaseFeedListBuilder(config)
def gen_data_loader_builder(self, config):
return BaseDataLoaderBuilder(config)
def gen_loss_fn(self, config):
return BaseLoss(config)
def build(self):
self.graph_wrappers = self.graph_wrapper_builder()
self.inputs, self.outputs = self.net_fn(self.graph_wrappers)
self.feed_list = self.feed_list_builder(self.inputs, self.graph_wrappers)
self.data_loader = self.data_loader_builder(self.feed_list)
self.loss = self.loss_fn(self.outputs)
class BaseFeedListBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, inputs, graph_wrappers):
feed_list = []
for i in range(len(graph_wrappers)):
feed_list.extend(graph_wrappers[i].holder_list)
feed_list.extend(inputs)
return feed_list
class BaseDataLoaderBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, feed_list):
data_loader = F.io.PyReader(
feed_list=feed_list, capacity=20, use_double_buffer=True, iterable=True)
return data_loader
class BaseNet(object):
def __init__(self, config):
self.config = config
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def build_inputs(self):
user_index = L.data(
"user_index", shape=[None], dtype="int64", append_batch_size=False)
item_index = L.data(
"item_index", shape=[None], dtype="int64", append_batch_size=False)
return [user_index, item_index]
def build_embedding(self, graph_wrappers, inputs=None):
num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
is_sparse = self.config.trainer_type == "Transpiler"
embed = L.embedding(
input=L.reshape(graph_wrappers[0].node_feat['index'], [-1, 1]),
size=[num_embed, self.config.hidden_size],
is_sparse=is_sparse,
param_attr=F.ParamAttr(name="node_embedding", initializer=F.initializer.Uniform(
low=-1. / self.config.hidden_size,
high=1. / self.config.hidden_size)))
return embed
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = get_layer(
self.config.layer_type,
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs)
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class BaseLoss(object):
def __init__(self, config):
self.config = config
def __call__(self, outputs):
user_feat, item_feat = outputs[0], outputs[1]
loss_type = self.config.loss_type
# Calc Loss
if self.config.loss_type == "hinge":
pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
neg = L.matmul(user_feat, item_feat, transpose_y=True) # [B, B]
loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
elif self.config.loss_type == "softmax":
pass
# TODO
# pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
# neg = L.matmul(user_feat, neg_feat, transpose_y=True) # [B, B]
# logits = L.concat([pos, neg], -1) # [B, 1+B]
# labels = L.fill_constant_batch_size_like(logits, [-1, 1], "int64", 0)
# loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, labels))
else:
raise ValueError
return loss
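For intuition, the hinge branch above treats each row's paired item as the positive and every other item in the batch as a negative. A standalone numpy sketch of the same computation (illustrative only, not the training code path):
```python
import numpy as np

def batch_hinge_loss(user_feat, item_feat, margin=0.3):
    # pos[i]: score of user i with its paired item; neg[i, j]: score with item j
    pos = np.sum(user_feat * item_feat, axis=-1, keepdims=True)  # [B, 1]
    neg = user_feat @ item_feat.T                                # [B, B]
    # penalize any in-batch item scoring within `margin` of the positive
    return np.mean(np.maximum(neg - pos + margin, 0.0))

u = np.random.rand(4, 8)
v = np.random.rand(4, 8)
print(batch_hinge_loss(u, v))
```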
"""Ernie
"""
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
# assumed import path for ErnieModel, which this class instantiates below
from models.ernie_model.ernie import ErnieModel
class Ernie(BaseNet):
def build_inputs(self):
inputs = super(Ernie, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
ernie = ErnieModel(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = [feature]
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieModel(BaseGNNModel):
def gen_net_fn(self, config):
return Ernie(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import json
import six
import logging
import paddle.fluid as fluid
import paddle.fluid.layers as L
from io import open
from models.ernie_model.transformer_encoder import encoder, pre_process_layer
from models.ernie_model.transformer_encoder import graph_encoder
log = logging.getLogger(__name__)
class ErnieConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path, 'r', encoding='utf8') as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict.get(key, None)
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class ErnieModel(object):
def __init__(self,
src_ids,
sentence_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
name=""):
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _debug_summary(self, input_mask):
#histogram
seqlen_before_pad = L.cast(
L.reduce_sum(
input_mask, dim=1), dtype='float32')
seqlen_after_pad = L.reduce_sum(
L.cast(
L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1)
pad_num = seqlen_after_pad - seqlen_before_pad
pad_rate = pad_num / seqlen_after_pad
def _build_position_ids(self, src_ids):
d_shape = L.shape(src_ids)
d_seqlen = d_shape[1]
d_batch = d_shape[0]
position_ids = L.reshape(
L.range(
0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1],
inplace=True)
position_ids = L.expand(position_ids, [d_batch, 1, 1])
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_input_mask(self, src_ids):
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.logical_not(L.equal(src_ids,
zero)) # assume pad id == 0
input_mask = L.cast(input_mask, 'float')
input_mask.stop_gradient = True
return input_mask
def _set_config(self, config, name, weight_sharing):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config.get('sent_type_vocab_size'):
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._use_task_id = config['use_task_id']
if self._use_task_id:
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._postprocess_cmd = config.get('postprocess_cmd', 'dan')
self._preprocess_cmd = config.get('preprocess_cmd', '')
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self.name = name
self._word_emb_name = self.name + "word_embedding"
self._pos_emb_name = self.name + "pos_embedding"
self._sent_emb_name = self.name + "sent_embedding"
self._task_emb_name = self.name + "task_embedding"
self._dtype = "float16" if config['use_fp16'] else "float32"
self._emb_dtype = "float32"
# Initialize all weights with the truncated normal initializer; all biases
# are initialized to constant zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
self.all_emb = [emb_out, position_emb_out, sent_emb_out]
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
if self._use_task_id:
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out,
'nd',
self._prepostprocess_dropout,
name=self.name + 'pre_encoder')
if self._dtype == "float16":
emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
return emb_out
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = self._enc_out[:, 0, :]
#next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=self.name + "pooled_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + "pooled_fc.b_0")
return next_sent_feat
def get_lm_output(self, mask_label, mask_pos):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
self.next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = fluid.layers.layer_norm(
mask_trans_feat,
begin_norm_axis=len(mask_trans_feat.shape) - 1,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
# transform: layer norm
#mask_trans_feat = pre_process_layer(
# mask_trans_feat, 'n', name=self.name + 'mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._emb_dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
return mask_lm_loss
def get_task_output(self, task, task_labels):
task_fc_out = fluid.layers.fc(
input=self.next_sent_feat,
size=task["num_labels"],
param_attr=fluid.ParamAttr(
name=self.name + task["task_name"] + "_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + task["task_name"] + "_fc.b_0")
task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
logits=task_fc_out, label=task_labels, return_softmax=True)
task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
return task_loss, task_acc
class ErnieGraphModel(ErnieModel):
def __init__(self,
src_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
slot_seqlen=40,
name=""):
self.slot_seqlen = slot_seqlen
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
sentence_ids = self._build_sentence_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _build_position_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
num_b = (src_seqlen / slot_seqlen) - 1
a_position_ids = L.reshape(
L.range(
0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen * num_b, 1]
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32") # assume pad id == 0 [B, slot_seqlen, 1]
a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1]
b_position_ids = L.reshape(
L.range(
slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1]) # [B, slot_seqlen * num_b, 1]
b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1]
position_ids = L.concat([a_position_ids, b_position_ids], 1)
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_sentence_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
zeros = L.zeros([src_batch, slot_seqlen, 1], "int64")
ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64")
sentence_ids = L.concat([zeros, ones], 1)
sentence_ids.stop_gradient = True
return sentence_ids
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
slot_seqlen=self.slot_seqlen,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import numpy as np
from contextlib import contextmanager
import paddle.fluid as fluid
import paddle.fluid.layers as L
import paddle.fluid.layers as layers
#import propeller.paddle as propeller
#from propeller import log
# determine this at the beginning
to_3d = lambda a: a # will change later
to_2d = lambda a: a
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing the softmax activation, to mask selected positions so that
they will not be considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=len(queries.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=len(keys.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=len(values.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of the input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of the input tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
#trans_x.desc.set_shape((-1, 1, n_head, d_value))
return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
#return out, product
return out, weights
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
q = to_3d(q)
k = to_3d(k)
v = to_3d(v)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads, ctx_multiheads_attn = scaled_dot_product_attention(
q, k, v, attn_bias, d_key, dropout_rate)
out = __combine_heads(ctx_multiheads)
out = to_2d(out)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out, ctx_multiheads_attn
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=len(x.shape) - 1,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=len(hidden.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out,
out,
process_cmd,
dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out is not None else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
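# Added note (illustrative): process_cmd is a string of one-letter ops applied
# in order, e.g. the encoder defaults below are preprocess_cmd="n" (layer norm
# only) and postprocess_cmd="da" (dropout, then add the residual). Since
# pre_process_layer fixes prev_out=None, the "a" op is a no-op there.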
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self) attention followed by a
position-wise feed-forward network, each wrapped with post_process_layer
to add the residual connection, layer normalization and dropout.
"""
attn_output, ctx_multiheads_attn = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
ret = post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
return ret, ctx_multiheads_attn, ffd_output
def build_pad_idx(input_mask):
pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool'))
return pad_idx
def build_attn_bias(input_mask, n_head, dtype):
attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
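# Worked example (illustrative): for one sequence with input_mask
# [[[1.], [1.], [0.]]] (two real tokens, one pad), the outer product is a
# [1, 3, 3] matrix with 1 where both positions are real, and (1 - x) * -10000.
# maps real-real pairs to 0 and any pair touching the pad to -10000, which the
# attention softmax turns into ~zero weight.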
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
input_shape = L.shape(input_mask)
input_batch = input_shape[0]
input_seqlen = input_shape[1]
num_slot = input_seqlen / slot_seqlen
num_b = num_slot - 1
ones = L.ones([num_b], dtype="float32") # [num_b]
diag_ones = L.diag(ones) # [num_b, num_b]
diag_ones = L.unsqueeze(diag_ones, [1, -1]) # [num_b, 1, num_b, 1]
diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen]) # [num_b, seqlen, num_b, seqlen]
diag_ones = L.reshape(diag_ones, [1, num_b*slot_seqlen, num_b*slot_seqlen]) # [1, num_b*seqlen, num_b*seqlen]
graph_attn_bias = L.concat([L.ones([1, num_b*slot_seqlen, slot_seqlen], dtype="float32"), diag_ones], 2)
graph_attn_bias = L.concat([L.ones([1, slot_seqlen, num_slot*slot_seqlen], dtype="float32"), graph_attn_bias], 1) # [1, seq, seq]
pad_attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = graph_attn_bias * pad_attn_bias
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
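# Added note: viewing the sequence as num_slot blocks of slot_seqlen tokens,
# the mask built above lets the first slot attend to every slot, while each of
# the remaining num_b slots attends only to the first slot and to itself (the
# block-diagonal diag_ones part); multiplying by pad_attn_bias then drops
# padded positions exactly as in build_attn_bias.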
def encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
#global to_2d, to_3d #, batch, seqlen, dynamic_dim
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype)
# d_batch = d_shape[0]
# d_seqlen = d_shape[1]
# pad_idx = L.where(
# L.cast(L.reshape(input_mask, [d_batch, d_seqlen]), 'bool'))
# attn_bias = L.matmul(
# input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
# attn_bias = (1. - attn_bias) * -10000.
# attn_bias = L.stack([attn_bias] * n_head, 1)
# if attn_bias.dtype != enc_input.dtype:
# attn_bias = L.cast(attn_bias, enc_input.dtype)
# def to_2d(t_3d):
# t_2d = L.gather_nd(t_3d, pad_idx)
# return t_2d
# def to_3d(t_2d):
# t_3d = L.scatter_nd(
# pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
# return t_3d
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
return enc_output, all_hidden, all_attn, all_ffn
def graph_encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
slot_seqlen=40,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen)
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
return enc_output, all_hidden, all_attn, all_ffn
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
class ErnieSageV1(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV1, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
ernie = ErnieModel(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV1(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV1(config)
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
class ErnieSageV2(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV2, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_send(src_feat, dst_feat, edge_feat):
"""doc"""
cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1)
src_ids = L.concat([cls, src_feat["term_ids"]], 1)
dst_ids = dst_feat["term_ids"]
sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
term_ids = L.concat([src_ids, dst_ids], 1)
term_ids.stop_gradient = True
sent_ids.stop_gradient = True
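# Added note: the tensors built above form one two-segment ERNIE input per
# edge: [CLS] + src tokens (sentence id 0) followed by dst tokens (sentence
# id 1), so the pooled output below encodes the (src, dst) pair jointly
# rather than each endpoint in isolation.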
ernie = ErnieModel(
term_ids, sent_ids,
config=self.config.ernie_config)
feature = ernie.get_pooled_output()
return feature
def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
feature = L.unsqueeze(feature, [-1])
msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)])
neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))
term_ids = feature
cls = L.fill_constant_batch_size_like(term_ids, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, term_ids], 1)
term_ids.stop_gradient = True
ernie = ErnieModel(
term_ids, L.zeros_like(term_ids),
config=self.config.ernie_config)
self_feature = ernie.get_pooled_output()
self_feature = L.fc(self_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=learning_rate),
)
neigh_feature = L.fc(neigh_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_r",
learning_rate=learning_rate),
)
output = L.concat([self_feature, neigh_feature], axis=1)
output = L.l2_normalize(output, axis=1)
return output
return erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % ("erniesage_v2", i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV2(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV2(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
from models.message_passing import copy_send
class ErnieSageV3(BaseNet):
def __init__(self, config):
super(ErnieSageV3, self).__init__(config)
self.config.layer_type = "ernie_recv_sum"
def build_inputs(self):
inputs = super(ErnieSageV3, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_recv(feat):
"""doc"""
# TODO maxlen 400
pad_value = L.zeros([1], "int64")
out, _ = L.sequence_pad(feat, pad_value=pad_value, maxlen=10)
out = L.reshape(out, [0, 400])
return out
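# Added note (assumption): sequence_pad keeps at most 10 neighbor rows per
# node and the reshape flattens them to a fixed width of 400, which appears
# to assume 10 neighbors of 40 token ids each (cf. the TODO above); the
# hard-coded 400 would need to change with max_seqlen.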
def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, ernie_recv)
neigh_feature = L.cast(L.unsqueeze(neigh_feature, [-1]), "int64")
feature = L.unsqueeze(feature, [-1])
cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, feature[:, :-1], neigh_feature], 1)
term_ids.stop_gradient = True
return term_ids
return erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
ernie_config = self.config.ernie_config
ernie = ErnieGraphModel(
src_ids=feat,
config=ernie_config,
slot_seqlen=self.config.max_seqlen,
name="student_")
feat = ernie.get_pooled_output()
fc_lr = self.config.lr / 0.001
feat= L.fc(feat,
self.config.hidden_size,
act="relu",
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=fc_lr),
)
feat = L.l2_normalize(feat, axis=1)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV3(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV3(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as L
def copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"]
def weighted_copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"] * edge_feat["weight"]
def mean_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="average")
def sum_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
def max_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="max")
def lstm_recv(feat):
"""doc"""
hidden_dim = 128
forward, _ = fluid.layers.dynamic_lstm(
input=feat, size=hidden_dim * 4, use_peepholes=False)
output = fluid.layers.sequence_last_step(forward)
return output
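# Usage sketch (illustrative, not part of the original file): each send/recv
# pair plugs into a pgl graph wrapper, e.g.
#   msg = gw.send(copy_send, nfeat_list=[("h", feature)])
#   neigh_feature = gw.recv(msg, sum_recv)
# which is exactly the pattern the aggregators below follow.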
def graphsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def graphsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
from models.base import BaseGNNModel
from models.ernie import ErnieModel
from models.erniesage_v1 import ErnieSageModelV1
from models.erniesage_v2 import ErnieSageModelV2
from models.erniesage_v3 import ErnieSageModelV3
class Model(object):
@classmethod
def factory(cls, config):
name = config.model_type
if name == "BaseGNNModel":
return BaseGNNModel(config)
if name == "ErnieModel":
return ErnieModel(config)
if name == "ErnieSageModelV1":
return ErnieSageModelV1(config)
if name == "ErnieSageModelV2":
return ErnieSageModelV2(config)
if name == "ErnieSageModelV3":
return ErnieSageModelV3(config)
else:
raise ValueError("unknown model_type: %s" % name)
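# Usage sketch (illustrative): with a config whose model_type field names one
# of the classes above, e.g. config.model_type == "ErnieSageModelV2",
#   model = Model.factory(config)
# returns the matching model instance, as done in the training script's main()
# later in this dump.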
#!/usr/bin/env python
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
# File: dump_graph.py
# Author: suweiyue(suweiyue@baidu.com)
# Date: 2020/03/01 22:17:13
#
########################################################################
"""
Dump a tab-separated edge file into PGL graph data (edges, alias sampling
tables, negative samples) and tokenized node features for ErnieSage.
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
#from __future__ import unicode_literals
import io
import os
import sys
import argparse
import logging
import multiprocessing
from functools import partial
from io import open
import numpy as np
import tqdm
import pgl
from pgl.graph_kernel import alias_sample_build_table
from pgl.utils.logger import log
from tokenization import FullTokenizer
def term2id(string, tokenizer, max_seqlen):
string = string.split("\t")[1]
tokens = tokenizer.tokenize(string)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids = ids[:max_seqlen-1]
ids = ids + [2] # ids + [sep]
ids = ids + [0] * (max_seqlen - len(ids))
return ids
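# Example (illustrative): with max_seqlen=8 and the line "3\thello world",
# the text after the first tab is tokenized, the token ids are truncated to
# max_seqlen-1 = 7, the [SEP] id (2) is appended, and the result is
# zero-padded to length 8.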
def dump_graph(args):
if not os.path.exists(args.outpath):
os.makedirs(args.outpath)
neg_samples = []
str2id = dict()
term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding)
terms = []
count = 0
with io.open(args.inpath, encoding=args.encoding) as f:
edges = []
for idx, line in enumerate(f):
if idx % 100000 == 0:
log.info("%s readed %s lines" % (args.inpath, idx))
slots = []
for col_idx, col in enumerate(line.strip("\n").split("\t")):
s = col[:args.max_seqlen]
if s not in str2id:
str2id[s] = count
count += 1
term_file.write(str(col_idx) + "\t" + col + "\n")
slots.append(str2id[s])
src = slots[0]
dst = slots[1]
neg_samples.append(slots[2:])
edges.append((src, dst))
edges.append((dst, src))
term_file.close()
edges = np.array(edges, dtype="int64")
num_nodes = len(str2id)
str2id.clear()
log.info("building graph...")
graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
indegree = graph.indegree()
graph.outdegree()
graph.dump(args.outpath)
# dump alias sample table
sqrt_indegree = np.sqrt(indegree)
distribution = 1. * sqrt_indegree / sqrt_indegree.sum()
alias, events = alias_sample_build_table(distribution)
np.save(os.path.join(args.outpath, "alias.npy"), alias)
np.save(os.path.join(args.outpath, "events.npy"), events)
np.save(os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples))
log.info("End Build Graph")
def dump_id2str_map(args):
log.info("Dump id2str map starting...")
id2str = np.array([line.strip("\n") for line in open(os.path.join(args.outpath, "terms.txt"), "r", encoding=args.encoding)])
np.save(os.path.join(args.outpath, "id2str.npy"), id2str)
log.info("Dump id2str map done.")
def dump_node_feat(args):
log.info("Dump node feat starting...")
id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
pool = multiprocessing.Pool()
tokenizer = FullTokenizer(args.vocab_file)
term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
log.info("Dump node feat done.")
pool.terminate()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("-i", "--inpath", type=str, default=None)
parser.add_argument("-l", "--max_seqlen", type=int, default=30)
parser.add_argument("--vocab_file", type=str, default="./vocab.txt")
parser.add_argument("--encoding", type=str, default="utf8")
parser.add_argument("-o", "--outpath", type=str, default=None)
args = parser.parse_args()
dump_graph(args)
dump_id2str_map(args)
dump_node_feat(args)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import sentencepiece as sp
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file, 'rb')
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
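# Expected vocab format (illustrative): one token per line, optionally with a
# tab-separated explicit id; when the id column is absent the line number is
# used, e.g. "[PAD]" on line 0 gets id 0 while "[UNK]\t1" gets id 1.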
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"):
output = []
for token in tokens:
if token in vocab:
output.append(vocab[token])
else:
output.append(vocab[unk_token])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class SentencepieceTokenizer(object):
"""Runs SentencePiece tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.do_lower_case = do_lower_case
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.sp_unk_token = "<unk>"
self.unk_token = unk_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text.replace("\1", " "))
tokens = self.tokenizer.EncodeAsPieces(text)
output_tokens = []
for token in tokens:
if token == self.sp_unk_token:
token = self.unk_token
if token in self.vocab:
output_tokens.append(token)
else:
output_tokens.append(self.unk_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordsegTokenizer(object):
"""Runs Wordseg tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]",
split_token="\1"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.do_lower_case = do_lower_case
self.unk_token = unk_token
self.split_token = split_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text)
output_tokens = []
for token in text.split(self.split_token):
if token in self.vocab:
output_tokens.append(token)
else:
sp_tokens = self.tokenizer.EncodeAsPieces(token)
for sp_token in sp_tokens:
if sp_token in self.vocab:
output_tokens.append(sp_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import traceback
import yaml
import numpy as np
from easydict import EasyDict as edict
from pgl.utils.logger import log
from pgl.utils import paddle_helper
from learner import Learner
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class TrainData(object):
def __init__(self, graph_path):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
log.info("trainer_id: %s, trainer_count: %s." % (trainer_id, trainer_count))
edges = np.load(os.path.join(graph_path, "edges.npy"), allow_pickle=True)
# edges are bidirectional (dump_graph wrote each edge in both directions).
train_usr = edges[trainer_id::trainer_count, 0]
train_ad = edges[trainer_id::trainer_count, 1]
returns = {
"train_data": [train_usr, train_ad]
}
if os.path.exists(os.path.join(graph_path, "neg_samples.npy")):
neg_samples = np.load(os.path.join(graph_path, "neg_samples.npy"), allow_pickle=True)
if neg_samples.size != 0:
train_negs = neg_samples[trainer_id::trainer_count]
returns["train_data"].append(train_negs)
log.info("Load train_data done.")
self.data = returns
def __getitem__(self, index):
return [ data[index] for data in self.data["train_data"]]
def __len__(self):
return len(self.data["train_data"][0])
def main(config):
# Select Model
model = Model.factory(config)
# Build Train Edges
data = TrainData(config.graph_path)
# Build Train Data
train_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.batch_size,
data=data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="train",
graph_data_path=config.graph_path,
shuffle=True)
log.info("build graph reader done.")
learner = Learner.factory(config.learner_type)
learner.build(model, train_iter, config)
learner.start()
learner.stop()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the dataset for GIN model.
"""
import os
import sys
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pgl
from pgl.utils.logger import log
def fold10_split(dataset, fold_idx=0, seed=0, shuffle=True):
"""10 fold splitter"""
assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
labels = []
for i in range(len(dataset)):
g, c = dataset[i]
labels.append(c)
idx_list = []
for idx in skf.split(np.zeros(len(labels)), labels):
idx_list.append(idx)
train_idx, valid_idx = idx_list[fold_idx]
log.info("train_set : test_set == %d : %d" %
(len(train_idx), len(valid_idx)))
return Subset(dataset, train_idx), Subset(dataset, valid_idx)
def random_split(dataset, split_ratio=0.7, seed=0, shuffle=True):
"""random splitter"""
np.random.seed(seed)
indices = list(range(len(dataset)))
np.random.shuffle(indices)
split = int(split_ratio * len(dataset))
train_idx, valid_idx = indices[:split], indices[split:]
log.info("train_set : test_set == %d : %d" %
(len(train_idx), len(valid_idx)))
return Subset(dataset, train_idx), Subset(dataset, valid_idx)
class BaseDataset(object):
"""BaseDataset"""
def __init__(self):
pass
def __getitem__(self, idx):
"""getitem"""
raise NotImplementedError
def __len__(self):
"""len"""
raise NotImplementedError
class Subset(BaseDataset):
"""
Subset of a dataset at specified indices.
"""
def __init__(self, dataset, indices):
self.dataset = dataset
self.indices = indices
def __getitem__(self, idx):
"""getitem"""
return self.dataset[self.indices[idx]]
def __len__(self):
"""len"""
return len(self.indices)
class GINDataset(BaseDataset):
"""Dataset for Graph Isomorphism Network (GIN)
Adapted from https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip.
"""
def __init__(self,
data_path,
dataset_name,
self_loop,
degree_as_nlabel=False):
self.data_path = data_path
self.dataset_name = dataset_name
self.self_loop = self_loop
self.degree_as_nlabel = degree_as_nlabel
self.graph_list = []
self.glabel_list = []
# relabel
self.glabel_dict = {}
self.nlabel_dict = {}
self.elabel_dict = {}
self.ndegree_dict = {}
# global num
self.num_graph = 0  # total number of graphs
self.n = 0  # total number of nodes
self.m = 0  # total number of edges
# global num of classes
self.gclasses = 0
self.nclasses = 0
self.eclasses = 0
self.dim_nfeats = 0
# flags
self.degree_as_nlabel = degree_as_nlabel
self.nattrs_flag = False
self.nlabels_flag = False
self._load_data()
def __len__(self):
"""return the number of graphs"""
return len(self.graph_list)
def __getitem__(self, idx):
"""getitem"""
return self.graph_list[idx], self.glabel_list[idx]
def _load_data(self):
"""Loads dataset
"""
filename = os.path.join(self.data_path, self.dataset_name,
"%s.txt" % self.dataset_name)
log.info("loading data from %s" % filename)
with open(filename, 'r') as reader:
# first line --> N, means total number of graphs
self.num_graph = int(reader.readline().strip())
for i in range(self.num_graph):
if (i + 1) % int(self.num_graph / 10) == 0:
log.info("processing graph %s" % (i + 1))
graph = dict()
# second line --> [num_node, label]
# means [node number of a graph, class label of a graph]
grow = reader.readline().strip().split()
n_nodes, glabel = [int(w) for w in grow]
# relabel graphs
if glabel not in self.glabel_dict:
mapped = len(self.glabel_dict)
self.glabel_dict[glabel] = mapped
graph['num_nodes'] = n_nodes
self.glabel_list.append(self.glabel_dict[glabel])
nlabels = []
node_features = []
num_edges = 0
edges = []
for j in range(graph['num_nodes']):
slots = reader.readline().strip().split()
# handle edges and node features (if present)
tmp = int(slots[
1]) + 2 # tmp == 2 + num_edges of current node
if tmp == len(slots):
# no node feature
nrow = [int(w) for w in slots]
nfeat = None
elif tmp < len(slots):
nrow = [int(w) for w in slots[:tmp]]
nfeat = [float(w) for w in slots[tmp:]]
node_features.append(nfeat)
else:
raise Exception('edge number is not correct!')
# relabel nodes if the dataset has node labels
# if it doesn't have node labels, then every nrow[0] == 0
if not nrow[0] in self.nlabel_dict:
mapped = len(self.nlabel_dict)
self.nlabel_dict[nrow[0]] = mapped
nlabels.append(self.nlabel_dict[nrow[0]])
num_edges += nrow[1]
edges.extend([(j, u) for u in nrow[2:]])
if self.self_loop:
num_edges += 1
edges.append((j, j))
if node_features != []:
node_features = np.stack(node_features)
graph['attr'] = node_features
self.nattrs_flag = True
else:
node_features = None
graph['attr'] = node_features
graph['nlabel'] = np.array(
nlabels, dtype="int64").reshape(-1, 1)
if len(self.nlabel_dict) > 1:
self.nlabels_flag = True
graph['edges'] = edges
assert num_edges == len(edges)
g = pgl.graph.Graph(
num_nodes=graph['num_nodes'],
edges=graph['edges'],
node_feat={
'nlabel': graph['nlabel'],
'attr': graph['attr']
})
self.graph_list.append(g)
# update statistics of graphs
self.n += graph['num_nodes']
self.m += num_edges
# if no attr
if not self.nattrs_flag:
log.info('there are no node features in this dataset!')
label2idx = {}
# generate node attr by node degree
if self.degree_as_nlabel:
log.info('generate node features by node degree...')
nlabel_set = set([])
for g in self.graph_list:
g.node_feat['nlabel'] = g.indegree()
# extracting unique node labels
nlabel_set = nlabel_set.union(set(g.node_feat['nlabel']))
g.node_feat['nlabel'] = g.node_feat['nlabel'].reshape(-1,
1)
nlabel_set = list(nlabel_set)
# in case the labels/degrees are not continuous number
self.ndegree_dict = {
nlabel_set[i]: i
for i in range(len(nlabel_set))
}
label2idx = self.ndegree_dict
# generate node attr by node label
else:
log.info('generate node features by node label...')
label2idx = self.nlabel_dict
for g in self.graph_list:
attr = np.zeros((g.num_nodes, len(label2idx)))
idx = [
label2idx[tag]
for tag in g.node_feat['nlabel'].reshape(-1, )
]
attr[np.arange(g.num_nodes), idx] = 1  # one-hot: row i marks node i's label
g.node_feat['attr'] = attr.astype("float32")
# after load, get the #classes and #dim
self.gclasses = len(self.glabel_dict)
self.nclasses = len(self.nlabel_dict)
self.eclasses = len(self.elabel_dict)
self.dim_nfeats = len(self.graph_list[0].node_feat['attr'][0])
message = "finished loading data\n"
message += """
num_graph: %d
num_graph_class: %d
total_num_nodes: %d
node Classes: %d
node_features_dim: %d
num_edges: %d
edge_classes: %d
Avg. of #Nodes: %.2f
Avg. of #Edges: %.2f
Graph Relabeled: %s
Node Relabeled: %s
Degree Relabeled(If degree_as_nlabel=True): %s""" % (
self.num_graph,
self.gclasses,
self.n,
self.nclasses,
self.dim_nfeats,
self.m,
self.eclasses,
self.n / self.num_graph,
self.m / self.num_graph,
self.glabel_dict,
self.nlabel_dict,
self.ndegree_dict, )
log.info(message)
if __name__ == "__main__":
gindataset = GINDataset(
"./dataset/", "MUTAG", self_loop=True, degree_as_nlabel=False)
# Graph Isomorphism Network (GIN)
[Graph Isomorphism Network \(GIN\)](https://arxiv.org/pdf/1810.00826.pdf) is a simple graph neural network designed to be as expressive as the Weisfeiler-Lehman graph isomorphism test. Based on PGL, we reproduce the GIN model.
### Datasets
The dataset can be downloaded from [here](https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip).
After downloading the data, uncompress it; a directory named `./dataset/` will then appear in the current directory. Note that the current directory is the root directory of the GIN model.
### Dependencies
- paddlepaddle >= 1.6
- pgl 1.0.2
### How to run
For example, to train the GIN model on the MUTAG dataset with a GPU:
```
python main.py --use_cuda --dataset_name MUTAG --data_path ./dataset
```
### Hyperparameters
- data\_path: the root path of your dataset
- dataset\_name: the name of the dataset
- fold\_idx: the $fold\_idx^{th}$ fold of the split dataset. Here we use 10-fold cross-validation
- train\_eps: whether the $\epsilon$ parameter is learnable; a combined example command is shown below
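For instance, a sketch of a run combining these options (all flags are defined in `main.py`; the values here are only illustrative):
```
python main.py --use_cuda --dataset_name MUTAG --data_path ./dataset --fold_idx 2 --train_eps --lr 0.005
```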
### Experiment results (Accuracy)
| |MUTAG | COLLAB | IMDBBINARY | IMDBMULTI |
|--|-------------|----------|------------|-----------------|
|PGL result | 90.8 | 78.6 | 76.8 | 50.8 |
|paper result |90.0 | 80.0 | 75.1 | 52.3 |
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the graph dataloader.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import os
import sys
import time
import argparse
import numpy as np
import collections
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils import mp_reader
from pgl.utils.logger import log
def batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
perm = np.arange(size)
np.random.shuffle(perm)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
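# Added note: with several reader workers, worker fid keeps only the batches
# where cc % num_workers == fid, so the workers partition the shuffled batches
# among themselves without any coordination.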
def scan_batch_iter(data, batch_size, fid, num_workers):
"""scan_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
class GraphDataloader(object):
"""Graph Dataloader
"""
def __init__(
self,
dataset,
batch_size,
seed=0,
num_workers=1,
buf_size=1000,
shuffle=True, ):
self.shuffle = shuffle
self.seed = seed
self.num_workers = num_workers
self.buf_size = buf_size
self.batch_size = batch_size
self.dataset = dataset
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
graphs = [b[0] for b in batch_examples]
labels = [b[1] for b in batch_examples]
join_graph = pgl.graph.MultiGraph(graphs)
labels = np.array(labels, dtype="int64").reshape(-1, 1)
return join_graph, labels
def batch_iter(self, fid):
"""batch_iter"""
if self.shuffle:
for batch in batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
def __len__(self):
"""__len__"""
return len(self.dataset)
def __getitem__(self, idx):
"""__getitem__"""
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.dataset[idx]
def __iter__(self):
"""__iter__"""
def worker(filter_id):
def func_run():
for batch_examples in self.batch_iter(filter_id):
batch_dict = self.batch_fn(batch_examples)
yield batch_dict
return func_run
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0), self.buf_size)
else:
worker_pool = [worker(wid) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
"""scan"""
for example in self.dataset:
yield example
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the training process of GIN model.
"""
import os
import sys
import time
import argparse
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils.logger import log
from Dataset import GINDataset, fold10_split, random_split
from dataloader import GraphDataloader
from model import GINModel
def main(args):
"""main function"""
dataset = GINDataset(
args.data_path,
args.dataset_name,
self_loop=not args.train_eps,
degree_as_nlabel=True)
train_dataset, test_dataset = fold10_split(
dataset, fold_idx=args.fold_idx, seed=args.seed)
train_loader = GraphDataloader(train_dataset, batch_size=args.batch_size)
test_loader = GraphDataloader(
test_dataset, batch_size=args.batch_size, shuffle=False)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"gw", place=place, node_feat=dataset[0][0].node_feat_info())
model = GINModel(args, gw, dataset.gclasses)
model.forward()
infer_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
epoch_step = int(len(train_dataset) / args.batch_size) + 1
boundaries = [
i
for i in range(50 * epoch_step, args.epochs * epoch_step,
epoch_step * 50)
]
values = [args.lr * 0.5**i for i in range(0, len(boundaries) + 1)]
lr = fl.piecewise_decay(boundaries=boundaries, values=values)
train_op = fluid.optimizer.Adam(lr).minimize(model.loss)
exe = fluid.Executor(place)
exe.run(startup_program)
# train and evaluate
global_step = 0
for epoch in range(1, args.epochs + 1):
for idx, batch_data in enumerate(train_loader):
g, labels = batch_data
feed_dict = gw.to_feed(g)
feed_dict['labels'] = labels
ret_loss, ret_lr, ret_acc = exe.run(
train_program,
feed=feed_dict,
fetch_list=[model.loss, lr, model.acc])
global_step += 1
if global_step % 10 == 0:
message = "epoch %d | step %d | " % (epoch, global_step)
message += "lr %.6f | loss %.6f | acc %.4f" % (
ret_lr, ret_loss, ret_acc)
log.info(message)
# evaluate
result = evaluate(exe, infer_program, model, gw, test_loader)
message = "evaluating result"
for key, value in result.items():
message += " | %s %.6f" % (key, value)
log.info(message)
def evaluate(exe, prog, model, gw, loader):
"""evaluate"""
total_loss = []
total_acc = []
for idx, batch_data in enumerate(loader):
g, labels = batch_data
feed_dict = gw.to_feed(g)
feed_dict['labels'] = labels
ret_loss, ret_acc = exe.run(prog,
feed=feed_dict,
fetch_list=[model.loss, model.acc])
total_loss.append(ret_loss)
total_acc.append(ret_acc)
total_loss = np.mean(total_loss)
total_acc = np.mean(total_acc)
return {"loss": total_loss, "acc": total_acc}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default='./dataset')
parser.add_argument('--dataset_name', type=str, default='MUTAG')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--fold_idx', type=int, default=0)
parser.add_argument('--output_path', type=str, default='./outputs/')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--num_layers', type=int, default=5)
parser.add_argument('--num_mlp_layers', type=int, default=2)
parser.add_argument('--hidden_size', type=int, default=64)
parser.add_argument(
'--pool_type',
type=str,
default="sum",
choices=["sum", "average", "max"])
parser.add_argument('--train_eps', action='store_true')
parser.add_argument('--epochs', type=int, default=350)
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--dropout_prob', type=float, default=0.5)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
log.info(args)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file implement the GIN model.
"""
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.layers.conv import gin
class GINModel(object):
"""GINModel"""
def __init__(self, args, gw, num_class):
self.args = args
self.num_layers = self.args.num_layers
self.hidden_size = self.args.hidden_size
self.train_eps = self.args.train_eps
self.pool_type = self.args.pool_type
self.dropout_prob = self.args.dropout_prob
self.num_class = num_class
self.gw = gw
self.labels = fl.data(name="labels", shape=[None, 1], dtype="int64")
def forward(self):
"""forward"""
features_list = [self.gw.node_feat["attr"]]
for i in range(self.num_layers):
h = gin(self.gw,
features_list[i],
hidden_size=self.hidden_size,
activation="relu",
name="gin_%s" % (i),
init_eps=0.0,
train_eps=self.train_eps)
h = fl.layer_norm(
h,
begin_norm_axis=1,
param_attr=fluid.ParamAttr(
name="norm_scale_%s" % (i),
initializer=fluid.initializer.Constant(1.0)),
bias_attr=fluid.ParamAttr(
name="norm_bias_%s" % (i),
initializer=fluid.initializer.Constant(0.0)), )
h = fl.relu(h)
features_list.append(h)
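# Readout: pool node features from every layer (including the raw input
# features) and sum the per-layer classifier outputs.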
output = 0
for i, h in enumerate(features_list):
pooled_h = pgl.layers.graph_pooling(self.gw, h, self.pool_type)
drop_h = fl.dropout(
pooled_h,
self.dropout_prob,
dropout_implementation="upscale_in_train")
output += fl.fc(drop_h,
size=self.num_class,
act=None,
param_attr=fluid.ParamAttr(name="final_fc_%s" %
(i)))
# calculate loss
self.loss = fl.softmax_with_cross_entropy(output, self.labels)
self.loss = fl.reduce_mean(self.loss)
self.acc = fl.accuracy(fl.softmax(output), self.labels)
# PGL - Knowledge Graph Embedding
## Introduction
This package is mainly for computing node and relation embeddings of knowledge graphs efficiently.
This package reproduces the following knowledge embedding models:
- TransE
- TransR
- RotatE
## Dataset
The WN18 and FB15k datasets were originally published by the TransE paper and can be downloaded [here](https://everest.hds.utc.fr/doku.php?id=en:transe).
## Dependencies
If you want to use PGL-KGE in paddle, please install the following packages.
- paddlepaddle>=1.7
- pgl
## Experiment results
FB15k dataset
| Models |Mean Rank| MRR | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter|
|----------|-------|-------|--------|--------|---------|---------|---------|
| TransE| 214 | -- | -- | -- | 0.491 | 118 | 0.668|
| TransR| 202 | -- | -- | -- | 0.502 | 115 | 0.683|
| RotatE| 156| -- | -- | -- | 0.498 | 52 | 0.710|
WN18 dataset
| Models |Mean Rank| MRR | Hits@1 | Hits@3 | Hits@10 | MR@filter| Hits10@filter|
|----------|-------|-------|--------|--------|---------|---------|---------|
| TransE| 257 | -- | -- | -- | 0.800 | 245 | 0.915|
| TransR| 255 | -- | -- | -- | 0.8012| 243 | 0.9371|
| RotatE| 188 | -- | -- | -- | 0.8325| 176 | 0.9601|
## References
[1]. [TransE: Translating embeddings for modeling multi-relational data.](https://ieeexplore.ieee.org/abstract/document/8047276)
[2]. [TransR: Learning entity and relation embeddings for knowledge graph completion.](http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523)
[3]. [RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space.](https://arxiv.org/abs/1902.10197)
#CUDA_VISIBLE_DEVICES=2 \
#FLAGS_fraction_of_gpu_memory_to_use=0.01 \
#python main.py \
# --use_cuda \
# --model TransE \
# --optimizer adam \
# --batch_size=512 \
# --learning_rate=0.001 \
# --epoch 100 \
# --evaluate_per_iteration 20 \
# --sample_workers 4 \
# --margin 4 \
## #--only_evaluate
#CUDA_VISIBLE_DEVICES=2 \
#FLAGS_fraction_of_gpu_memory_to_use=0.01 \
#python main.py \
# --use_cuda \
# --model RotatE \
# --data_dir ./data/WN18 \
# --optimizer adam \
# --batch_size=512 \
# --learning_rate=0.001 \
# --epoch 100 \
# --evaluate_per_iteration 100 \
# --sample_workers 10 \
# --margin 6 \
# --neg_times 10
CUDA_VISIBLE_DEVICES=2 \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 8 \
--neg_times 10 \
--neg_mode True
# PGL - Knowledge Graph Embedding
This package is mainly for computing node and relation embeddings of knowledge graphs efficiently.
This package reproduces the following knowledge embedding models:
- TransE
- TransR
- RotatE
### Dataset
The WN18 and FB15k datasets were originally published by the TransE paper and can be downloaded [here](https://everest.hds.utc.fr/doku.php?id=en:transe).
FB15k: [https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8](https://drive.google.com/open?id=19I3LqaKjgq-3vOs0us7OgEL06TIs37W8)
WN18: [https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD](https://drive.google.com/open?id=1MXy257ZsjeXQHZScHLeQeVnUTPjltlwD)
### Dependencies
If you want to use PGL-KG in paddle, please install the following packages.
- paddlepaddle>=1.7
- pgl
### Hyperparameters
- use\_cuda: use cuda to train.
- model: pgl-kg model names. Now available for `TransE`, `TransR` and `RotatE`.
- data\_dir: the data path of dataset.
- optimizer: optimizer to run the model.
- batch\_size: batch size.
- learning\_rate: learning rate.
- epoch: epochs to run.
- evaluate\_per\_iteration: evaluate after a certain number of epochs.
- sample\_workers: number of sample workers used to prepare data.
- margin: margin hyper-parameter used by some models.

For more hyper-parameter usage, please refer to `main.py`. We also provide a `run.sh` script to reproduce the performance results (please download the dataset into `./data` and specify the data\_dir parameter).
### How to run
For example, to use a GPU to train the TransR model on the WN18 dataset
(please download the WN18 dataset to the `./data` folder first):
```
python main.py --use_cuda --model TransR --data_dir ./data/WN18
```
We also provide `run.sh` script to reproduce following performance results.
### Experiment results
Here we report the experiment results on the FB15k and WN18 datasets. The evaluation criteria are MR (mean rank), MRR (mean reciprocal rank) and Hits@N (the proportion of correct entities ranked in the top N). The suffix `@f` denotes the filtered setting, in which corrupted triples that already exist in the graph are removed before ranking.
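For reference, with N test triples and rank_i the rank of the i-th true triple among all candidate entities, these criteria follow the standard definitions:
```
\mathrm{MR} = \frac{1}{N}\sum_{i=1}^{N} \mathrm{rank}_i, \qquad
\mathrm{MRR} = \frac{1}{N}\sum_{i=1}^{N} \frac{1}{\mathrm{rank}_i}, \qquad
\mathrm{Hits@}K = \frac{1}{N}\sum_{i=1}^{N} \mathbf{1}\left[\mathrm{rank}_i \le K\right]
```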
FB15k dataset
| Models | MR | MRR | Hits@1 | Hits@3 | Hits@10| MR@f |MRR@f|Hits@1@f|Hits@3@f|Hits@10@f|
|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------|
| TransE | 215 | 0.205 | 0.093 | 0.234 | 0.446 | 74 |0.379| 0.235| 0.453| 0.647 |
| TransR | 304 | 0.193 | 0.092 | 0.211 | 0.418 | 156 |0.366| 0.232| 0.435| 0.623 |
| RotatE | 157 | 0.270 | 0.162 | 0.303 | 0.501 | 53 |0.478| 0.354| 0.547| 0.710 |
WN18 dataset
| Models | MR | MRR | Hits@1 | Hits@3 | Hits@10| MR@f |MRR@f|Hits@1@f|Hits@3@f|Hits@10@f|
|--------|-----|-------|--------|--------|--------|-------|-----|------|------|--------|
| TransE | 219 | 0.338 | 0.082 | 0.523 | 0.800 | 208 |0.463| 0.135| 0.771| 0.932 |
| TransR | 321 | 0.370 | 0.096 | 0.591 | 0.810 | 309 |0.513| 0.158| 0.941| 0.941 |
| RotatE | 167 | 0.623 | 0.476 | 0.688 | 0.830 | 155 |0.915| 0.884| 0.941| 0.957 |
## References
[1]. [TransE: Translating embeddings for modeling multi-relational data.](https://ieeexplore.ieee.org/abstract/document/8047276)
[2]. [TransR: Learning entity and relation embeddings for knowledge graph completion.](http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewFile/9571/9523)
[3]. [RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space.](https://arxiv.org/abs/1902.10197)
data_loader.py:
@@ -19,10 +19,11 @@
 import os
 import numpy as np
 from collections import defaultdict
 from pgl.utils.logger import log
-from pybloom import BloomFilter
+#from pybloom import BloomFilter

-class KBloader:
+class KGLoader:
     """
     load the FB15K
     """
@@ -65,8 +66,9 @@ class KBloader:
     def training_data_no_filter(self, train_triple_positive):
         """faster, no filter for exists triples"""
-        size = len(train_triple_positive)
-        train_triple_negative = train_triple_positive + 0
+        size = len(train_triple_positive) * self._neg_times
+        train_triple_negative = train_triple_positive.repeat(
+            self._neg_times, axis=0)
         replace_head_probability = 0.5 * np.ones(size)
         replace_entity_id = np.random.randint(self.entity_total, size=size)
         random_num = np.random.random(size=size)
@@ -122,7 +124,6 @@ class KBloader:
         """
         n = len(self._triple_train)
         rand_idx = np.random.permutation(n)
-        rand_idx = rand_idx % n
         n_triple = len(rand_idx)
         start = 0
         while start < n_triple:
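For intuition, the corrupted-triple sampling patched in above (repeat each positive triple `neg_times` times, then replace either the head or the tail with a uniformly random entity) can be sketched in plain NumPy as follows; the `[head, relation, tail]` column layout and `entity_total` follow the loader, everything else is illustrative:
```
import numpy as np

def corrupt_triples(pos_triples, entity_total, neg_times):
    """Build neg_times corrupted triples per positive [h, r, t] row."""
    neg = pos_triples.repeat(neg_times, axis=0)
    size = len(neg)
    rand_entity = np.random.randint(entity_total, size=size)
    replace_head = np.random.random(size=size) < 0.5
    neg[replace_head, 0] = rand_entity[replace_head]    # corrupt head
    neg[~replace_head, 2] = rand_entity[~replace_head]  # corrupt tail
    return neg
```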
evalutate.py:
@@ -99,8 +99,10 @@ class Evaluate:
                 feed=batch_feed_dict)
             yield batch_feed_dict["test_triple"], head_score, tail_score
             n_used_eval_triple += 1
-            print('[{:.3f}s] #evaluation triple: {}/{}'.format(
-                timeit.default_timer() - start, n_used_eval_triple, 5000))
+            if n_used_eval_triple % 500 == 0:
+                print('[{:.3f}s] #evaluation triple: {}/{}'.format(
+                    timeit.default_timer(
+                    ) - start, n_used_eval_triple, self.reader.test_num))
         res_reader = mp_reader_mapper(
             reader=iterator,
main.py:
@@ -16,10 +16,13 @@ The script to run these models.
 """
 import argparse
 import timeit
+import os
+import numpy as np
 import paddle.fluid as fluid
-from data_loader import KBloader
+from data_loader import KGLoader
 from evalutate import Evaluate
 from model import model_dict
+from model.utils import load_var
 from mp_mapper import mp_reader_mapper
 from pgl.utils.logger import log
@@ -49,6 +52,7 @@ def run_round(batch_iter,
     run_time = 0
     data_time = 0
     t2 = timeit.default_timer()
+    start_epoch_time = timeit.default_timer()
     for batch_feed_dict in batch_iter():
         batch += 1
         t1 = timeit.default_timer()
@@ -62,8 +66,11 @@ def run_round(batch_iter,
         if batch % log_per_step == 0:
             tmp_epoch += 1
             if prefix == "train":
-                log.info("Epoch %s Ava Loss %s" %
-                         (epoch + tmp_epoch, tmp_loss / batch))
+                log.info("Epoch %s (%.7f sec) Train Loss: %.7f" %
+                         (epoch + tmp_epoch,
+                          timeit.default_timer() - start_epoch_time,
+                          tmp_loss[0] / batch))
+                start_epoch_time = timeit.default_timer()
             else:
                 log.info("Batch %s" % batch)
             batch = 0
@@ -84,7 +91,7 @@ def train(args):
     :param args: all args.
     :return: None
     """
-    kgreader = KBloader(
+    kgreader = KGLoader(
         batch_size=args.batch_size,
         data_dir=args.data_dir,
         neg_mode=args.neg_mode,
@@ -117,8 +124,8 @@ def train(args):
         reader = mp_reader_mapper(
             data_repeat,
-            func=kgreader.training_data_map,
-            #func=kgreader.training_data_no_filter,
+            func=kgreader.training_data_no_filter
+            if args.nofilter else kgreader.training_data_map,
             num_works=args.sample_workers)
         return reader
@@ -148,6 +155,20 @@ def train(args):
     exe = fluid.Executor(places[0])
     exe.run(model.startup_program)
     exe.run(fluid.default_startup_program())
+    if args.pretrain and model.model_name in ["TransR", "transr"]:
+        pretrain_ent = os.path.join(args.checkpoint,
+                                    model.ent_name.replace("TransR", "TransE"))
+        pretrain_rel = os.path.join(args.checkpoint,
+                                    model.rel_name.replace("TransR", "TransE"))
+        if os.path.exists(pretrain_ent):
+            print("loading pretrain!")
+            #var = fluid.global_scope().find_var(model.ent_name)
+            load_var(exe, model.train_program, model.ent_name, pretrain_ent)
+            #var = fluid.global_scope().find_var(model.rel_name)
+            load_var(exe, model.train_program, model.rel_name, pretrain_rel)
+        else:
+            raise ValueError("pretrain file {} not exists!".format(
+                pretrain_ent))
     prog = fluid.CompiledProgram(model.train_program).with_data_parallel(
         loss_name=model.train_fetch_vars[0].name)
@@ -182,9 +203,9 @@ def train(args):
             log_per_step=kgreader.train_num // args.batch_size,
             epoch=epoch * args.evaluate_per_iteration)
         log.info("epoch\t%s" % ((1 + epoch) * args.evaluate_per_iteration))
-        if True:
-            fluid.io.save_params(
-                exe, dirname=args.checkpoint, main_program=model.train_program)
+        fluid.io.save_params(
+            exe, dirname=args.checkpoint, main_program=model.train_program)
+        if not args.noeval:
             eva = Evaluate(kgreader)
             eva.launch_evaluation(
                 exe=exe,
@@ -273,6 +294,22 @@ def main():
     parser.add_argument(
         '--neg_mode', type=bool, help='return neg mode flag', default=False)
+    parser.add_argument(
+        '--nofilter',
+        type=bool,
+        help='don\'t filter invalid examples',
+        default=False)
+    parser.add_argument(
+        '--pretrain',
+        type=bool,
+        help='pretrain for TransR model',
+        default=False)
+    parser.add_argument(
+        '--noeval',
+        type=bool,
+        help='whether to evaluate the result',
+        default=False)
     args = parser.parse_args()
     log.info(args)
     train(args)
model/RotatE.py:
@@ -13,9 +13,9 @@
 # limitations under the License.
 """
 RotatE:
-"Learning entity and relation embeddings for knowledge graph completion."
-Lin, Yankai, et al.
-https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9571/9523
+"RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space."
+Sun, Zhiqing, et al.
+https://arxiv.org/abs/1902.10197
 """
 import paddle.fluid as fluid
 from .Model import Model
model/TransE.py:
@@ -34,6 +34,7 @@ class TransE(Model):
                  learning_rate,
                  args,
                  optimizer="adam"):
+        self._neg_times = args.neg_times
         super(TransE, self).__init__(
             model_name="TransE",
             data_reader=data_reader,
@@ -84,6 +85,9 @@ class TransE(Model):
             fluid.layers.abs(pos_score), 1, keep_dim=False)
         neg = fluid.layers.reduce_sum(
             fluid.layers.abs(neg_score), 1, keep_dim=False)
+        neg = fluid.layers.reshape(
+            neg, shape=[-1, self._neg_times], inplace=True)
         loss = fluid.layers.reduce_mean(
             fluid.layers.relu(pos - neg + self._margin))
         return [loss]
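The reshape added to the loss lines each positive triple up with its `neg_times` negative scores. A minimal NumPy sketch of the intended margin ranking loss (shapes assumed for illustration; this is not the fluid implementation):
```
import numpy as np

def margin_rank_loss(pos_dist, neg_dist, neg_times, margin):
    """pos_dist: [B]; neg_dist: [B * neg_times]; distances, lower is better."""
    neg = neg_dist.reshape(-1, neg_times)  # one row of negatives per positive
    hinge = np.maximum(0.0, pos_dist[:, None] - neg + margin)
    return hinge.mean()
```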
model/TransR.py:
@@ -36,6 +36,7 @@ class TransR(Model):
                  args,
                  optimizer="adam"):
         """init"""
+        self._neg_times = args.neg_times
         super(TransR, self).__init__(
             model_name="TransR",
             data_reader=data_reader,
@@ -60,19 +61,19 @@ class TransR(Model):
             dtype="float32",
             name=self.rel_name,
             default_initializer=fluid.initializer.Xavier())
+        init_values = np.tile(
+            np.identity(
+                self._hidden_size, dtype="float32").reshape(-1),
+            (self._relation_total, 1))
         transfer_matrix = fluid.layers.create_parameter(
             shape=[
                 self._relation_total, self._hidden_size * self._hidden_size
             ],
             dtype="float32",
-            name=self._prefix + "transfer_matrix", )
-        # Here is a trick, must init with identity matrix to get good hit@10 performance.
-        fluid.layers.assign(
-            np.tile(
-                np.identity(
-                    self._hidden_size, dtype="float32").reshape(-1),
-                (self._relation_total, 1)),
-            transfer_matrix)
+            name=self._prefix + "transfer_matrix",
+            default_initializer=fluid.initializer.NumpyArrayInitializer(
+                init_values))
         return entity_embedding, relation_embedding, transfer_matrix
     def score_with_l2_normalize(self, head, rel, tail):
@@ -111,7 +112,7 @@ class TransR(Model):
         pos_head_trans = self.matmul_with_expend_dims(pos_head, rel_matrix)
         pos_tail_trans = self.matmul_with_expend_dims(pos_tail, rel_matrix)
-        trans_neg = False
+        trans_neg = True
         if trans_neg:
             rel_matrix_neg = fluid.layers.reshape(
                 lookup_table(self.train_neg_input[:, 1], transfer_matrix),
@@ -133,6 +134,9 @@ class TransR(Model):
             fluid.layers.abs(pos_score), -1, keep_dim=False)
         neg = fluid.layers.reduce_sum(
             fluid.layers.abs(neg_score), -1, keep_dim=False)
+        neg = fluid.layers.reshape(
+            neg, shape=[-1, self._neg_times], inplace=True)
         loss = fluid.layers.reduce_mean(
             fluid.layers.relu(pos - neg + self._margin))
         return [loss]
model/utils.py:
@@ -56,3 +56,64 @@ def lookup_table_gather(index, input):
     :return:
     """
     return fluid.layers.gather(index=index, input=input, overwrite=False)
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, fluid.Variable)
+    if var.desc.type() == fluid.core.VarDesc.VarType.LOD_TENSOR:
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
+    else:
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            persistable=True)
+
+def load_var(executor, main_program=None, var=None, filename=None):
+    """
+    load_var to certain program
+    :param executor: executor
+    :param main_program: the program to load
+    :param var: the variable name in main_program.
+    :param filename: the file name of the file to load.
+    :return: None
+    """
+    load_prog = fluid.Program()
+    load_block = load_prog.global_block()
+    if main_program is None:
+        main_program = fluid.default_main_program()
+    if not isinstance(main_program, fluid.Program):
+        raise TypeError("program should be as Program type or None")
+    vars = list(filter(None, main_program.list_vars()))
+    # save origin param shape
+    orig_para_shape = {}
+    load_var_map = {}
+    for each_var in vars:
+        if each_var.name != var:
+            continue
+        assert isinstance(each_var, fluid.Variable)
+        if each_var.type == fluid.core.VarDesc.VarType.RAW:
+            continue
+        if isinstance(each_var, fluid.framework.Parameter):
+            orig_para_shape[each_var.name] = tuple(each_var.desc.get_shape())
+        new_var = _clone_var_in_block_(load_block, each_var)
+        if filename is not None:
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [new_var]},
+                attrs={'file_path': filename})
+    executor.run(load_prog)
mp_mapper.py:
@@ -65,12 +65,16 @@ def mp_reader_mapper(reader, func, num_works=4):
         all_process.append(p)
     data_iter = reader()
+    if not hasattr(data_iter, "__next__"):
+        __next__ = data_iter.next
+    else:
+        __next__ = data_iter.__next__
     def next_data():
         """next_data"""
         _next = None
         try:
-            _next = data_iter.next()
+            _next = __next__()
         except StopIteration:
             # log.debug(traceback.format_exc())
             pass
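As an aside, the same Python 2/3 compatibility could also be had with the built-in `next()` function, which dispatches to `.next()` on Python 2 iterators and `.__next__()` on Python 3 ones. A minimal sketch of that alternative (not what the patch does):
```
def next_data():
    """Fetch one item; return None when the iterator is exhausted."""
    try:
        # Built-in next() works on both Python 2 and Python 3 iterators.
        return next(data_iter)
    except StopIteration:
        return None
```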
device=3
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=1024 \
--learning_rate=0.001 \
--epoch 200 \
--evaluate_per_iteration 200 \
--sample_workers 1 \
--margin 1.0 \
--nofilter True \
--neg_times 10 \
--neg_mode True
#--only_evaluate
# TransE FB15k
# -----Raw-Average-Results
# MeanRank: 214.94, MRR: 0.2051, Hits@1: 0.0929, Hits@3: 0.2343, Hits@10: 0.4458
# -----Filter-Average-Results
# MeanRank: 74.41, MRR: 0.3793, Hits@1: 0.2351, Hits@3: 0.4538, Hits@10: 0.6570
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=1024 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 1 \
--margin 4 \
--nofilter True \
--neg_times 10 \
--neg_mode True
# TransE WN18
# -----Raw-Average-Results
# MeanRank: 219.08, MRR: 0.3383, Hits@1: 0.0821, Hits@3: 0.5233, Hits@10: 0.7997
# -----Filter-Average-Results
# MeanRank: 207.72, MRR: 0.4631, Hits@1: 0.1349, Hits@3: 0.7708, Hits@10: 0.9315
# for pretrain
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 30 \
--evaluate_per_iteration 30 \
--sample_workers 1 \
--margin 2.0 \
--nofilter True \
--noeval True \
--neg_times 10 \
--neg_mode True && \
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransR \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 200 \
--evaluate_per_iteration 200 \
--sample_workers 1 \
--margin 2.0 \
--pretrain True \
--nofilter True \
--neg_times 10 \
--neg_mode True
# FB15k TransR 200, pretrain 20
# -----Raw-Average-Results
# MeanRank: 303.81, MRR: 0.1931, Hits@1: 0.0920, Hits@3: 0.2109, Hits@10: 0.4181
# -----Filter-Average-Results
# MeanRank: 156.30, MRR: 0.3663, Hits@1: 0.2318, Hits@3: 0.4352, Hits@10: 0.6231
# for pretrain
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 30 \
--evaluate_per_iteration 30 \
--sample_workers 1 \
--margin 4.0 \
--nofilter True \
--noeval True \
--neg_times 10 \
--neg_mode True && \
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model TransR \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 1 \
--margin 4.0 \
--pretrain True \
--nofilter True \
--neg_times 10 \
--neg_mode True
# TransR WN18 100, pretrain 30
# -----Raw-Average-Results
# MeanRank: 321.41, MRR: 0.3706, Hits@1: 0.0955, Hits@3: 0.5906, Hits@10: 0.8099
# -----Filter-Average-Results
# MeanRank: 309.15, MRR: 0.5126, Hits@1: 0.1584, Hits@3: 0.8601, Hits@10: 0.9409
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/FB15k \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 8 \
--neg_times 10 \
--neg_mode True
# RotatE FB15k
# -----Raw-Average-Results
# MeanRank: 156.85, MRR: 0.2699, Hits@1: 0.1615, Hits@3: 0.3031, Hits@10: 0.5006
# -----Filter-Average-Results
# MeanRank: 53.35, MRR: 0.4776, Hits@1: 0.3537, Hits@3: 0.5473, Hits@10: 0.7062
CUDA_VISIBLE_DEVICES=$device \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python main.py \
--use_cuda \
--model RotatE \
--data_dir ./data/WN18 \
--optimizer adam \
--batch_size=512 \
--learning_rate=0.001 \
--epoch 100 \
--evaluate_per_iteration 100 \
--sample_workers 10 \
--margin 6 \
--neg_times 10 \
--neg_mode True
# RotatE WN18
# -----Raw-Average-Results
# MeanRank: 167.27, MRR: 0.6025, Hits@1: 0.4764, Hits@3: 0.6880, Hits@10: 0.8298
# -----Filter-Average-Results
# MeanRank: 155.23, MRR: 0.9145, Hits@1: 0.8843, Hits@3: 0.9412, Hits@10: 0.9570
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import pgl
import numpy as np
import paddle.fluid as fluid
from pgl.contrib.ogb.graphproppred.dataset_pgl import PglGraphPropPredDataset
from pgl.utils import paddle_helper
from ogb.graphproppred import Evaluator
from pgl.contrib.ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
def train(exe, batch_size, graph_wrapper, train_program, splitted_idx, dataset,
evaluator, fetch_loss, fetch_pred):
"""Train"""
graphs, labels = dataset[splitted_idx["train"]]
perm = np.arange(0, len(graphs))
np.random.shuffle(perm)
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
batch_label = labels[batch_index]
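# NaN labels mark missing task targets: (x == x) is False only for NaN,
# so batch_valid masks them out of the loss while nan_to_num zeroes them.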
batch_valid = (batch_label == batch_label).astype("float32")
batch_label = np.nan_to_num(batch_label).astype("float32")
feed_dict = graph_wrapper.to_feed(batch_graph)
feed_dict["label"] = batch_label
feed_dict["weight"] = batch_valid
loss, pred = exe.run(train_program,
feed=feed_dict,
fetch_list=[fetch_loss, fetch_pred])
pred_output[batch_index] = pred
batch_no += 1
print("train", evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def evaluate(exe, batch_size, graph_wrapper, val_program, splitted_idx,
dataset, mode, evaluator, fetch_pred):
"""Eval"""
graphs, labels = dataset[splitted_idx[mode]]
perm = np.arange(0, len(graphs))
start_batch = 0
batch_no = 0
pred_output = np.zeros_like(labels, dtype="float32")
while start_batch < len(perm):
batch_index = perm[start_batch:start_batch + batch_size]
start_batch += batch_size
batch_graph = pgl.graph.MultiGraph(graphs[batch_index])
feed_dict = graph_wrapper.to_feed(batch_graph)
pred = exe.run(val_program, feed=feed_dict, fetch_list=[fetch_pred])
pred_output[batch_index] = pred[0]
batch_no += 1
print(mode, evaluator.eval({"y_true": labels, "y_pred": pred_output}))
def send_func(src_feat, dst_feat, edge_feat):
"""Send"""
return src_feat["h"] + edge_feat["h"]
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, emb_dim, num_task, num_layers):
self.num_task = num_task
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.atom_encoder = AtomEncoder(name=name, emb_dim=emb_dim)
self.bond_encoder = BondEncoder(name=name, emb_dim=emb_dim)
def forward(self, graph):
"""foward"""
h_node = self.atom_encoder(graph.node_feat['feat'])
h_edge = self.bond_encoder(graph.edge_feat['feat'])
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h_node)],
efeat_list=[("h", h_edge)])
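# Sum-aggregate neighbor messages, then add a residual connection.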
h_node = graph.recv(msg, 'sum') + h_node
h_node = fluid.layers.fc(h_node,
size=self.emb_dim,
name=self.name + '_%s' % layer,
act="relu")
graph_nodes = pgl.layers.graph_pooling(graph, h_node, "average")
graph_pred = fluid.layers.fc(graph_nodes, self.num_task, name="final")
return graph_pred
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=100,
help='number of epochs to train (default: 100)')
parser.add_argument(
'--dataset',
type=str,
default="ogbg-mol-tox21",
help='dataset name (default: ogbg-mol-tox21)')
args = parser.parse_args()
place = fluid.CPUPlace() # Dataset too big to use GPU
### automatic dataloading and splitting
dataset = PglGraphPropPredDataset(name=args.dataset)
splitted_idx = dataset.get_idx_split()
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data, label = dataset[:2]
batch_graph = pgl.graph.MultiGraph(graph_data)
graph_data = batch_graph
train_program = fluid.Program()
startup_program = fluid.Program()
test_program = fluid.Program()
# degree normalize
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype("int64")
graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype("int64")
model = GNNModel(
name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)
with fluid.program_guard(train_program, startup_program):
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred = model.forward(gw)
sigmoid_pred = fluid.layers.sigmoid(pred)
val_program = train_program.clone(for_test=True)
initializer = []
with fluid.program_guard(train_program, startup_program):
train_label = fluid.layers.data(
name="label", dtype="float32", shape=[None, dataset.num_tasks])
train_weight = fluid.layers.data(
name="weight", dtype="float32", shape=[None, dataset.num_tasks])
train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
x=pred, label=train_label) * train_weight
train_loss_t = fluid.layers.reduce_sum(train_loss_t)
adam = fluid.optimizer.Adam(
learning_rate=1e-2,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(train_loss_t)
exe = fluid.Executor(place)
exe.run(startup_program)
for epoch in range(1, args.epochs + 1):
print("Epoch", epoch)
train(exe, 128, gw, train_program, splitted_idx, dataset, evaluator,
train_loss_t, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "valid",
evaluator, sigmoid_pred)
evaluate(exe, 128, gw, val_program, splitted_idx, dataset, "test",
evaluator, sigmoid_pred)
if __name__ == "__main__":
main()
# Graph Property Prediction for Open Graph Benchmark (OGB)
[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we complete the Graph Property Prediction task based on PGL.
### Requirements
- paddlepaddle >= 1.7.1
- pgl 1.0.2
- ogb
NOTE: To install the version of ogb that fits this project, run the commands below:
```
git clone https://github.com/snap-stanford/ogb.git
cd ogb
git checkout 482c40bc9f31fe25f9df5aa11c8fb657bd2b1621
python setup.py install
```
### How to run
For example, use a GPU to train the model on the ogbg-molhiv and ogbg-molpcba datasets.
```
CUDA_VISIBLE_DEVICES=1 python -u main.py --config hiv_config.yaml --use_cuda
CUDA_VISIBLE_DEVICES=2 python -u main.py --config pcba_config.yaml --use_cuda
```
If you want to use a CPU to train the model, the environment variable `CPU_NUM` should be specified and should be in the range of 1 to N, where N is the total number of CPUs on your machine.
```
CPU_NUM=1 python -u main.py --config hiv_config.yaml
CPU_NUM=1 python -u main.py --config pcba_config.yaml
```
### Experiment results
| model | hiv (rocauc)| pcba (prcauc)|
|-------|-------------|--------------|
| GIN |0.7719 (0.0079) | 0.2232 (0.0018) |
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import os
import time
import argparse
from utils.args import ArgumentGroup
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--use_cuda', action='store_true')
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("./save_dir", str, "./checkpoints", "Path to save checkpoints.")
model_g.add_arg("hidden_size", int, 128, "hidden size.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
train_g.add_arg("init_loss_scaling", float, 102400,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save")
train_g.add_arg("metric", str, "simple_accuracy", "metric")
train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.")
train_g.add_arg("decr_every_n_nan_or_inf", int, 2,
"Decreases loss scaling every n accumulated steps with nan or inf gradients.")
train_g.add_arg("incr_ratio", float, 2.0,
"The multiplier to use when increasing the loss scaling.")
train_g.add_arg("decr_ratio", float, 0.8,
"The less-than-one-multiplier to use when decreasing.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
log_g.add_arg("log_dir", str, './logs/', "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("tokenizer", str, "FullTokenizer",
"ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer")
data_g.add_arg("train_set", str, None, "Path to training data.")
data_g.add_arg("test_set", str, None, "Path to test data.")
data_g.add_arg("dev_set", str, None, "Path to validation data.")
data_g.add_arg("aug1_type", str, "scheme1", "augment type")
data_g.add_arg("aug2_type", str, "scheme1", "augment type")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.")
data_g.add_arg("random_seed", int, None, "Random seed.")
data_g.add_arg("buf_size", int, 1000, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("for_cn", bool, True, "model train for cn or for other langs.")
run_type_g.add_arg("num_workers", int, 1, "use multiprocess to generate graph")
run_type_g.add_arg("output_dir", str, None, "path to save model")
run_type_g.add_arg("config", str, None, "configure yaml file")
run_type_g.add_arg("n", str, None, "task name")
run_type_g.add_arg("task_name", str, None, "task name")
run_type_g.add_arg("pretrain", bool, False, "Whether do pretrian")
run_type_g.add_arg("pretrain_name", str, None, "pretrain task name")
run_type_g.add_arg("pretrain_config", str, None, "pretrain config.yaml file")
run_type_g.add_arg("pretrain_model_step", str, None, "pretrain model step")
run_type_g.add_arg("model_type", str, "BaseLineModel", "pretrain model step")
run_type_g.add_arg("num_class", int, 1, "number class")
run_type_g.add_arg("dataset_name", str, None, "finetune dataset name")
run_type_g.add_arg("eval_metrics", str, None, "evaluate metrics")
run_type_g.add_arg("task_type", str, None, "regression or classification")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
from ogb.graphproppred import GraphPropPredDataset
import pgl
from pgl.utils.logger import log
class BaseDataset(object):
def __init__(self):
pass
def __getitem__(self, idx):
raise NotImplementedError
def __len__(self):
raise NotImplementedError
class Subset(BaseDataset):
r"""
Subset of a dataset at specified indices.
Arguments:
dataset (Dataset): The whole Dataset
indices (sequence): Indices in the whole set selected for subset
"""
def __init__(self, dataset, indices):
self.dataset = dataset
self.indices = indices
def __getitem__(self, idx):
return self.dataset[self.indices[idx]]
def __len__(self):
return len(self.indices)
class Dataset(BaseDataset):
def __init__(self, args):
self.args = args
self.raw_dataset = GraphPropPredDataset(name=args.dataset_name)
self.num_tasks = self.raw_dataset.num_tasks
self.eval_metrics = self.raw_dataset.eval_metric
self.task_type = self.raw_dataset.task_type
self.pgl_graph_list = []
self.graph_label_list = []
for i in range(len(self.raw_dataset)):
graph, label = self.raw_dataset[i]
edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
g = pgl.graph.Graph(num_nodes=graph["num_nodes"], edges=edges)
if graph["edge_feat"] is not None:
g.edge_feat["feat"] = graph["edge_feat"]
if graph["node_feat"] is not None:
g.node_feat["feat"] = graph["node_feat"]
self.pgl_graph_list.append(g)
self.graph_label_list.append(label)
def __getitem__(self, idx):
return self.pgl_graph_list[idx], self.graph_label_list[idx]
def __len__(self):
return len(self.pgl_graph_list)
def get_idx_split(self):
return self.raw_dataset.get_idx_split()
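A minimal usage sketch of the `Dataset`/`Subset` classes above (the `SimpleNamespace` object here is a hypothetical stand-in for the real config; only `dataset_name` is required by `Dataset`):
```
from types import SimpleNamespace

args = SimpleNamespace(dataset_name="ogbg-molhiv")  # stand-in for the real config
dataset = Dataset(args)
split = dataset.get_idx_split()
train_set = Subset(dataset, split["train"])
graph, label = train_set[0]  # a pgl.graph.Graph and its label array
```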
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implement the graph dataloader.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# SSL
import torch
import sys
import six
from io import open
import collections
from collections import namedtuple
import numpy as np
import tqdm
import time
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.utils import mp_reader
from pgl.utils.logger import log
from ogb.graphproppred import GraphPropPredDataset
def batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
perm = np.arange(size)
np.random.shuffle(perm)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""scan_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
class GraphDataloader(object):
"""Graph Dataloader
"""
def __init__(self,
dataset,
graph_wrapper,
batch_size,
seed=0,
num_workers=1,
buf_size=1000,
shuffle=True):
self.shuffle = shuffle
self.seed = seed
self.num_workers = num_workers
self.buf_size = buf_size
self.batch_size = batch_size
self.dataset = dataset
self.graph_wrapper = graph_wrapper
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
graphs = [b[0] for b in batch_examples]
labels = [b[1] for b in batch_examples]
join_graph = pgl.graph.MultiGraph(graphs)
labels = np.array(labels)
feed_dict = self.graph_wrapper.to_feed(join_graph)
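# NaN labels mark missing task targets: (x == x) is False only for NaN,
# so 'unmask' removes their contribution from the loss.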
batch_valid = (labels == labels).astype("float32")
labels = np.nan_to_num(labels).astype("float32")
feed_dict['labels'] = labels
feed_dict['unmask'] = batch_valid
return feed_dict
def batch_iter(self, fid):
"""batch_iter"""
if self.shuffle:
for batch in batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
def __len__(self):
"""__len__"""
return len(self.dataset)
def __getitem__(self, idx):
"""__getitem__"""
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.dataset[idx]
def __iter__(self):
"""__iter__"""
def worker(filter_id):
def func_run():
for batch_examples in self.batch_iter(filter_id):
batch_dict = self.batch_fn(batch_examples)
yield batch_dict
return func_run
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0), self.buf_size)
else:
worker_pool = [worker(wid) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
"""scan"""
for example in self.dataset:
yield example
if __name__ == "__main__":
from base_dataset import BaseDataset, Subset
dataset = GraphPropPredDataset(name="ogbg-molhiv")
splitted_index = dataset.get_idx_split()
train_dataset = Subset(dataset, splitted_index['train'])
valid_dataset = Subset(dataset, splitted_index['valid'])
test_dataset = Subset(dataset, splitted_index['test'])
log.info("Train Examples: %s" % len(train_dataset))
log.info("Val Examples: %s" % len(valid_dataset))
log.info("Test Examples: %s" % len(test_dataset))
# train_loader = GraphDataloader(train_dataset, batch_size=3)
# for batch_data in train_loader:
# graphs, labels = batch_data
# print(labels.shape)
# time.sleep(4)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import logging
from random import random
import pandas as pd
import numpy as np
from itertools import compress
import scipy.sparse as sp
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from rdkit.Chem.Scaffolds import MurckoScaffold
import pgl
from pgl.utils import paddle_helper
try:
from dataset.Dataset import Subset
from dataset.Dataset import ChemDataset
except:
from Dataset import Subset
from Dataset import ChemDataset
log = logging.getLogger("logger")
def random_split(dataset, args):
total_percent = args.frac_train + args.frac_valid + args.frac_test
np.testing.assert_almost_equal(total_percent, 1.0)
length = len(dataset)
perm = list(range(length))
np.random.shuffle(perm)
num_train = int(args.frac_train * length)
num_valid = int(args.frac_valid * length)
num_test = int(args.frac_test * length)
train_indices = perm[0:num_train]
valid_indices = perm[num_train:(num_train + num_valid)]
test_indices = perm[(num_train + num_valid):]
assert (len(train_indices) + len(valid_indices) + len(test_indices)
) == length
train_dataset = Subset(dataset, train_indices)
valid_dataset = Subset(dataset, valid_indices)
test_dataset = Subset(dataset, test_indices)
return train_dataset, valid_dataset, test_dataset
def scaffold_split(dataset, args, return_smiles=False):
total_percent = args.frac_train + args.frac_valid + args.frac_test
np.testing.assert_almost_equal(total_percent, 1.0)
smiles_list_file = os.path.join(args.data_dir, "smiles.csv")
smiles_list = pd.read_csv(smiles_list_file, header=None)[0].tolist()
non_null = np.ones(len(dataset)) == 1
smiles_list = list(compress(enumerate(smiles_list), non_null))
# create dict of the form {scaffold_i: [idx1, idx....]}
all_scaffolds = {}
for i, smiles in smiles_list:
scaffold = MurckoScaffold.MurckoScaffoldSmiles(
smiles=smiles, includeChirality=True)
# scaffold = generate_scaffold(smiles, include_chirality=True)
if scaffold not in all_scaffolds:
all_scaffolds[scaffold] = [i]
else:
all_scaffolds[scaffold].append(i)
# sort from largest to smallest sets
all_scaffolds = {
key: sorted(value)
for key, value in all_scaffolds.items()
}
all_scaffold_sets = [
scaffold_set
for (scaffold, scaffold_set) in sorted(
all_scaffolds.items(),
key=lambda x: (len(x[1]), x[1][0]),
reverse=True)
]
# get train, valid test indices
train_cutoff = args.frac_train * len(smiles_list)
valid_cutoff = (args.frac_train + args.frac_valid) * len(smiles_list)
train_idx, valid_idx, test_idx = [], [], []
for scaffold_set in all_scaffold_sets:
if len(train_idx) + len(scaffold_set) > train_cutoff:
if len(train_idx) + len(valid_idx) + len(
scaffold_set) > valid_cutoff:
test_idx.extend(scaffold_set)
else:
valid_idx.extend(scaffold_set)
else:
train_idx.extend(scaffold_set)
assert len(set(train_idx).intersection(set(valid_idx))) == 0
assert len(set(test_idx).intersection(set(valid_idx))) == 0
# log.info(len(scaffold_set))
# log.info(["train_idx", train_idx])
# log.info(["valid_idx", valid_idx])
# log.info(["test_idx", test_idx])
train_dataset = Subset(dataset, train_idx)
valid_dataset = Subset(dataset, valid_idx)
test_dataset = Subset(dataset, test_idx)
if return_smiles:
train_smiles = [smiles_list[i][1] for i in train_idx]
valid_smiles = [smiles_list[i][1] for i in valid_idx]
test_smiles = [smiles_list[i][1] for i in test_idx]
return train_dataset, valid_dataset, test_dataset, (
train_smiles, valid_smiles, test_smiles)
return train_dataset, valid_dataset, test_dataset
if __name__ == "__main__":
file_path = os.path.dirname(os.path.realpath(__file__))
proj_path = os.path.join(file_path, '../')
sys.path.append(proj_path)
from utils.config import Config
from dataset.Dataset import Subset
from dataset.Dataset import ChemDataset
config_file = "./finetune_config.yaml"
args = Config(config_file)
log.info("loading dataset")
dataset = ChemDataset(args)
train_dataset, valid_dataset, test_dataset = scaffold_split(dataset, args)
log.info("Train Examples: %s" % len(train_dataset))
log.info("Val Examples: %s" % len(valid_dataset))
log.info("Test Examples: %s" % len(test_dataset))
log.info("preprocess finish")
task_name: hiv
seed: 15391
dataset_name: ogbg-molhiv
eval_metrics: null
task_type: null
num_class: null
pool_type: average
train_eps: True
norm_type: layer_norm
model_type: GNNModel
embed_dim: 128
num_layers: 5
hidden_size: 256
save_dir: ./checkpoints
# finetune model config
init_checkpoint: null
init_pretraining_params: null
# data config
data_dir: ./dataset/
symmetry: True
batch_size: 32
buf_size: 1000
metrics: True
shuffle: True
num_workers: 12
output_dir: ./outputs/
# training config
epoch: 50
learning_rate: 0.0001
lr_scheduler: linear_warmup_decay
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 10000
validation_steps: 1000
use_dynamic_loss_scaling: True
init_loss_scaling: 102400
metric: simple_accuracy
incr_every_n_steps: 100
decr_every_n_nan_or_inf: 2
incr_ratio: 2.0
decr_ratio: 0.8
log_dir: ./logs
eval_step: 400
train_log_step: 20
# log config
skip_steps: 10
verbose: False
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# SSL
import torch
import os
import re
import time
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
from ogb.graphproppred import Evaluator
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
import pgl
from pgl.utils import paddle_helper
from pgl.utils.logger import log
from utils.args import print_arguments, check_cuda, prepare_logger
from utils.init import init_checkpoint, init_pretraining_params
from utils.config import Config
from optimization import optimization
from monitor.train_monitor import train_and_evaluate
from args import parser
import model as Model
from data.base_dataset import Subset, Dataset
from data.dataloader import GraphDataloader
def main(args):
log.info('loading data')
dataset = Dataset(args)
args.num_class = dataset.num_tasks
args.eval_metrics = dataset.eval_metrics
args.task_type = dataset.task_type
splitted_index = dataset.get_idx_split()
train_dataset = Subset(dataset, splitted_index['train'])
valid_dataset = Subset(dataset, splitted_index['valid'])
test_dataset = Subset(dataset, splitted_index['test'])
log.info("preprocess finish")
log.info("Train Examples: %s" % len(train_dataset))
log.info("Val Examples: %s" % len(valid_dataset))
log.info("Test Examples: %s" % len(test_dataset))
train_prog = F.Program()
startup_prog = F.Program()
if args.use_cuda:
dev_list = F.cuda_places()
place = dev_list[0]
dev_count = len(dev_list)
else:
place = F.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# dev_count = args.cpu_num
log.info("building model")
with F.program_guard(train_prog, startup_prog):
with F.unique_name.guard():
graph_model = getattr(Model, args.model_type)(args, dataset)
train_ds = GraphDataloader(
train_dataset,
graph_model.graph_wrapper,
batch_size=args.batch_size)
num_train_examples = len(train_dataset)
max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
scheduled_lr, loss_scaling = optimization(
loss=graph_model.loss,
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
train_program=train_prog,
startup_prog=startup_prog,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
use_fp16=False,
use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
init_loss_scaling=args.init_loss_scaling,
incr_every_n_steps=args.incr_every_n_steps,
decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
incr_ratio=args.incr_ratio,
decr_ratio=args.decr_ratio)
test_prog = F.Program()
with F.program_guard(test_prog, startup_prog):
with F.unique_name.guard():
_graph_model = getattr(Model, args.model_type)(args, dataset)
test_prog = test_prog.clone(for_test=True)
valid_ds = GraphDataloader(
valid_dataset,
graph_model.graph_wrapper,
batch_size=args.batch_size,
shuffle=False)
test_ds = GraphDataloader(
test_dataset,
graph_model.graph_wrapper,
batch_size=args.batch_size,
shuffle=False)
exe = F.Executor(place)
exe.run(startup_prog)
for init in graph_model.init_vars:
init(place)
for init in _graph_model.init_vars:
init(place)
if args.init_pretraining_params is not None:
init_pretraining_params(
exe, args.init_pretraining_params, main_program=startup_prog)
nccl2_num_trainers = 1
nccl2_trainer_id = 0
if dev_count > 1:
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = dev_count
train_exe = F.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=graph_model.loss.name,
exec_strategy=exec_strategy,
main_program=train_prog,
num_trainers=nccl2_num_trainers,
trainer_id=nccl2_trainer_id)
test_exe = exe
else:
train_exe, test_exe = exe, exe
evaluator = Evaluator(args.dataset_name)
train_and_evaluate(
exe=exe,
train_exe=train_exe,
valid_exe=test_exe,
train_ds=train_ds,
valid_ds=valid_ds,
test_ds=test_ds,
train_prog=train_prog,
valid_prog=test_prog,
args=args,
dev_count=dev_count,
evaluator=evaluator,
model=graph_model)
if __name__ == "__main__":
args = parser.parse_args()
if args.config is not None:
config = Config(args.config, isCreate=True, isSave=True)
config['use_cuda'] = args.use_cuda
log.info(config)
main(config)
#-*- coding: utf-8 -*-
import os
import re
import time
import logging
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
import pgl
from pgl.graph_wrapper import GraphWrapper
from pgl.layers.conv import gcn, gat
from pgl.utils import paddle_helper
from pgl.utils.logger import log
from utils.args import print_arguments, check_cuda, prepare_logger
from utils.init import init_checkpoint, init_pretraining_params
from mol_encoder import AtomEncoder, BondEncoder
def copy_send(src_feat, dst_feat, edge_feat):
return src_feat["h"]
def mean_recv(feat):
return L.sequence_pool(feat, pool_type="average")
def sum_recv(feat):
return L.sequence_pool(feat, pool_type="sum")
def max_recv(feat):
return L.sequence_pool(feat, pool_type="max")
def unsqueeze(tensor):
tensor = L.unsqueeze(tensor, axes=-1)
tensor.stop_gradient = True
return tensor
class Metric:
def __init__(self, **args):
self.args = args
@property
def vars(self):
values = [self.args[k] for k in self.args.keys()]
return values
def parse(self, fetch_list):
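        # Pair each metric name with the scalar fetched from the executor,
        # e.g. [array([0.7], dtype=float32)] -> {"loss": 0.7}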
tup = list(zip(self.args.keys(), [float(v[0]) for v in fetch_list]))
return dict(tup)
def gin_layer(gw, node_features, edge_features, train_eps, name):
def send_func(src_feat, dst_feat, edge_feat):
"""Send"""
return src_feat["h"] + edge_feat["h"]
epsilon = L.create_parameter(
shape=[1, 1],
dtype="float32",
attr=F.ParamAttr(name="%s_eps" % name),
default_initializer=F.initializer.ConstantInitializer(value=0.0))
if not train_eps:
epsilon.stop_gradient = True
msg = gw.send(
send_func,
nfeat_list=[("h", node_features)],
efeat_list=[("h", edge_features)])
node_feat = gw.recv(msg, "sum") + node_features * (epsilon + 1.0)
# if apply_func is not None:
# node_feat = apply_func(node_feat, name)
return node_feat
class GNNModel(object):
def __init__(self, args, dataset):
self.args = args
self.dataset = dataset
self.hidden_size = self.args.hidden_size
self.embed_dim = self.args.embed_dim
self.dropout_prob = self.args.dropout_rate
self.pool_type = self.args.pool_type
self._init_vars = []
graph_data = []
g, label = self.dataset[0]
graph_data.append(g)
g, label = self.dataset[1]
graph_data.append(g)
batch_graph = pgl.graph.MultiGraph(graph_data)
graph_data = batch_graph
graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
"int64")
graph_data.node_feat["feat"] = graph_data.node_feat["feat"].astype(
"int64")
self.graph_wrapper = GraphWrapper(
name="graph",
place=F.CPUPlace(),
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
self.atom_encoder = AtomEncoder(name="atom", emb_dim=self.embed_dim)
self.bond_encoder = BondEncoder(name="bond", emb_dim=self.embed_dim)
self.labels = L.data(
"labels",
shape=[None, self.args.num_class],
dtype="float32",
append_batch_size=False)
self.unmask = L.data(
"unmask",
shape=[None, self.args.num_class],
dtype="float32",
append_batch_size=False)
self.build_model()
def build_model(self):
node_features = self.atom_encoder(self.graph_wrapper.node_feat['feat'])
edge_features = self.bond_encoder(self.graph_wrapper.edge_feat['feat'])
self._enc_out = self.node_repr_encode(node_features, edge_features)
logits = L.fc(self._enc_out,
self.args.num_class,
act=None,
param_attr=F.ParamAttr(name="final_fc"))
# L.Print(self.labels, message="labels")
# L.Print(self.unmask, message="unmask")
loss = L.sigmoid_cross_entropy_with_logits(x=logits, label=self.labels)
loss = loss * self.unmask
self.loss = L.reduce_sum(loss) / L.reduce_sum(self.unmask)
self.pred = L.sigmoid(logits)
self._metrics = Metric(loss=self.loss)
def node_repr_encode(self, node_features, edge_features):
features_list = [node_features]
for layer in range(self.args.num_layers):
feat = gin_layer(
self.graph_wrapper,
features_list[layer],
edge_features,
train_eps=self.args.train_eps,
name="gin_%s" % layer, )
feat = self.mlp(feat, name="mlp_%s" % layer)
feat = feat + features_list[layer] # residual
features_list.append(feat)
output = pgl.layers.graph_pooling(
self.graph_wrapper, features_list[-1], self.args.pool_type)
return output
def mlp(self, features, name):
h = features
dim = features.shape[-1]
dim_list = [dim * 2, dim]
for i in range(2):
h = L.fc(h,
size=dim_list[i],
name="%s_fc_%s" % (name, i),
act=None)
if self.args.norm_type == "layer_norm":
log.info("norm_type is %s" % self.args.norm_type)
h = L.layer_norm(
h,
begin_norm_axis=1,
param_attr=F.ParamAttr(
name="norm_scale_%s_%s" % (name, i),
initializer=F.initializer.Constant(1.0)),
bias_attr=F.ParamAttr(
name="norm_bias_%s_%s" % (name, i),
initializer=F.initializer.Constant(0.0)), )
else:
log.info("using batch_norm")
h = L.batch_norm(h)
h = pgl.layers.graph_norm(self.graph_wrapper, h)
h = L.relu(h)
return h
def get_enc_output(self):
return self._enc_out
@property
def init_vars(self):
return self._init_vars
@property
def metrics(self):
return self._metrics
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MolEncoder for ogb
"""
import paddle.fluid as fluid
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims
class AtomEncoder(object):
"""AtomEncoder for encoding node features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
atom_feature = get_atom_feature_dims()
atom_input = fluid.layers.split(
x, num_or_sections=len(atom_feature), dim=-1)
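        # x carries one integer id per categorical atom feature; each feature
        # uses its own embedding table, and the per-feature embeddings are
        # summed into a single node representation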
outputs = None
count = 0
for _x, _atom_input_dim in zip(atom_input, atom_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_atom_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_atom_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
class BondEncoder(object):
"""Bond for encoding edge features"""
def __init__(self, name, emb_dim):
self.emb_dim = emb_dim
self.name = name
def __call__(self, x):
bond_feature = get_bond_feature_dims()
bond_input = fluid.layers.split(
x, num_or_sections=len(bond_feature), dim=-1)
outputs = None
count = 0
for _x, _bond_input_dim in zip(bond_input, bond_feature):
count += 1
emb = fluid.layers.embedding(
_x,
size=(_bond_input_dim, self.emb_dim),
param_attr=fluid.ParamAttr(
name=self.name + '_bond_feat_%s' % count))
if outputs is None:
outputs = emb
else:
outputs = outputs + emb
return outputs
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tqdm
import json
import numpy as np
import os
from datetime import datetime
import logging
from collections import defaultdict
from tensorboardX import SummaryWriter
import paddle.fluid as F
from pgl.utils.logger import log
def multi_device(reader, dev_count):
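    # with dev_count > 1, group dev_count feed dicts into one list so the
    # ParallelExecutor can dispatch one batch per device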
if dev_count == 1:
for batch in reader:
yield batch
else:
batches = []
for batch in reader:
batches.append(batch)
if len(batches) == dev_count:
yield batches
batches = []
def evaluate(exe, loader, prog, model, evaluator):
total_labels = []
for i in range(len(loader.dataset)):
g, l = loader.dataset[i]
total_labels.append(l)
total_labels = np.vstack(total_labels)
pred_output = []
for feed_dict in loader:
ret = exe.run(prog, feed=feed_dict, fetch_list=model.pred)
pred_output.append(ret[0])
pred_output = np.vstack(pred_output)
result = evaluator.eval({"y_true": total_labels, "y_pred": pred_output})
return result
def _create_if_not_exist(path):
basedir = os.path.dirname(path)
if not os.path.exists(basedir):
os.makedirs(basedir)
def train_and_evaluate(exe,
train_exe,
valid_exe,
train_ds,
valid_ds,
test_ds,
train_prog,
valid_prog,
args,
model,
evaluator,
dev_count=1):
global_step = 0
timestamp = datetime.now().strftime("%Hh%Mm%Ss")
log_path = os.path.join(args.log_dir, "tensorboard_log_%s" % timestamp)
_create_if_not_exist(log_path)
writer = SummaryWriter(log_path)
    best_valid_score, best_test_score = 0.0, 0.0
for e in range(args.epoch):
for feed_dict in multi_device(train_ds, dev_count):
if dev_count > 1:
ret = train_exe.run(feed=feed_dict,
fetch_list=model.metrics.vars)
ret = [[np.mean(v)] for v in ret]
else:
ret = train_exe.run(train_prog,
feed=feed_dict,
fetch_list=model.metrics.vars)
ret = model.metrics.parse(ret)
if global_step % args.train_log_step == 0:
writer.add_scalar(
"batch_loss", ret['loss'], global_step=global_step)
log.info("epoch: %d | step: %d | loss: %.4f " %
(e, global_step, ret['loss']))
global_step += 1
if global_step % args.eval_step == 0:
valid_ret = evaluate(exe, valid_ds, valid_prog, model,
evaluator)
message = "valid: "
for key, value in valid_ret.items():
message += "%s %.4f | " % (key, value)
writer.add_scalar(
"eval_%s" % key, value, global_step=global_step)
log.info(message)
# testing
test_ret = evaluate(exe, test_ds, valid_prog, model, evaluator)
message = "test: "
for key, value in test_ret.items():
message += "%s %.4f | " % (key, value)
writer.add_scalar(
"test_%s" % key, value, global_step=global_step)
log.info(message)
# evaluate after one epoch
valid_ret = evaluate(exe, valid_ds, valid_prog, model, evaluator)
message = "epoch %s valid: " % e
for key, value in valid_ret.items():
message += "%s %.4f | " % (key, value)
writer.add_scalar("eval_%s" % key, value, global_step=global_step)
log.info(message)
# testing
test_ret = evaluate(exe, test_ds, valid_prog, model, evaluator)
message = "epoch %s test: " % e
for key, value in test_ret.items():
message += "%s %.4f | " % (key, value)
writer.add_scalar("test_%s" % key, value, global_step=global_step)
log.info(message)
message = "epoch %s best %s result | " % (e, args.eval_metrics)
if valid_ret[args.eval_metrics] > best_valid_score:
best_valid_score = valid_ret[args.eval_metrics]
best_test_score = test_ret[args.eval_metrics]
message += "valid %.4f | test %.4f" % (best_valid_score,
best_test_score)
log.info(message)
# if global_step % args.save_step == 0:
# F.io.save_persistables(exe, os.path.join(args.save_dir, "%s" % global_step), train_prog)
writer.close()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
use_dynamic_loss_scaling=False,
init_loss_scaling=1.0,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8):
if warmup_steps > 0:
if scheduler == 'noam_decay':
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
elif scheduler == 'linear_warmup_decay':
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
else:
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
optimizer._learning_rate_map[fluid.default_main_program(
)] = scheduled_lr
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
loss_scaling = fluid.layers.create_global_var(
name=fluid.unique_name.generate("loss_scaling"),
shape=[1],
value=init_loss_scaling,
dtype='float32',
persistable=True)
if use_fp16:
loss *= loss_scaling
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_dynamic_loss_scaling:
apply_dynamic_loss_scaling(
loss_scaling, master_param_grads, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
optimizer.apply_gradients(master_param_grads)
        if weight_decay > 0:
            for param, grad in master_param_grads:
                # str.rstrip strips a character set, not a suffix, so remove
                # the ".master" suffix explicitly before the exclusion check
                if param.name.endswith(".master"):
                    bare_name = param.name[:-len(".master")]
                else:
                    bare_name = param.name
                if exclude_from_weight_decay(bare_name):
                    continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr, loss_scaling
task_name: pcba
seed: 28994
dataset_name: ogbg-molpcba
eval_metrics: null
task_type: null
num_class: null
pool_type: average
train_eps: True
norm_type: layer_norm
model_type: GNNModel
embed_dim: 128
num_layers: 5
hidden_size: 256
save_dir: ./checkpoints
# finetune model config
init_checkpoint: null
init_pretraining_params: null
# data config
data_dir: ./dataset/
symmetry: True
batch_size: 256
buf_size: 1000
metrics: True
shuffle: True
num_workers: 12
output_dir: ./outputs/
# training config
epoch: 50
learning_rate: 0.005
lr_scheduler: linear_warmup_decay
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 10000
validation_steps: 1000
use_dynamic_loss_scaling: True
init_loss_scaling: 102400
metric: simple_accuracy
incr_every_n_steps: 100
decr_every_n_nan_or_inf: 2
incr_ratio: 2.0
decr_ratio: 0.8
log_dir: ./logs
eval_step: 1000
train_log_step: 20
# log config
skip_steps: 10
verbose: False
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import six
import os
import sys
import argparse
import logging
import paddle.fluid as fluid
log = logging.getLogger("logger")
def prepare_logger(logger, debug=False, save_to_file=None):
formatter = logging.Formatter(
fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
)
# console_hdl = logging.StreamHandler()
# console_hdl.setFormatter(formatter)
# logger.addHandler(console_hdl)
if save_to_file is not None: #and not os.path.exists(save_to_file):
if os.path.isdir(save_to_file):
file_hdl = logging.FileHandler(
os.path.join(save_to_file, 'log.txt'))
else:
file_hdl = logging.FileHandler(save_to_file)
file_hdl.setFormatter(formatter)
logger.addHandler(file_hdl)
logger.setLevel(logging.DEBUG)
logger.propagate = False
def str2bool(v):
    # argparse cannot parse strings such as "True"/"False" into Python
    # booleans directly, so treat a small set of truthy strings as True
    return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self,
name,
type,
default,
help,
positional_arg=False,
**kwargs):
prefix = "" if positional_arg else "--"
type = str2bool if type == bool else type
self._group.add_argument(
prefix + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
def check_cuda(use_cuda, err = \
"\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            log.error(err)
            sys.exit(1)
    except Exception:
        pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
def get_cards():
"""
get gpu cards number
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file implements a class for model configuration.
"""
import datetime
import os
import yaml
import random
import shutil
import six
import logging
log = logging.getLogger("logger")
class AttrDict(dict):
"""Attr dict
"""
def __init__(self, d):
self.dict = d
def __getattr__(self, attr):
value = self.dict[attr]
if isinstance(value, dict):
return AttrDict(value)
else:
return value
def __str__(self):
return str(self.dict)
class Config(object):
"""Implementation of Config class for model configure.
Args:
config_file(str): configure filename, which is a yaml file.
isCreate(bool): if true, create some neccessary directories to save models, log file and other outputs.
isSave(bool): if true, save config_file in order to record the configure message.
"""
def __init__(self, config_file, isCreate=False, isSave=False):
self.config_file = config_file
# self.config = self.get_config_from_yaml(config_file)
self.config = self.load_config(config_file)
if isCreate:
self.create_necessary_dirs()
if isSave:
self.save_config_file()
def load_config(self, config_file):
"""Load config file"""
with open(config_file) as f:
if hasattr(yaml, 'FullLoader'):
config = yaml.load(f, Loader=yaml.FullLoader)
else:
config = yaml.load(f)
return config
def create_necessary_dirs(self):
"""Create some necessary directories to save some important files.
"""
self.config['log_dir'] = os.path.join(self.config['log_dir'],
self.config['task_name'])
self.config['save_dir'] = os.path.join(self.config['save_dir'],
self.config['task_name'])
self.config['output_dir'] = os.path.join(self.config['output_dir'],
self.config['task_name'])
self.make_dir(self.config['log_dir'])
self.make_dir(self.config['save_dir'])
self.make_dir(self.config['output_dir'])
def save_config_file(self):
"""Save config file so that we can know the config when we look back
"""
filename = self.config_file.split('/')[-1]
targetpath = os.path.join(self.config['save_dir'], filename)
try:
shutil.copyfile(self.config_file, targetpath)
except shutil.SameFileError:
log.info("%s and %s are the same file, did not copy by shutil"\
% (self.config_file, targetpath))
def make_dir(self, path):
"""Build directory"""
if not os.path.exists(path):
os.makedirs(path)
def __getitem__(self, key):
return self.config[key]
def __call__(self):
"""__call__"""
return self.config
def __getattr__(self, attr):
try:
result = self.config[attr]
except KeyError:
log.warn("%s attribute is not existed, return None" % attr)
result = None
return result
def __setitem__(self, key, value):
self.config[key] = value
def __str__(self):
return str(self.config)
def pretty_print(self):
log.info(
"-----------------------------------------------------------------")
log.info("config file: %s" % self.config_file)
for key, value in sorted(
self.config.items(), key=lambda item: item[0]):
log.info("%s: %s" % (key, value))
log.info(
"-----------------------------------------------------------------")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def append_cast_op(i, o, prog):
"""
Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
Args:
i (Variable): The input Variable.
o (Variable): The output Variable.
prog (Program): The Program to append cast op.
"""
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={"in_dtype": i.dtype,
"out_dtype": o.dtype})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def apply_dynamic_loss_scaling(loss_scaling, master_params_grads,
incr_every_n_steps, decr_every_n_nan_or_inf,
incr_ratio, decr_ratio):
_incr_every_n_steps = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=incr_every_n_steps)
_decr_every_n_nan_or_inf = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
_num_good_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_good_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
_num_bad_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_bad_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
grads = [fluid.layers.reduce_sum(g) for [_, g] in master_params_grads]
all_grads = fluid.layers.concat(grads)
all_grads_sum = fluid.layers.reduce_sum(all_grads)
is_overall_finite = fluid.layers.isfinite(all_grads_sum)
update_loss_scaling(is_overall_finite, loss_scaling, _num_good_steps,
_num_bad_steps, _incr_every_n_steps,
_decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
# apply_gradient append all ops in global block, thus we shouldn't
# apply gradient in the switch branch.
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
pass
with switch.default():
for _, g in master_params_grads:
fluid.layers.assign(fluid.layers.zeros_like(g), g)
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
for p, g in params_grads:
with main_prog._optimized_guard([p, g]):
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
append_cast_op(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
scaled_g = g / loss_scaling
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
master_grad = master_grad / loss_scaling
master_params_grads.append([master_param, master_grad])
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
append_cast_op(m_p_g[0], train_p, main_prog)
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
num_bad_steps, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
"""
    Update loss scaling according to the overall gradients. If all gradients
    are finite for incr_every_n_steps consecutive steps, loss scaling increases
    by incr_ratio. Otherwise, loss scaling decreases by decr_ratio after
    decr_every_n_nan_or_inf accumulated steps in which some gradients are infinite.
Args:
is_overall_finite (Variable): A boolean variable indicates whether
all gradients are finite.
prev_loss_scaling (Variable): Previous loss scaling.
num_good_steps (Variable): A variable accumulates good steps in which
all gradients are finite.
num_bad_steps (Variable): A variable accumulates bad steps in which
some gradients are infinite.
incr_every_n_steps (Variable): A variable represents increasing loss
scaling every n consecutive steps with
finite gradients.
decr_every_n_nan_or_inf (Variable): A variable represents decreasing
loss scaling every n accumulated
steps with nan or inf gradients.
incr_ratio(float): The multiplier to use when increasing the loss
scaling.
decr_ratio(float): The less-than-one-multiplier to use when decreasing
loss scaling.
"""
zero_steps = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
should_incr_loss_scaling = fluid.layers.less_than(
incr_every_n_steps, num_good_steps + 1)
with fluid.layers.Switch() as switch1:
with switch1.case(should_incr_loss_scaling):
new_loss_scaling = prev_loss_scaling * incr_ratio
loss_scaling_is_finite = fluid.layers.isfinite(
new_loss_scaling)
with fluid.layers.Switch() as switch2:
with switch2.case(loss_scaling_is_finite):
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
with switch2.default():
pass
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch1.default():
fluid.layers.increment(num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch.default():
should_decr_loss_scaling = fluid.layers.less_than(
decr_every_n_nan_or_inf, num_bad_steps + 1)
with fluid.layers.Switch() as switch3:
with switch3.case(should_decr_loss_scaling):
new_loss_scaling = prev_loss_scaling * decr_ratio
static_loss_scaling = \
fluid.layers.fill_constant(shape=[1],
dtype='float32',
value=1.0)
less_than_one = fluid.layers.less_than(new_loss_scaling,
static_loss_scaling)
with fluid.layers.Switch() as switch4:
with switch4.case(less_than_one):
fluid.layers.assign(static_loss_scaling,
prev_loss_scaling)
with switch4.default():
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch3.default():
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.increment(num_bad_steps)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
log = logging.getLogger("logger")
def cast_fp32_to_fp16(exe, main_program):
log.info("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.startswith("encoder_layer") \
and "layer_norm" not in param.name:
param_t.set(np.float16(data).view(np.uint16), exe.place)
#load fp32
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
def existed_persitables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persitables)
log.info("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
log.info("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import argparse
import time
import logging
import numpy as np
import paddle.fluid as fluid
import pgl
from pgl.contrib.ogb.linkproppred.dataset_pgl import PglLinkPropPredDataset
from pgl.utils import paddle_helper
from ogb.linkproppred import Evaluator
def send_func(src_feat, dst_feat, edge_feat):
"""send_func"""
return src_feat["h"]
def recv_func(feat):
"""recv_func"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
class GNNModel(object):
"""GNNModel"""
def __init__(self, name, num_nodes, emb_dim, num_layers):
self.num_nodes = num_nodes
self.emb_dim = emb_dim
self.num_layers = num_layers
self.name = name
self.src_nodes = fluid.layers.data(
name='src_nodes',
shape=[None],
dtype='int64', )
self.dst_nodes = fluid.layers.data(
name='dst_nodes',
shape=[None],
dtype='int64', )
self.edge_label = fluid.layers.data(
name='edge_label',
shape=[None, 1],
dtype='float32', )
def forward(self, graph):
"""forward"""
h = fluid.layers.create_parameter(
shape=[self.num_nodes, self.emb_dim],
dtype="float32",
name=self.name + "_embedding")
for layer in range(self.num_layers):
msg = graph.send(
send_func,
nfeat_list=[("h", h)], )
h = graph.recv(msg, recv_func)
h = fluid.layers.fc(
h,
size=self.emb_dim,
bias_attr=False,
param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
h = h * graph.node_feat["norm"]
bias = fluid.layers.create_parameter(
shape=[self.emb_dim],
dtype='float32',
is_bias=True,
name=self.name + '_bias_%s' % layer)
h = fluid.layers.elementwise_add(h, bias, act="relu")
src = fluid.layers.gather(h, self.src_nodes, overwrite=False)
dst = fluid.layers.gather(h, self.dst_nodes, overwrite=False)
edge_embed = src * dst
pred = fluid.layers.fc(input=edge_embed,
size=1,
name=self.name + "_pred_output")
prob = fluid.layers.sigmoid(pred)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred,
self.edge_label)
loss = fluid.layers.reduce_sum(loss)
return pred, prob, loss
def main():
"""main
"""
# Training settings
parser = argparse.ArgumentParser(description='Graph Dataset')
parser.add_argument(
'--epochs',
type=int,
default=4,
        help='number of epochs to train (default: %(default)s)')
parser.add_argument(
'--dataset',
type=str,
default="ogbl-ppa",
help='dataset name (default: protein protein associations)')
parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--batch_size', type=int, default=5120)
parser.add_argument('--embed_dim', type=int, default=64)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--lr', type=float, default=0.001)
args = parser.parse_args()
print(args)
place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
### automatic dataloading and splitting
print("loadding dataset")
dataset = PglLinkPropPredDataset(name=args.dataset)
splitted_edge = dataset.get_edge_split()
print(splitted_edge['train_edge'].shape)
print(splitted_edge['train_edge_label'].shape)
print("building evaluator")
### automatic evaluator. takes dataset name as input
evaluator = Evaluator(args.dataset)
graph_data = dataset[0]
print("num_nodes: %d" % graph_data.num_nodes)
train_program = fluid.Program()
startup_program = fluid.Program()
# degree normalize
indegree = graph_data.indegree()
norm = np.zeros_like(indegree, dtype="float32")
norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
# graph_data.node_feat["index"] = np.array([i for i in range(graph_data.num_nodes)], dtype=np.int64).reshape(-1,1)
with fluid.program_guard(train_program, startup_program):
model = GNNModel(
name="gnn",
num_nodes=graph_data.num_nodes,
emb_dim=args.embed_dim,
num_layers=args.num_layers)
gw = pgl.graph_wrapper.GraphWrapper(
"graph",
node_feat=graph_data.node_feat_info(),
edge_feat=graph_data.edge_feat_info())
pred, prob, loss = model.forward(gw)
val_program = train_program.clone(for_test=True)
with fluid.program_guard(train_program, startup_program):
global_steps = int(splitted_edge['train_edge'].shape[0] /
args.batch_size * 2)
learning_rate = fluid.layers.polynomial_decay(args.lr, global_steps,
0.00005)
adam = fluid.optimizer.Adam(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0005))
adam.minimize(loss)
exe = fluid.Executor(place)
exe.run(startup_program)
feed = gw.to_feed(graph_data)
print("evaluate result before training: ")
result = test(exe, val_program, prob, evaluator, feed, splitted_edge)
print(result)
print("training")
cc = 0
for epoch in range(1, args.epochs + 1):
for batch_data, batch_label in data_generator(
graph_data,
splitted_edge["train_edge"],
splitted_edge["train_edge_label"],
batch_size=args.batch_size):
feed['src_nodes'] = batch_data[:, 0].reshape(-1, 1)
feed['dst_nodes'] = batch_data[:, 1].reshape(-1, 1)
feed['edge_label'] = batch_label.astype("float32")
res_loss, y_pred, b_lr = exe.run(
train_program,
feed=feed,
fetch_list=[loss, prob, learning_rate])
if cc % 1 == 0:
print("epoch %d | step %d | lr %s | Loss %s" %
(epoch, cc, b_lr[0], res_loss[0]))
cc += 1
if cc % 20 == 0:
print("Evaluating...")
result = test(exe, val_program, prob, evaluator, feed,
splitted_edge)
print("epoch %d | step %d" % (epoch, cc))
print(result)
def test(exe, val_program, prob, evaluator, feed, splitted_edge):
"""Evaluation"""
result = {}
feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_pred_pos":
y_pred[splitted_edge["valid_edge_label"] == 1].reshape(-1, ),
"y_pred_neg":
y_pred[splitted_edge["valid_edge_label"] == 0].reshape(-1, )
}
result["valid"] = evaluator.eval(input_dict)
feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
feed['edge_label'] = splitted_edge["test_edge_label"].astype(
"float32").reshape(-1, 1)
y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
input_dict = {
"y_pred_pos":
y_pred[splitted_edge["test_edge_label"] == 1].reshape(-1, ),
"y_pred_neg":
y_pred[splitted_edge["test_edge_label"] == 0].reshape(-1, )
}
result["test"] = evaluator.eval(input_dict)
return result
def data_generator(graph, data, label_data, batch_size, shuffle=True):
"""Data Generator"""
perm = np.arange(0, len(data))
if shuffle:
np.random.shuffle(perm)
offset = 0
while offset < len(perm):
batch_index = perm[offset:(offset + batch_size)]
offset += batch_size
pos_data = data[batch_index]
pos_label = label_data[batch_index]
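        # negative sampling: keep each positive source node and draw a random
        # destination from the nodes appearing in this batch; sampled pairs
        # that are real edges in the graph are filtered out below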
neg_src_node = pos_data[:, 0]
neg_dst_node = np.random.choice(
pos_data.reshape(-1, ), size=len(neg_src_node))
neg_data = np.hstack(
[neg_src_node.reshape(-1, 1), neg_dst_node.reshape(-1, 1)])
exists = graph.has_edges_between(neg_src_node, neg_dst_node)
neg_data = neg_data[np.invert(exists)]
neg_label = np.zeros(shape=len(neg_data), dtype=np.int64)
batch_data = np.vstack([pos_data, neg_data])
label = np.vstack([pos_label.reshape(-1, 1), neg_label.reshape(-1, 1)])
yield batch_data, label
if __name__ == "__main__":
main()
# Graph Link Prediction for Open Graph Benchmark (OGB) PPA dataset
[The Open Graph Benchmark (OGB)](https://ogb.stanford.edu/) is a collection of benchmark datasets, data loaders, and evaluators for graph machine learning. Here we tackle the graph link prediction task based on PGL.
### Requirements
- paddlepaddle >= 1.7.1
- pgl >= 1.0.2
- ogb
### How to Run
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --use_cuda 1 --num_workers 4 --output_path ./output/model_1 --batch_size 65536 --epoch 1000 --learning_rate 0.005 --hidden_size 256
```
The best record will be saved in ./output/model_1/best.txt.
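The `hits@K` numbers recorded in `best.txt` come from the official OGB evaluator. Below is a minimal sketch of how they are computed (assuming the `ogb` package is installed; the random scores are placeholders, not real model outputs):
```
import numpy as np
from ogb.linkproppred import Evaluator

evaluator = Evaluator(name="ogbl-ppa")
evaluator.K = 100  # hits@100 is the metric used for model selection

# placeholder scores for positive and negative validation edges
result = evaluator.eval({
    "y_pred_pos": np.random.rand(1000),
    "y_pred_neg": np.random.rand(1000),
})
print(result["hits@100"])
```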
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""finetune args"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import time
import argparse
from utils.args import ArgumentGroup
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("num_workers", int, 1, "use multiprocess to generate graph")
run_type_g.add_arg("output_path", str, None, "path to save model")
run_type_g.add_arg("hidden_size", int, 128, "model hidden-size")
run_type_g.add_arg("batch_size", int, 128, "batch_size")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base DataLoader
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import six
from io import open
from collections import namedtuple
import numpy as np
import tqdm
import paddle
from pgl.utils import mp_reader
import collections
import time
import pgl
if six.PY3:
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
def batch_iter(data, perm, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
class BaseDataGenerator(object):
"""Base Data Geneartor"""
def __init__(self, buf_size, batch_size, num_workers, shuffle=True):
self.num_workers = num_workers
self.batch_size = batch_size
self.line_examples = []
self.buf_size = buf_size
self.shuffle = shuffle
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
raise NotImplementedError("No defined Batch Fn")
def batch_iter(self, fid, perm):
""" batch iterator"""
if self.shuffle:
for batch in batch_iter(self, perm, self.batch_size, fid,
self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid,
self.num_workers):
yield batch
def __len__(self):
return len(self.line_examples)
    def __getitem__(self, idx):
        # collections.Iterable was removed in Python 3.10; fall back to the
        # top-level name only on Python 2
        iterable = getattr(collections, "abc", collections).Iterable
        if isinstance(idx, iterable):
            return [self[bidx] for bidx in idx]
        else:
            return self.line_examples[idx]
def generator(self):
"""batch dict generator"""
def worker(filter_id, perm):
""" multiprocess worker"""
def func_run():
""" func_run """
pid = os.getpid()
np.random.seed(pid + int(time.time()))
for batch_examples in self.batch_iter(filter_id, perm):
batch_dict = self.batch_fn(batch_examples)
yield batch_dict
return func_run
# consume a seed
np.random.rand()
if self.shuffle:
perm = np.arange(0, len(self))
np.random.shuffle(perm)
else:
perm = None
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0, perm), self.buf_size)
else:
worker_pool = [
worker(wid, perm) for wid in range(self.num_workers)
]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
for line_example in self.line_examples:
yield line_example
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from dataloader.base_dataloader import BaseDataGenerator
import ssl
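# workaround: disable HTTPS certificate verification so the OGB dataset
# download succeeds in environments without a proper CA bundle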
ssl._create_default_https_context = ssl._create_unverified_context
from ogb.linkproppred import LinkPropPredDataset
from ogb.linkproppred import Evaluator
import tqdm
from collections import namedtuple
import pgl
import numpy as np
class PPADataGenerator(BaseDataGenerator):
def __init__(self,
graph_wrapper=None,
buf_size=1000,
batch_size=128,
num_workers=1,
shuffle=True,
phase="train"):
super(PPADataGenerator, self).__init__(
buf_size=buf_size,
num_workers=num_workers,
batch_size=batch_size,
shuffle=shuffle)
self.d_name = "ogbl-ppa"
self.graph_wrapper = graph_wrapper
dataset = LinkPropPredDataset(name=self.d_name)
splitted_edge = dataset.get_edge_split()
self.phase = phase
graph = dataset[0]
edges = graph["edge_index"].T
#self.graph = pgl.graph.Graph(num_nodes=graph["num_nodes"],
# edges=edges,
# node_feat={"nfeat": graph["node_feat"],
# "node_id": np.arange(0, graph["num_nodes"], dtype="int64").reshape(-1, 1) })
#self.graph.indegree()
self.num_nodes = graph["num_nodes"]
if self.phase == 'train':
edges = splitted_edge["train"]["edge"]
labels = np.ones(len(edges))
elif self.phase == "valid":
            # Build labeled positive and sampled negative edges for evaluation
pos_edges = splitted_edge["valid"]["edge"]
neg_edges = splitted_edge["valid"]["edge_neg"]
pos_labels = np.ones(len(pos_edges))
neg_labels = np.zeros(len(neg_edges))
edges = np.vstack([pos_edges, neg_edges])
labels = pos_labels.tolist() + neg_labels.tolist()
elif self.phase == "test":
            # Build labeled positive and sampled negative edges for evaluation
pos_edges = splitted_edge["test"]["edge"]
neg_edges = splitted_edge["test"]["edge_neg"]
pos_labels = np.ones(len(pos_edges))
neg_labels = np.zeros(len(neg_edges))
edges = np.vstack([pos_edges, neg_edges])
labels = pos_labels.tolist() + neg_labels.tolist()
self.line_examples = []
Example = namedtuple('Example', ['src', "dst", "label"])
for edge, label in zip(edges, labels):
self.line_examples.append(
Example(
src=edge[0], dst=edge[1], label=label))
print("Phase", self.phase)
print("Len Examples", len(self.line_examples))
def batch_fn(self, batch_ex):
batch_src = []
batch_dst = []
join_graph = []
cc = 0
batch_node_id = []
batch_labels = []
for ex in batch_ex:
batch_src.append(ex.src)
batch_dst.append(ex.dst)
batch_labels.append(ex.label)
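        # for the train phase, append an equal number of uniformly sampled
        # (src, dst) node pairs labeled 0 as in-batch negatives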
if self.phase == "train":
for num in range(1):
rand_src = np.random.randint(
low=0, high=self.num_nodes, size=len(batch_ex))
rand_dst = np.random.randint(
low=0, high=self.num_nodes, size=len(batch_ex))
batch_src = batch_src + rand_src.tolist()
batch_dst = batch_dst + rand_dst.tolist()
batch_labels = batch_labels + np.zeros_like(
rand_src, dtype="int64").tolist()
feed_dict = {}
feed_dict["batch_src"] = np.array(batch_src, dtype="int64")
feed_dict["batch_dst"] = np.array(batch_dst, dtype="int64")
feed_dict["labels"] = np.array(batch_labels, dtype="int64")
return feed_dict
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""lbs_model"""
import os
import re
import time
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from pgl.graph_wrapper import GraphWrapper
from pgl.layers.conv import gcn, gat
class BaseGraph(object):
"""Base Graph Model"""
def __init__(self, args):
node_feature = [('nfeat', [None, 58], "float32"),
('node_id', [None, 1], "int64")]
self.hidden_size = args.hidden_size
self.num_nodes = args.num_nodes
self.graph_wrapper = None # GraphWrapper(
#name="graph", place=F.CPUPlace(), node_feat=node_feature)
self.build_model(args)
def build_model(self, args):
""" build graph model"""
self.batch_src = L.data(name="batch_src", shape=[-1], dtype="int64")
self.batch_src = L.reshape(self.batch_src, [-1, 1])
self.batch_dst = L.data(name="batch_dst", shape=[-1], dtype="int64")
self.batch_dst = L.reshape(self.batch_dst, [-1, 1])
self.labels = L.data(name="labels", shape=[-1], dtype="int64")
self.labels = L.reshape(self.labels, [-1, 1])
        self.labels.stop_gradient = True
self.src_repr = L.embedding(
self.batch_src,
size=(self.num_nodes, self.hidden_size),
param_attr=F.ParamAttr(
name="node_embeddings",
initializer=F.initializer.NormalInitializer(
loc=0.0, scale=1.0)))
self.dst_repr = L.embedding(
self.batch_dst,
size=(self.num_nodes, self.hidden_size),
param_attr=F.ParamAttr(
name="node_embeddings",
initializer=F.initializer.NormalInitializer(
loc=0.0, scale=1.0)))
self.link_predictor(self.src_repr, self.dst_repr)
self.bce_loss()
def link_predictor(self, x, y):
""" siamese network"""
feat = x * y
feat = L.fc(feat, size=self.hidden_size, name="link_predictor_1")
feat = L.relu(feat)
feat = L.fc(feat, size=self.hidden_size, name="link_predictor_2")
feat = L.relu(feat)
self.logits = L.fc(feat,
size=1,
act="sigmoid",
name="link_predictor_logits")
def bce_loss(self):
"""listwise model"""
mask = L.cast(self.labels > 0.5, dtype="float32")
mask.stop_gradients = True
self.loss = L.log_loss(self.logits, mask, epsilon=1e-15)
self.loss = L.reduce_mean(self.loss) * 2
proba = L.sigmoid(self.logits)
proba = L.concat([proba * -1 + 1, proba], axis=1)
auc_out, batch_auc_out, _ = \
L.auc(input=proba, label=self.labels, curve='ROC', slide_steps=1)
self.metrics = {
"loss": self.loss,
"auc": batch_auc_out,
}
def neighbor_aggregator(self, node_repr):
"""neighbor aggregation"""
return node_repr
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""init"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""train and evaluate"""
import tqdm
import json
import numpy as np
import sys
import os
import paddle.fluid as F
from tensorboardX import SummaryWriter
from ogb.linkproppred import Evaluator
from ogb.linkproppred import LinkPropPredDataset
def multi_device(reader, dev_count):
    """Group feed dicts so that each device receives one per step.

    Note: when dev_count > 1, a trailing group smaller than dev_count
    is dropped.
    """
    if dev_count == 1:
        for batch in reader:
            yield batch
    else:
        batches = []
        for batch in reader:
            batches.append(batch)
            if len(batches) == dev_count:
                yield batches
                batches = []
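# A minimal usage sketch (hedged; the demo below is illustrative only and is
# never called by the training pipeline):
def _multi_device_demo():
    """With dev_count=2, feed dicts are yielded in pairs, one per device;
    the trailing incomplete pair ({'x': 5}) is dropped."""
    fake_reader = iter([{"x": 1}, {"x": 2}, {"x": 3}, {"x": 4}, {"x": 5}])
    for feed in multi_device(fake_reader, dev_count=2):
        print(feed)  # [{'x': 1}, {'x': 2}], then [{'x': 3}, {'x': 4}]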
class OgbEvaluator(object):
def __init__(self):
d_name = "ogbl-ppa"
dataset = LinkPropPredDataset(name=d_name)
splitted_edge = dataset.get_edge_split()
graph = dataset[0]
self.num_nodes = graph["num_nodes"]
self.ogb_evaluator = Evaluator(name="ogbl-ppa")
def eval(self, scores, labels, phase):
labels = np.reshape(labels, [-1])
ret = {}
pos = scores[labels > 0.5].squeeze(-1)
neg = scores[labels < 0.5].squeeze(-1)
for K in [10, 50, 100]:
self.ogb_evaluator.K = K
ret['%s_hits@%s' % (phase, K)] = self.ogb_evaluator.eval({
'y_pred_pos': pos,
'y_pred_neg': neg,
})[f'hits@{K}']
return ret
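# A minimal usage sketch (hedged; constructing OgbEvaluator downloads the
# ogbl-ppa dataset on first use, and the scores below are made up):
def _evaluator_demo():
    evaluator = OgbEvaluator()
    # 8 fake positive scores above 1.0 and 200 fake negative scores below 1.0,
    # so every hits@K should come out as 1.0.
    scores = np.vstack([np.random.rand(8, 1) + 1.0, np.random.rand(200, 1)])
    labels = np.vstack([np.ones((8, 1)), np.zeros((200, 1))])
    print(evaluator.eval(scores, labels, phase="valid"))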
def evaluate(model, valid_exe, valid_ds, valid_prog, dev_count, evaluator,
phase):
"""evaluate """
cc = 0
scores = []
labels = []
for feed_dict in tqdm.tqdm(
multi_device(valid_ds.generator(), dev_count), desc='evaluating'):
if dev_count > 1:
output = valid_exe.run(feed=feed_dict,
fetch_list=[model.logits, model.labels])
else:
output = valid_exe.run(valid_prog,
feed=feed_dict,
fetch_list=[model.logits, model.labels])
scores.append(output[0])
labels.append(output[1])
scores = np.vstack(scores)
labels = np.vstack(labels)
ret = evaluator.eval(scores, labels, phase)
return ret
def _create_if_not_exist(path):
basedir = os.path.dirname(path)
if not os.path.exists(basedir):
os.makedirs(basedir)
def train_and_evaluate(exe,
train_exe,
valid_exe,
train_ds,
valid_ds,
test_ds,
train_prog,
valid_prog,
model,
metric,
epoch=20,
dev_count=1,
train_log_step=5,
eval_step=10000,
evaluator=None,
output_path=None):
"""train and evaluate"""
global_step = 0
log_path = os.path.join(output_path, "log")
_create_if_not_exist(log_path)
writer = SummaryWriter(log_path)
best_model = 0
for e in range(epoch):
for feed_dict in tqdm.tqdm(
multi_device(train_ds.generator(), dev_count),
desc='Epoch %s' % e):
if dev_count > 1:
ret = train_exe.run(feed=feed_dict, fetch_list=metric.vars)
ret = [[np.mean(v)] for v in ret]
else:
ret = train_exe.run(train_prog,
feed=feed_dict,
fetch_list=metric.vars)
ret = metric.parse(ret)
if global_step % train_log_step == 0:
for key, value in ret.items():
writer.add_scalar(
'train_' + key, value, global_step=global_step)
global_step += 1
if global_step % eval_step == 0:
eval_ret = evaluate(model, exe, valid_ds, valid_prog, 1,
evaluator, "valid")
test_eval_ret = evaluate(model, exe, test_ds, valid_prog, 1,
evaluator, "test")
eval_ret.update(test_eval_ret)
sys.stderr.write(json.dumps(eval_ret, indent=4) + "\n")
for key, value in eval_ret.items():
writer.add_scalar(key, value, global_step=global_step)
if eval_ret["valid_hits@100"] > best_model:
F.io.save_persistables(
exe,
os.path.join(output_path, "checkpoint"), train_prog)
eval_ret["step"] = global_step
with open(os.path.join(output_path, "best.txt"), "w") as f:
f.write(json.dumps(eval_ret, indent=2) + '\n')
best_model = eval_ret["valid_hits@100"]
# Epoch End
eval_ret = evaluate(model, exe, valid_ds, valid_prog, 1, evaluator,
"valid")
test_eval_ret = evaluate(model, exe, test_ds, valid_prog, 1, evaluator,
"test")
eval_ret.update(test_eval_ret)
sys.stderr.write(json.dumps(eval_ret, indent=4) + "\n")
for key, value in eval_ret.items():
writer.add_scalar(key, value, global_step=global_step)
if eval_ret["valid_hits@100"] > best_model:
F.io.save_persistables(exe,
os.path.join(output_path, "checkpoint"),
train_prog)
eval_ret["step"] = global_step
with open(os.path.join(output_path, "best.txt"), "w") as f:
f.write(json.dumps(eval_ret, indent=2) + '\n')
best_model = eval_ret["valid_hits@100"]
writer.close()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""listwise model
"""
import os
import logging

# Relax SSL certificate checking so that downloading the ogb dataset works
# in environments with broken certificate chains.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import numpy as np
import multiprocessing
import pgl
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from args import parser
from utils.args import print_arguments, check_cuda
from utils.init import init_checkpoint, init_pretraining_params
from model import BaseGraph
from dataloader.ogbl_ppa_dataloader import PPADataGenerator
from monitor.train_monitor import train_and_evaluate, OgbEvaluator
log = logging.getLogger(__name__)
class Metric(object):
    """Helper that maps fetched metric values back to their names."""

    def __init__(self, **args):
        self.args = args

    @property
    def vars(self):
        """Return the list of metric variables to fetch."""
        return list(self.args.values())

    def parse(self, fetch_list):
        """Parse fetched values into a {name: value} dict."""
        tup = list(zip(self.args.keys(), [float(v[0]) for v in fetch_list]))
        return dict(tup)
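# Usage sketch (hedged): fetch the metric variables each step, then map the
# fetched values back to their names.
#
#     metric = Metric(**graph_model.metrics)
#     ret = exe.run(train_prog, feed=feed_dict, fetch_list=metric.vars)
#     print(metric.parse(ret))  # e.g. {'loss': 0.69, 'auc': 0.5}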
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
evaluator = OgbEvaluator()
train_prog = F.Program()
startup_prog = F.Program()
args.num_nodes = evaluator.num_nodes
if args.use_cuda:
dev_list = F.cuda_places()
place = dev_list[0]
dev_count = len(dev_list)
else:
place = F.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
with F.program_guard(train_prog, startup_prog):
with F.unique_name.guard():
graph_model = BaseGraph(args)
test_prog = train_prog.clone(for_test=True)
opt = F.optimizer.Adam(learning_rate=args.learning_rate)
opt.minimize(graph_model.loss)
#test_prog = F.Program()
#with F.program_guard(test_prog, startup_prog):
# with F.unique_name.guard():
# _graph_model = BaseGraph(args)
train_ds = PPADataGenerator(
phase="train",
graph_wrapper=graph_model.graph_wrapper,
num_workers=args.num_workers,
batch_size=args.batch_size)
valid_ds = PPADataGenerator(
phase="valid",
graph_wrapper=graph_model.graph_wrapper,
num_workers=args.num_workers,
batch_size=args.batch_size)
test_ds = PPADataGenerator(
phase="test",
graph_wrapper=graph_model.graph_wrapper,
num_workers=args.num_workers,
batch_size=args.batch_size)
exe = F.Executor(place)
exe.run(startup_prog)
if args.init_pretraining_params is not None:
init_pretraining_params(
exe, args.init_pretraining_params, main_program=startup_prog)
metric = Metric(**graph_model.metrics)
nccl2_num_trainers = 1
nccl2_trainer_id = 0
if dev_count > 1:
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = dev_count
train_exe = F.ParallelExecutor(
use_cuda=args.use_cuda,
loss_name=graph_model.loss.name,
exec_strategy=exec_strategy,
main_program=train_prog,
num_trainers=nccl2_num_trainers,
trainer_id=nccl2_trainer_id)
test_exe = exe
else:
train_exe, test_exe = exe, exe
train_and_evaluate(
exe=exe,
train_exe=train_exe,
valid_exe=test_exe,
train_ds=train_ds,
valid_ds=valid_ds,
test_ds=test_ds,
train_prog=train_prog,
valid_prog=test_prog,
train_log_step=5,
output_path=args.output_path,
dev_count=dev_count,
model=graph_model,
epoch=args.epoch,
eval_step=1000000,
evaluator=evaluator,
metric=metric)
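# Launch sketch (hedged; the flag names come from the local args module, which
# is not shown here, and the values below are illustrative):
#
#     CUDA_VISIBLE_DEVICES=0 python main.py --use_cuda true \
#         --hidden_size 256 --batch_size 65536 --num_workers 4 \
#         --epoch 40 --learning_rate 0.001 --output_path ./outputs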
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils"""
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import six
import os
import sys
import argparse
import logging
import paddle.fluid as fluid
log = logging.getLogger(__name__)
def prepare_logger(logger, debug=False, save_to_file=None):
"""doc"""
formatter = logging.Formatter(
fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
)
#console_hdl = logging.StreamHandler()
#console_hdl.setFormatter(formatter)
#logger.addHandler(console_hdl)
if save_to_file is not None and not os.path.exists(save_to_file):
file_hdl = logging.FileHandler(save_to_file)
file_hdl.setFormatter(formatter)
logger.addHandler(file_hdl)
logger.setLevel(logging.DEBUG)
logger.propagate = False
def str2bool(v):
    """Parse a command line string into a boolean.

    argparse does not convert strings such as "True"/"False" to booleans
    directly, so treat "true", "t", "1" (case-insensitive) as True.
    """
    return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
"""doc"""
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self,
name,
type,
default,
help,
positional_arg=False,
**kwargs):
"""doc"""
prefix = "" if positional_arg else "--"
type = str2bool if type == bool else type
self._group.add_argument(
prefix + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
"""doc"""
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
def check_cuda(use_cuda, err= \
    "\nYou cannot set use_cuda=True because you are using the CPU version of PaddlePaddle.\n \
    Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda=False to run models on CPU.\n"
              ):
    """Exit early when use_cuda is requested but CUDA support is unavailable."""
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            log.error(err)
            sys.exit(1)
    except Exception:
        # Older paddle versions may not expose is_compiled_with_cuda().
        pass
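# Usage sketch:
#
#     args = parser.parse_args()
#     check_cuda(args.use_cuda)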
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""cards"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
def get_cards():
"""
get gpu cards number
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
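# Usage sketch (hedged; setting CUDA_VISIBLE_DEVICES here is for illustration
# only, and the demo is never called):
def _get_cards_demo():
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    assert get_cards() == 2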
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def append_cast_op(i, o, prog):
"""
Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
Args:
i (Variable): The input Variable.
o (Variable): The output Variable.
prog (Program): The Program to append cast op.
"""
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={"in_dtype": i.dtype,
"out_dtype": o.dtype})
def copy_to_master_param(p, block):
    """Create an FP32 master copy of parameter `p` in the given block."""
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def apply_dynamic_loss_scaling(loss_scaling, master_params_grads,
                               incr_every_n_steps, decr_every_n_nan_or_inf,
                               incr_ratio, decr_ratio):
    """Check gradients for inf/nan, update the loss scaling accordingly, and
    zero out all gradients on overflow steps."""
_incr_every_n_steps = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=incr_every_n_steps)
_decr_every_n_nan_or_inf = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
_num_good_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_good_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
_num_bad_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_bad_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
grads = [fluid.layers.reduce_sum(g) for [_, g] in master_params_grads]
all_grads = fluid.layers.concat(grads)
all_grads_sum = fluid.layers.reduce_sum(all_grads)
is_overall_finite = fluid.layers.isfinite(all_grads_sum)
update_loss_scaling(is_overall_finite, loss_scaling, _num_good_steps,
_num_bad_steps, _incr_every_n_steps,
_decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
    # apply_gradient appends all ops to the global block, so gradients must
    # not be applied inside the switch branch.
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
pass
with switch.default():
for _, g in master_params_grads:
fluid.layers.assign(fluid.layers.zeros_like(g), g)
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
for p, g in params_grads:
with main_prog._optimized_guard([p, g]):
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
append_cast_op(startup_p, startup_master_param, startup_prog)
            # cast fp16 gradients to fp32 before applying them
if g.name.find("layer_norm") > -1:
scaled_g = g / loss_scaling
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
master_grad = master_grad / loss_scaling
master_params_grads.append([master_param, master_grad])
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
append_cast_op(m_p_g[0], train_p, main_prog)
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
num_bad_steps, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
"""
Update loss scaling according to overall gradients. If all gradients is
finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
Otherwisw, loss scaling will decrease by decr_ratio after
decr_every_n_nan_or_inf steps and each step some gradients are infinite.
Args:
is_overall_finite (Variable): A boolean variable indicates whether
all gradients are finite.
prev_loss_scaling (Variable): Previous loss scaling.
num_good_steps (Variable): A variable accumulates good steps in which
all gradients are finite.
num_bad_steps (Variable): A variable accumulates bad steps in which
some gradients are infinite.
incr_every_n_steps (Variable): A variable represents increasing loss
scaling every n consecutive steps with
finite gradients.
decr_every_n_nan_or_inf (Variable): A variable represents decreasing
loss scaling every n accumulated
steps with nan or inf gradients.
incr_ratio(float): The multiplier to use when increasing the loss
scaling.
decr_ratio(float): The less-than-one-multiplier to use when decreasing
loss scaling.
"""
zero_steps = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
should_incr_loss_scaling = fluid.layers.less_than(
incr_every_n_steps, num_good_steps + 1)
with fluid.layers.Switch() as switch1:
with switch1.case(should_incr_loss_scaling):
new_loss_scaling = prev_loss_scaling * incr_ratio
loss_scaling_is_finite = fluid.layers.isfinite(
new_loss_scaling)
with fluid.layers.Switch() as switch2:
with switch2.case(loss_scaling_is_finite):
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
with switch2.default():
pass
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch1.default():
fluid.layers.increment(num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch.default():
should_decr_loss_scaling = fluid.layers.less_than(
decr_every_n_nan_or_inf, num_bad_steps + 1)
with fluid.layers.Switch() as switch3:
with switch3.case(should_decr_loss_scaling):
new_loss_scaling = prev_loss_scaling * decr_ratio
static_loss_scaling = \
fluid.layers.fill_constant(shape=[1],
dtype='float32',
value=1.0)
less_than_one = fluid.layers.less_than(new_loss_scaling,
static_loss_scaling)
with fluid.layers.Switch() as switch4:
with switch4.case(less_than_one):
fluid.layers.assign(static_loss_scaling,
prev_loss_scaling)
with switch4.default():
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch3.default():
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.increment(num_bad_steps)
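# Wiring sketch for the helpers above (hedged; `loss`, `optimizer`,
# `train_prog` and `startup_prog` are assumed to exist, and the step counts
# below are illustrative):
#
#     scaled_loss = loss * loss_scaling
#     params_grads = optimizer.backward(scaled_loss)
#     master_params_grads = create_master_params_grads(
#         params_grads, train_prog, startup_prog, loss_scaling)
#     apply_dynamic_loss_scaling(
#         loss_scaling, master_params_grads,
#         incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
#         incr_ratio=2.0, decr_ratio=0.5)
#     optimizer.apply_gradients(master_params_grads)
#     master_param_to_train_param(master_params_grads, params_grads, train_prog)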
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""paddle init"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import six
import ast
import copy
import logging
import numpy as np
import paddle.fluid as fluid
log = logging.getLogger(__name__)
def cast_fp32_to_fp16(exe, main_program):
"""doc"""
log.info("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.startswith("encoder_layer") \
and "layer_norm" not in param.name:
param_t.set(np.float16(data).view(np.uint16), exe.place)
                # also load the fp32 weights into the master parameter copy
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
"""init"""
    assert os.path.exists(
        init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path

    def existed_persistables(var):
        """Only load variables that are persistable and exist on disk."""
        if not fluid.io.is_persistable(var):
            return False
        return os.path.exists(os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persistables)
log.info("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
"""init"""
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
"""doc"""
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
log.info("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
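# Usage sketch (hedged; `exe`, `startup_prog` and `train_prog` are assumed to
# be created by the training script):
#
#     exe.run(startup_prog)
#     init_pretraining_params(exe, "./pretrained_params",
#                             main_program=train_prog)
#     # or resume from a full checkpoint:
#     init_checkpoint(exe, "./checkpoints/step_1000", main_program=train_prog)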
@@ -40,7 +40,6 @@ def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, num_nodes,
          num_edges):
     """Recv message from given msg to dst nodes.
     """
-    empty_msg_flag = fluid.layers.cast(num_edges > 0, dtype="float32")
     if reduce_function == "sum":
         if isinstance(msg, dict):
             raise TypeError("The message for build-in function"
@@ -49,8 +48,9 @@ def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, num_nodes,
         try:
             out_dim = msg.shape[-1]
             init_output = fluid.layers.fill_constant(
-                shape=[num_nodes, out_dim], value=0, dtype="float32")
+                shape=[num_nodes, out_dim], value=0, dtype=msg.dtype)
             init_output.stop_gradient = False
+            empty_msg_flag = fluid.layers.cast(num_edges > 0, dtype=msg.dtype)
             msg = msg * empty_msg_flag
             output = paddle_helper.scatter_add(init_output, dst, msg)
             return output
@@ -66,10 +66,12 @@ def recv(dst, uniq_dst, bucketing_index, msg, reduce_function, num_nodes,
     bucketed_msg = op.nested_lod_reset(msg, bucketing_index)
     output = reduce_function(bucketed_msg)
     output_dim = output.shape[-1]
+    empty_msg_flag = fluid.layers.cast(num_edges > 0, dtype=output.dtype)
     output = output * empty_msg_flag
     init_output = fluid.layers.fill_constant(
-        shape=[num_nodes, output_dim], value=0, dtype="float32")
+        shape=[num_nodes, output_dim], value=0, dtype=output.dtype)
     init_output.stop_gradient = True
     final_output = fluid.layers.scatter(init_output, uniq_dst, output)
     return final_output
@@ -230,7 +230,7 @@ def gin(gw,
     epsilon.stop_gradient = True
     msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
-    output = gw.recv(msg, "sum") + (1.0 + epsilon) * feature
+    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)
     output = fluid.layers.fc(output,
                              size=hidden_size,
@@ -238,8 +238,18 @@ def gin(gw,
                              param_attr=fluid.ParamAttr(name="%s_w_0" % name),
                              bias_attr=fluid.ParamAttr(name="%s_b_0" % name))
-    output = fluid.layers.batch_norm(output)
-    output = getattr(fluid.layers, activation)(output)
+    output = fluid.layers.layer_norm(
+        output,
+        begin_norm_axis=1,
+        param_attr=fluid.ParamAttr(
+            name="norm_scale_%s" % (name),
+            initializer=fluid.initializer.Constant(1.0)),
+        bias_attr=fluid.ParamAttr(
+            name="norm_bias_%s" % (name),
+            initializer=fluid.initializer.Constant(0.0)), )
+
+    if activation is not None:
+        output = getattr(fluid.layers, activation)(output)
     output = fluid.layers.fc(output,
                              size=hidden_size,