Unverified · Commit b2019d1c · Authored by: Huang Zhengjie · Committed by: GitHub

Merge pull request #57 from WeiyueSu/erniesage

add ERNIESage example
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV1"
layer_type: "graphsage_sum"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV2"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
# Global Environment Settings
#
# trainer config ------
learner_type: "gpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
# data config ------
input_data: "./data.txt"
graph_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
# model config ------
samples: [10]
model_type: "ErnieSageModelV3"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
# infer config ------
infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
ernie_vocab_file: "./vocab.txt"
ernie_config:
attention_probs_dropout_prob: 0.1
hidden_act: "relu"
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
max_position_embeddings: 513
num_attention_heads: 12
num_hidden_layers: 12
sent_type_vocab_size: 4
task_type_vocab_size: 3
vocab_size: 18000
use_task_id: false
use_fp16: false
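The six YAML files above are variants of one schema: ErnieSageModelV1/V2/V3, each in a CPU and a GPU flavor. A minimal sketch of how such a file is loaded (mirroring infer.py further below; assumes PyYAML and easydict are installed):

import yaml
from easydict import EasyDict as edict

# Attribute-style access works for nested keys as well.
config = edict(yaml.load(open("./config.yaml"), Loader=yaml.FullLoader))
print(config.model_type, config.lr, config.ernie_config.hidden_size)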
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base DataLoader
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import six
from io import open
from collections import namedtuple
import numpy as np
import tqdm
import paddle
from pgl.utils import mp_reader
import collections
import time
from pgl.utils.logger import log
import traceback
if six.PY3:
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
def batch_iter(data, perm, batch_size, fid, num_workers):
"""node_batch_iter
"""
size = len(data)
start = 0
cc = 0
while start < size:
index = perm[start:start + batch_size]
start += batch_size
cc += 1
if cc % num_workers != fid:
continue
yield data[index]
def scan_batch_iter(data, batch_size, fid, num_workers):
"""node_batch_iter
"""
batch = []
cc = 0
for line_example in data.scan():
cc += 1
if cc % num_workers != fid:
continue
batch.append(line_example)
if len(batch) == batch_size:
yield batch
batch = []
if len(batch) > 0:
yield batch
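# Round-robin sharding sanity check (illustrative comment, not part of the
# original file): with num_workers=2, worker fid=1 keeps the batches where
# cc % 2 == 1, so the two workers together cover the permutation exactly once:
#
#   perm = np.arange(8)
#   list(batch_iter(np.arange(8), perm, batch_size=3, fid=1, num_workers=2))
#   # -> [array([0, 1, 2]), array([6, 7])]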
class BaseDataGenerator(object):
"""Base Data Geneartor"""
def __init__(self, buf_size, batch_size, num_workers, shuffle=True):
self.num_workers = num_workers
self.batch_size = batch_size
self.line_examples = []
self.buf_size = buf_size
self.shuffle = shuffle
def batch_fn(self, batch_examples):
""" batch_fn batch producer"""
        raise NotImplementedError("batch_fn is not defined")
def batch_iter(self, fid, perm):
""" batch iterator"""
if self.shuffle:
for batch in batch_iter(self, perm, self.batch_size, fid, self.num_workers):
yield batch
else:
for batch in scan_batch_iter(self, self.batch_size, fid, self.num_workers):
yield batch
def __len__(self):
return len(self.line_examples)
def __getitem__(self, idx):
if isinstance(idx, collections.Iterable):
return [self[bidx] for bidx in idx]
else:
return self.line_examples[idx]
def generator(self):
"""batch dict generator"""
def worker(filter_id, perm):
""" multiprocess worker"""
def func_run():
""" func_run """
pid = os.getpid()
np.random.seed(pid + int(time.time()))
for batch_examples in self.batch_iter(filter_id, perm):
try:
batch_dict = self.batch_fn(batch_examples)
except Exception as e:
traceback.print_exc()
log.info(traceback.format_exc())
log.info(str(e))
continue
if batch_dict is None:
continue
yield batch_dict
return func_run
# consume a seed
np.random.rand()
if self.shuffle:
perm = np.arange(0, len(self))
np.random.shuffle(perm)
else:
perm = None
if self.num_workers == 1:
r = paddle.reader.buffered(worker(0, perm), self.buf_size)
else:
worker_pool = [worker(wid, perm) for wid in range(self.num_workers)]
worker = mp_reader.multiprocess_reader(
worker_pool, use_pipe=True, queue_size=1000)
r = paddle.reader.buffered(worker, self.buf_size)
for batch in r():
yield batch
def scan(self):
for line_example in self.line_examples:
yield line_example
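# Minimal subclass sketch (hypothetical; the class and key names are
# illustrative, not part of the original repo). A concrete generator only
# needs to fill `line_examples` and implement `batch_fn`, which turns a list
# of raw examples into the feed dict consumed downstream:
class _PairDataGenerator(BaseDataGenerator):
    def __init__(self, pairs, **kwargs):
        super(_PairDataGenerator, self).__init__(**kwargs)
        self.line_examples = pairs  # e.g. [(src_id, dst_id), ...]

    def batch_fn(self, batch_examples):
        src = np.array([ex[0] for ex in batch_examples], dtype="int64")
        dst = np.array([ex[1] for ex in batch_examples], dtype="int64")
        return {"src": src, "dst": dst}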
"""Graph Dataset
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import pgl
import sys
import numpy as np
from pgl.utils.logger import log
from dataset.base_dataset import BaseDataGenerator
from pgl.sample import alias_sample
from pgl.sample import pinsage_sample
from pgl.sample import graphsage_sample
from pgl.sample import edge_hash
class GraphGenerator(BaseDataGenerator):
def __init__(self, graph_wrappers, data, batch_size, samples,
num_workers, feed_name_list, use_pyreader,
phase, graph_data_path, shuffle=True, buf_size=1000):
super(GraphGenerator, self).__init__(
buf_size=buf_size,
num_workers=num_workers,
batch_size=batch_size, shuffle=shuffle)
# For iteration
self.line_examples = data
self.graph_wrappers = graph_wrappers
self.samples = samples
self.feed_name_list = feed_name_list
self.use_pyreader = use_pyreader
self.phase = phase
self.load_graph(graph_data_path)
self.num_layers = len(graph_wrappers)
def load_graph(self, graph_data_path):
self.graph = pgl.graph.MemmapGraph(graph_data_path)
self.alias = np.load(os.path.join(graph_data_path, "alias.npy"), mmap_mode="r")
self.events = np.load(os.path.join(graph_data_path, "events.npy"), mmap_mode="r")
self.term_ids = np.load(os.path.join(graph_data_path, "term_ids.npy"), mmap_mode="r")
def batch_fn(self, batch_ex):
# batch_ex = [
# (src, dst, neg),
# (src, dst, neg),
# (src, dst, neg),
# ]
#
batch_src = []
batch_dst = []
batch_neg = []
for batch in batch_ex:
batch_src.append(batch[0])
batch_dst.append(batch[1])
if len(batch) == 3: # default neg samples
batch_neg.append(batch[2])
if len(batch_src) != self.batch_size:
if self.phase == "train":
return None #Skip
if len(batch_neg) > 0:
batch_neg = np.unique(np.concatenate(batch_neg))
batch_src = np.array(batch_src, dtype="int64")
batch_dst = np.array(batch_dst, dtype="int64")
sampled_batch_neg = alias_sample(batch_dst.shape, self.alias, self.events)
if len(batch_neg) > 0:
batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
else:
batch_neg = sampled_batch_neg
        # No edges are ignored during subgraph sampling in either phase.
        ignore_edges = set()
nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges)
feed_dict = {}
for i in range(self.num_layers):
feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))
# only reindex from first subgraph
sub_src_idx = subgraphs[0].reindex_from_parrent_nodes(batch_src)
sub_dst_idx = subgraphs[0].reindex_from_parrent_nodes(batch_dst)
sub_neg_idx = subgraphs[0].reindex_from_parrent_nodes(batch_neg)
feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
#feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]]
return feed_dict
def __call__(self):
return self.generator()
def generator(self):
try:
for feed_dict in super(GraphGenerator, self).generator():
if self.use_pyreader:
yield [feed_dict[name] for name in self.feed_name_list]
else:
yield feed_dict
except Exception as e:
log.exception(e)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import pickle
import time
import glob
import os
import io
import traceback
import pickle as pkl
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
import yaml
from easydict import EasyDict as edict
import pgl
from pgl.utils.logger import log
from pgl.utils import paddle_helper
import paddle
import paddle.fluid as F
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class PredictData(object):
def __init__(self, num_nodes):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
train_usr = np.arange(trainer_id, num_nodes, trainer_count)
#self.data = (train_usr, train_usr)
self.data = train_usr
def __getitem__(self, index):
return [self.data[index], self.data[index]]
def tostr(data_array):
return " ".join(["%.5lf" % d for d in data_array])
def run_predict(py_reader,
exe,
program,
model_dict,
log_per_step=1,
args=None):
if args.input_type == "text":
id2str = np.load(os.path.join(args.graph_path, "id2str.npy"), mmap_mode="r")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
if not os.path.exists(args.output_path):
os.mkdir(args.output_path)
fout = io.open("%s/part-%s" % (args.output_path, trainer_id), "w", encoding="utf8")
batch = 0
for batch_feed_dict in py_reader():
batch += 1
batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run(
program,
feed=batch_feed_dict,
fetch_list=model_dict.outputs)
if batch % log_per_step == 0:
log.info("Predict %s finished" % batch)
for ufs, _, sri in zip(batch_usr_feat, batch_ad_feat, batch_src_real_index):
if args.input_type == "text":
sri = id2str[int(sri)]
line = "{}\t{}\n".format(sri, tostr(ufs))
fout.write(line)
fout.close()
def _warmstart(exe, program, path='params'):
    def _existed_persistables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
exe,
path,
main_program=program,
        predicate=_existed_persistables
)
def main(config):
model = Model.factory(config)
if config.learner_type == "cpu":
place = F.CPUPlace()
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
else:
raise ValueError
exe = F.Executor(place)
val_program = F.default_main_program().clone(for_test=True)
exe.run(F.default_startup_program())
_warmstart(exe, F.default_startup_program(), path=config.infer_model)
num_threads = int(os.getenv("CPU_NUM", 1))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
build_strategy = F.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.memory_optimize = False
    build_strategy.remove_unnecessary_lock = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
val_compiled_prog = F.compiler.CompiledProgram(
val_program).with_data_parallel(
build_strategy=build_strategy,
exec_strategy=exec_strategy)
num_nodes = int(np.load(os.path.join(config.graph_path, "num_nodes.npy")))
predict_data = PredictData(num_nodes)
predict_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.infer_batch_size,
data=predict_data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="predict",
graph_data_path=config.graph_path,
shuffle=False)
if config.learner_type == "cpu":
model.data_loader.decorate_batch_generator(
predict_iter, places=F.cpu_places())
elif config.learner_type == "gpu":
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
model.data_loader.decorate_batch_generator(
predict_iter, places=place)
else:
raise ValueError
run_predict(model.data_loader,
program=val_compiled_prog,
exe=exe,
model_dict=model,
args=config)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
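# Standalone usage sketch (assumption; in this example infer.py is normally
# driven by the shell scripts below, after training has written ./output/last):
#
#   python ./infer.py --conf ./config.yaml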
#!/bin/bash
unset http_proxy https_proxy
set -x
mode=${1:-local}
config=${2:-"./config.yaml"}
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
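# For reference: parse_yaml above flattens the YAML config into shell
# assignments, joining nested keys with underscores, e.g.
#   learner_type="cpu"
#   ernie_config_hidden_size="768"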
eval $(parse_yaml $config)
export CPU_NUM=$CPU_NUM
export FLAGS_rpc_deadline=3000000
export FLAGS_rpc_retry_times=1000
if [[ $async_mode == "True" ]];then
echo "async_mode is True"
else
export FLAGS_communicator_send_queue_size=1
export FLAGS_communicator_min_send_grad_num_before_recv=0
export FLAGS_communicator_max_merge_var_num=1 # important!
export FLAGS_communicator_merge_sparse_grad=0
fi
export FLAGS_communicator_recv_wait_times=5000000
mkdir -p output
python ./train.py --conf $config
if [[ $TRAINING_ROLE == "TRAINER" ]];then
python ./infer.py --conf $config
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
role = os.getenv("TRAINING_ROLE", "TRAINER")
import numpy as np
from pgl.utils.logger import log
import paddle.fluid as F
import paddle.fluid.layers as L
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import StrategyFactory
from paddle.fluid.incubate.fleet.collective import DistributedStrategy
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.collective import fleet as cfleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as tfleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from tensorboardX import SummaryWriter
class Learner(object):
@classmethod
def factory(cls, name):
if name == "cpu":
return TranspilerLearner()
elif name == "gpu":
return CollectiveLearner()
else:
raise ValueError
def build(self, model, data_gen, config):
raise NotImplementedError
def warmstart(self, program, path='./checkpoints'):
def _existed_persitables(var):
#if not isinstance(var, fluid.framework.Parameter):
# return False
if not F.io.is_persistable(var):
return False
param_path = os.path.join(path, var.name)
log.info("Loading parameter: {} persistable: {} exists: {}".format(
param_path,
F.io.is_persistable(var),
os.path.exists(param_path),
))
return os.path.exists(param_path)
F.io.load_vars(
self.exe,
path,
main_program=program,
predicate=_existed_persitables
)
def start(self):
batch = 0
start = time.time()
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
if trainer_id == 0:
writer = SummaryWriter(os.path.join(self.config.output_path, "train_history"))
for epoch_idx in range(self.config.epoch):
for idx, batch_feed_dict in enumerate(self.model.data_loader()):
try:
cpu_time = time.time()
batch += 1
batch_loss = self.exe.run(
self.program,
feed=batch_feed_dict,
fetch_list=[self.model.loss])
end = time.time()
if trainer_id == 0:
writer.add_scalar("loss", np.mean(batch_loss), batch)
if batch % self.config.log_per_step == 0:
log.info(
"Epoch %s Batch %s %s-Loss %s \t Speed(per batch) %.5lf/%.5lf sec"
% (epoch_idx, batch, "train", np.mean(batch_loss), (end - start) /batch, (end - cpu_time)))
writer.flush()
if batch % self.config.save_per_step == 0:
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, str(batch)))
except Exception as e:
log.info("Pyreader train error")
log.exception(e)
log.info("epcoh %s done." % epoch_idx)
def stop(self):
self.fleet.save_persistables(self.exe, os.path.join(self.config.output_path, "last"))
class TranspilerLearner(Learner):
def __init__(self):
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
paddle_role = role_maker.Role.WORKER
place = F.CPUPlace()
if training_role == "PSERVER":
paddle_role = role_maker.Role.SERVER
# set the fleet runtime environment according to configure
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = eplist # ip:port,ip:port...
worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
role = role_maker.UserDefinedRoleMaker(
current_id=trainer_id,
role=paddle_role,
worker_num=worker_num,
server_endpoints=pserver_endpoints)
tfleet.init(role)
tfleet.save_on_pserver = True
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.init_and_run_ps_worker(config.ckpt_path)
        self.program = self.compile_program(model.loss)
self.fleet = tfleet
model.data_loader.decorate_batch_generator(
data_gen, places=F.cpu_places())
self.config = config
self.model = model
def optimize(self, loss, optimizer_type, lr):
strategy = DistributeTranspilerConfig()
strategy.sync_mode = False
log.info('learning rate:%f' % lr)
if optimizer_type == "sgd":
optimizer = F.optimizer.SGD(learning_rate=lr)
elif optimizer_type == "adam":
            # Don't slice the tensor, to ensure convergence
optimizer = F.optimizer.Adam(learning_rate=lr, lazy_mode=True)
else:
raise ValueError("Unknown Optimizer %s" % optimizer_type)
#create the DistributeTranspiler configure
optimizer = tfleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
def init_and_run_ps_worker(self, ckpt_path):
# init and run server or worker
self.exe = F.Executor(F.CPUPlace())
if tfleet.is_server():
tfleet.init_server()
self.warmstart(tfleet.startup_program, path=ckpt_path)
tfleet.run_server()
exit()
if tfleet.is_worker():
log.info("start init worker done")
tfleet.init_worker()
self.exe.run(tfleet.startup_program)
    def compile_program(self, loss):
num_threads = int(os.getenv("CPU_NUM", 1))
exec_strategy = F.ExecutionStrategy()
exec_strategy.num_threads = num_threads
exec_strategy.use_thread_barrier = False
build_strategy = F.BuildStrategy()
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = False
        build_strategy.remove_unnecessary_lock = False
        build_strategy.async_mode = False
if num_threads > 1:
build_strategy.reduce_strategy = F.BuildStrategy.ReduceStrategy.Reduce
log.info("start build compile program...")
compiled_prog = F.compiler.CompiledProgram(tfleet.main_program
).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_prog
class CollectiveLearner(Learner):
def __init__(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
cfleet.init(role)
def optimize(self, loss, optimizer_type, lr):
optimizer = F.optimizer.Adam(learning_rate=lr)
dist_strategy = DistributedStrategy()
optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
_, param_grads = optimizer.minimize(loss, F.default_startup_program())
def build(self, model, data_gen, config):
self.optimize(model.loss, config.optimizer_type, config.lr)
self.program = cfleet.main_program
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
place = F.CUDAPlace(gpu_id)
self.exe = F.Executor(place)
self.exe.run(F.default_startup_program())
self.warmstart(F.default_startup_program(), config.ckpt_path)
self.fleet = cfleet
model.data_loader.decorate_batch_generator(
data_gen, places=place)
self.config = config
self.model = model
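# Driver sketch (assumption: train.py is not part of this excerpt; based on
# the Learner API above it is presumably wired up roughly like this):
#
#   learner = Learner.factory(config.learner_type)  # "cpu" or "gpu"
#   learner.build(model, train_iter, config)
#   learner.start()  # train config.epoch epochs, checkpointing periodically
#   learner.stop()   # save final parameters to ./output/last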
#!/bin/bash
set -x
config=${1:-"./config.yaml"}
unset http_proxy https_proxy
function parse_yaml {
local prefix=$2
local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
sed -ne "s|^\($s\):|\1|" \
-e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
-e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
awk -F$fs '{
indent = length($1)/2;
vname[indent] = $2;
for (i in vname) {if (i > indent) {delete vname[i]}}
if (length($3) > 0) {
vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
}
}'
}
transpiler_local_train(){
export PADDLE_TRAINERS_NUM=1
export PADDLE_PSERVERS_NUM=1
export PADDLE_PORT=6206
export PADDLE_PSERVERS="127.0.0.1"
export BASE="./local_dir"
echo `which python`
if [ -d ${BASE} ]; then
rm -rf ${BASE}
fi
mkdir ${BASE}
    rm -f job_id
for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
do
echo "start ps server: ${i}"
TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh local $config \
&> $BASE/pserver.$i.log &
echo $! >> job_id
done
sleep 3s
for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
do
echo "start ps work: ${j}"
TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \
echo $! >> job_id
done
}
collective_local_train(){
export PATH=./python27-gcc482-gpu/bin/:$PATH
echo `which python`
python -m paddle.distributed.launch train.py --conf $config
python -m paddle.distributed.launch infer.py --conf $config
}
eval $(parse_yaml $config)
unalias python 2>/dev/null || true
python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \
-l $max_seqlen --vocab_file $ernie_vocab_file
if [[ $learner_type == "cpu" ]];then
transpiler_local_train
fi
if [[ $learner_type == "gpu" ]];then
collective_local_train
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import glob
import os
import numpy as np
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models import message_passing
def get_layer(layer_type, gw, feature, hidden_size, act, initializer, learning_rate, name, is_test=False):
return getattr(message_passing, layer_type)(gw, feature, hidden_size, act, initializer, learning_rate, name)
class BaseGraphWrapperBuilder(object):
def __init__(self, config):
self.config = config
self.node_feature_info = []
self.edge_feature_info = []
def __call__(self):
place = F.CPUPlace()
graph_wrappers = []
for i in range(self.config.num_layers):
            # all graphs share the same node_feat_info
graph_wrappers.append(
pgl.graph_wrapper.GraphWrapper(
"layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
return graph_wrappers
class GraphsageGraphWrapperBuilder(BaseGraphWrapperBuilder):
def __init__(self, config):
super(GraphsageGraphWrapperBuilder, self).__init__(config)
self.node_feature_info.append(('index', [None], np.dtype('int64')))
class BaseGNNModel(object):
def __init__(self, config):
self.config = config
self.graph_wrapper_builder = self.gen_graph_wrapper_builder(config)
self.net_fn = self.gen_net_fn(config)
self.feed_list_builder = self.gen_feed_list_builder(config)
self.data_loader_builder = self.gen_data_loader_builder(config)
self.loss_fn = self.gen_loss_fn(config)
self.build()
def gen_graph_wrapper_builder(self, config):
return GraphsageGraphWrapperBuilder(config)
def gen_net_fn(self, config):
return BaseNet(config)
def gen_feed_list_builder(self, config):
return BaseFeedListBuilder(config)
def gen_data_loader_builder(self, config):
return BaseDataLoaderBuilder(config)
def gen_loss_fn(self, config):
return BaseLoss(config)
def build(self):
self.graph_wrappers = self.graph_wrapper_builder()
self.inputs, self.outputs = self.net_fn(self.graph_wrappers)
self.feed_list = self.feed_list_builder(self.inputs, self.graph_wrappers)
self.data_loader = self.data_loader_builder(self.feed_list)
self.loss = self.loss_fn(self.outputs)
class BaseFeedListBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, inputs, graph_wrappers):
feed_list = []
for i in range(len(graph_wrappers)):
feed_list.extend(graph_wrappers[i].holder_list)
feed_list.extend(inputs)
return feed_list
class BaseDataLoaderBuilder(object):
def __init__(self, config):
self.config = config
def __call__(self, feed_list):
data_loader = F.io.PyReader(
feed_list=feed_list, capacity=20, use_double_buffer=True, iterable=True)
return data_loader
class BaseNet(object):
def __init__(self, config):
self.config = config
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def build_inputs(self):
user_index = L.data(
"user_index", shape=[None], dtype="int64", append_batch_size=False)
item_index = L.data(
"item_index", shape=[None], dtype="int64", append_batch_size=False)
return [user_index, item_index]
def build_embedding(self, graph_wrappers, inputs=None):
num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
is_sparse = self.config.trainer_type == "Transpiler"
embed = L.embedding(
input=L.reshape(graph_wrappers[0].node_feat['index'], [-1, 1]),
size=[num_embed, self.config.hidden_size],
is_sparse=is_sparse,
param_attr=F.ParamAttr(name="node_embedding", initializer=F.initializer.Uniform(
low=-1. / self.config.hidden_size,
high=1. / self.config.hidden_size)))
return embed
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = get_layer(
self.config.layer_type,
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs)
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class BaseLoss(object):
def __init__(self, config):
self.config = config
def __call__(self, outputs):
user_feat, item_feat = outputs[0], outputs[1]
loss_type = self.config.loss_type
# Calc Loss
if self.config.loss_type == "hinge":
pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
neg = L.matmul(user_feat, item_feat, transpose_y=True) # [B, B]
loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
elif self.config.loss_type == "softmax":
pass
# TODO
# pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True) # [B, 1]
# neg = L.matmul(user_feat, neg_feat, transpose_y=True) # [B, B]
# logits = L.concat([pos, neg], -1) # [B, 1+B]
# labels = L.fill_constant_batch_size_like(logits, [-1, 1], "int64", 0)
# loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, labels))
else:
raise ValueError
return loss
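# Worked example of the in-batch hinge loss above (NumPy sketch, illustrative
# only), with margin = 0.3 as in config.yaml:
#
#   user = np.array([[1., 0.], [0., 1.]])          # L2-normalized user feats
#   item = np.array([[1., 0.], [0.6, 0.8]])        # L2-normalized item feats
#   pos  = np.sum(user * item, -1, keepdims=True)  # [[1.0], [0.8]]
#   neg  = user.dot(item.T)                        # [[1.0, 0.6], [0.0, 0.8]]
#   np.mean(np.maximum(neg - pos + 0.3, 0.))       # -> 0.15
#
# The diagonal (each pair scored against itself) always contributes exactly
# the margin, so the minimum possible loss is margin / B for batch size B.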
"""Ernie
"""
from models.base import BaseNet, BaseGNNModel
class Ernie(BaseNet):
def build_inputs(self):
inputs = super(Ernie, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
        ernie = ErnieEncoder(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = [feature]
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieModel(BaseGNNModel):
def gen_net_fn(self, config):
return Ernie(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import json
import six
import logging
import paddle.fluid as fluid
import paddle.fluid.layers as L
from io import open
from models.ernie_model.transformer_encoder import encoder, pre_process_layer
from models.ernie_model.transformer_encoder import graph_encoder
log = logging.getLogger(__name__)
class ErnieConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path, 'r', encoding='utf8') as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict.get(key, None)
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
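# Example config file (hypothetical, but consistent with the keys read in
# _set_config below and with the ernie_config block in config.yaml):
#
#   {"hidden_size": 768, "num_hidden_layers": 12, "num_attention_heads": 12,
#    "vocab_size": 18000, "max_position_embeddings": 513,
#    "sent_type_vocab_size": 4, "task_type_vocab_size": 3, "use_task_id": false,
#    "hidden_act": "relu", "hidden_dropout_prob": 0.1,
#    "attention_probs_dropout_prob": 0.1, "initializer_range": 0.02,
#    "use_fp16": false}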
class ErnieModel(object):
def __init__(self,
src_ids,
sentence_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
name=""):
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _debug_summary(self, input_mask):
#histogram
seqlen_before_pad = L.cast(
L.reduce_sum(
input_mask, dim=1), dtype='float32')
seqlen_after_pad = L.reduce_sum(
L.cast(
L.zeros_like(input_mask), dtype='float32') + 1.0, dim=1)
pad_num = seqlen_after_pad - seqlen_before_pad
pad_rate = pad_num / seqlen_after_pad
def _build_position_ids(self, src_ids):
d_shape = L.shape(src_ids)
d_seqlen = d_shape[1]
d_batch = d_shape[0]
position_ids = L.reshape(
L.range(
0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1],
inplace=True)
position_ids = L.expand(position_ids, [d_batch, 1, 1])
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_input_mask(self, src_ids):
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.logical_not(L.equal(src_ids,
zero)) # assume pad id == 0
        input_mask = L.cast(input_mask, 'float32')
input_mask.stop_gradient = True
return input_mask
def _set_config(self, config, name, weight_sharing):
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config.get('sent_type_vocab_size'):
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._use_task_id = config['use_task_id']
if self._use_task_id:
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._postprocess_cmd = config.get('postprocess_cmd', 'dan')
self._preprocess_cmd = config.get('preprocess_cmd', '')
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._weight_sharing = weight_sharing
self.name = name
self._word_emb_name = self.name + "word_embedding"
self._pos_emb_name = self.name + "pos_embedding"
self._sent_emb_name = self.name + "sent_embedding"
self._task_emb_name = self.name + "task_embedding"
self._dtype = "float16" if config['use_fp16'] else "float32"
self._emb_dtype = "float32"
        # Initialize all weights with the truncated normal initializer; all
        # biases are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
def _build_embedding(self, src_ids, position_ids, sentence_ids, task_ids):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer))
self.all_emb = [emb_out, position_emb_out, sent_emb_out]
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
if self._use_task_id:
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out,
'nd',
self._prepostprocess_dropout,
name=self.name + 'pre_encoder')
if self._dtype == "float16":
emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
return emb_out
def get_sequence_output(self):
return self._enc_out
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = self._enc_out[:, 0, :]
#next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=self.name + "pooled_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + "pooled_fc.b_0")
return next_sent_feat
def get_lm_output(self, mask_label, mask_pos):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
# extract the first token feature in each sentence
self.next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(
x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_fc.w_0',
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name=self.name + 'mask_lm_trans_fc.b_0'))
# transform: layer norm
mask_trans_feat = fluid.layers.layer_norm(
mask_trans_feat,
begin_norm_axis=len(mask_trans_feat.shape) - 1,
param_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=self.name + 'mask_lm_trans_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
# transform: layer norm
#mask_trans_feat = pre_process_layer(
# mask_trans_feat, 'n', name=self.name + 'mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(
self._word_emb_name),
transpose_y=True)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size],
dtype=self._emb_dtype,
attr=mask_lm_out_bias_attr,
is_bias=True)
else:
fc_out = fluid.layers.fc(input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(
name=self.name + "mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
return mask_lm_loss
def get_task_output(self, task, task_labels):
task_fc_out = fluid.layers.fc(
input=self.next_sent_feat,
size=task["num_labels"],
param_attr=fluid.ParamAttr(
name=self.name + task["task_name"] + "_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.name + task["task_name"] + "_fc.b_0")
task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
logits=task_fc_out, label=task_labels, return_softmax=True)
task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
return task_loss, task_acc
class ErnieGraphModel(ErnieModel):
def __init__(self,
src_ids,
task_ids=None,
config=None,
weight_sharing=True,
use_fp16=False,
slot_seqlen=40,
name=""):
self.slot_seqlen = slot_seqlen
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
sentence_ids = self._build_sentence_ids(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
def _build_position_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
num_b = (src_seqlen / slot_seqlen) - 1
a_position_ids = L.reshape(
L.range(
0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
        a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1]) # [B, slot_seqlen, 1]
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32") # assume pad id == 0 [B, slot_seqlen, 1]
a_pad_len = L.reduce_sum(input_mask, 1) # [B, 1, 1]
b_position_ids = L.reshape(
L.range(
slot_seqlen, 2*slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
inplace=True) # [1, slot_seqlen, 1]
b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1]) # [B, slot_seqlen * num_b, 1]
b_position_ids = b_position_ids - a_pad_len # [B, slot_seqlen * num_b, 1]
position_ids = L.concat([a_position_ids, b_position_ids], 1)
position_ids = L.cast(position_ids, 'int64')
position_ids.stop_gradient = True
return position_ids
def _build_sentence_ids(self, src_ids):
src_shape = L.shape(src_ids)
src_seqlen = src_shape[1]
src_batch = src_shape[0]
slot_seqlen = self.slot_seqlen
zeros = L.zeros([src_batch, slot_seqlen, 1], "int64")
ones = L.ones([src_batch, src_seqlen-slot_seqlen, 1], "int64")
sentence_ids = L.concat([zeros, ones], 1)
sentence_ids.stop_gradient = True
return sentence_ids
def _build_model(self, src_ids, position_ids, sentence_ids, task_ids,
input_mask):
emb_out = self._build_embedding(src_ids, position_ids, sentence_ids,
task_ids)
self.input_mask = input_mask
self._enc_out, self.all_hidden, self.all_attn, self.all_ffn = graph_encoder(
enc_input=emb_out,
input_mask=input_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd=self._preprocess_cmd,
postprocess_cmd=self._postprocess_cmd,
param_initializer=self._param_initializer,
slot_seqlen=self.slot_seqlen,
name=self.name + 'encoder')
if self._dtype == "float16":
self._enc_out = fluid.layers.cast(
x=self._enc_out, dtype=self._emb_dtype)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import numpy as np
from contextlib import contextmanager
import paddle.fluid as fluid
import paddle.fluid.layers as L
import paddle.fluid.layers as layers
#import propeller.paddle as propeller
#from propeller import log
# determine this at the beginning
to_3d = lambda a: a # will change later
to_2d = lambda a: a
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
they will not considered in attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=len(queries.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=len(keys.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=len(values.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of inpunt tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
#trans_x.desc.set_shape((-1, 1, n_head, d_value))
return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
#return out, product
return out, weights
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
q = to_3d(q)
k = to_3d(k)
v = to_3d(v)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads, ctx_multiheads_attn = scaled_dot_product_attention(
q, k, v, attn_bias, d_key, dropout_rate)
out = __combine_heads(ctx_multiheads)
out = to_2d(out)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out, ctx_multiheads_attn
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=len(x.shape) - 1,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=len(hidden.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out,
out,
process_cmd,
dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
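# process_cmd strings are interpreted character by character by
# pre_post_process_layer above: "a" adds the residual, "n" applies layer norm,
# "d" applies dropout. pre_process_layer binds prev_out=None, so a leading "a"
# is a no-op there; preprocess_cmd="n" normalizes before each sublayer,
# postprocess_cmd="da" applies dropout then the residual add after it, and the
# "nd" used on the embeddings in ernie.py is norm followed by dropout.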
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
#L.Print(L.reduce_mean(enc_input), message='1')
attn_output, ctx_multiheads_attn = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
#L.Print(L.reduce_mean(attn_output), message='1')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
#L.Print(L.reduce_mean(attn_output), message='2')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
#L.Print(L.reduce_mean(ffd_output), message='3')
ret = post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
#L.Print(L.reduce_mean(ret), message='4')
return ret, ctx_multiheads_attn, ffd_output
def build_pad_idx(input_mask):
pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool'))
return pad_idx
def build_attn_bias(input_mask, n_head, dtype):
attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
input_shape = L.shape(input_mask)
input_batch = input_shape[0]
input_seqlen = input_shape[1]
num_slot = input_seqlen / slot_seqlen
num_b = num_slot - 1
ones = L.ones([num_b], dtype="float32") # [num_b]
diag_ones = L.diag(ones) # [num_b, num_b]
diag_ones = L.unsqueeze(diag_ones, [1, -1]) # [num_b, 1, num_b, 1]
diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen]) # [num_b, seqlen, num_b, seqlen]
diag_ones = L.reshape(diag_ones, [1, num_b*slot_seqlen, num_b*slot_seqlen]) # [1, num_b*seqlen, num_b*seqlen]
graph_attn_bias = L.concat([L.ones([1, num_b*slot_seqlen, slot_seqlen], dtype="float32"), diag_ones], 2)
graph_attn_bias = L.concat([L.ones([1, slot_seqlen, num_slot*slot_seqlen], dtype="float32"), graph_attn_bias], 1) # [1, seq, seq]
pad_attn_bias = L.matmul(
input_mask, input_mask, transpose_y=True) # [batch, seq, seq]
attn_bias = graph_attn_bias * pad_attn_bias
attn_bias = (1. - attn_bias) * -10000.
attn_bias = L.stack([attn_bias] * n_head, 1) # [batch, n_head, seq, seq]
if attn_bias.dtype != dtype:
attn_bias = L.cast(attn_bias, dtype)
return attn_bias
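# Editor's sketch (standalone, not part of the source): the slot-visibility
# pattern that build_graph_attn_bias encodes, for 3 slots of length 2.
# Slot 0 (the first slot_seqlen tokens) attends to every slot; each remaining
# slot attends only to slot 0 and to itself.
import numpy as np

_slot_seqlen, _num_slot = 2, 3
_num_b = _num_slot - 1
_diag = np.kron(np.eye(_num_b), np.ones((_slot_seqlen, _slot_seqlen)))  # each later slot sees itself
_left = np.ones((_num_b * _slot_seqlen, _slot_seqlen))                  # each later slot sees slot 0
_top = np.ones((_slot_seqlen, _num_slot * _slot_seqlen))                # slot 0 sees everything
_visible = np.vstack([_top, np.hstack([_left, _diag])])                 # [seq, seq] 0/1 visibility
assert _visible[2, 4] == 0.                                             # slot 1 cannot see slot 2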
def encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype)
    def to_2d(t_3d):
        # drop padded positions: [batch, seq, d_model] -> [num_real_tokens, d_model]
        t_2d = L.gather_nd(t_3d, pad_idx)
        return t_2d

    def to_3d(t_2d):
        # restore the padded layout: [num_real_tokens, d_model] -> [batch, seq, d_model]
        t_3d = L.scatter_nd(
            pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
        return t_3d
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
#enc_output.desc.set_shape((-1, 1, final_dim))
return enc_output, all_hidden, all_attn, all_ffn
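# Hypothetical usage sketch (names, shapes and d_inner_hid assumed by the
# editor, not taken from the source), wiring encoder() with the ernie_config
# values used in this example (hidden_size=768, 12 heads, 12 layers):
#
#   enc_out, all_hidden, all_attn, all_ffn = encoder(
#       enc_input=emb_out,              # [batch, seq, 768] token embeddings
#       input_mask=input_mask,          # [batch, seq, 1], 1.0 on real tokens
#       n_layer=12, n_head=12,
#       d_key=64, d_value=64,           # 768 / 12 per head
#       d_model=768, d_inner_hid=3072,  # 4 * d_model, the usual choice
#       prepostprocess_dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
#       hidden_act="relu", name="encoder")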
def graph_encoder(enc_input,
input_mask,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
slot_seqlen=40,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
d_shape = L.shape(input_mask)
pad_idx = build_pad_idx(input_mask)
attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen)
    def to_2d(t_3d):
        # drop padded positions: [batch, seq, d_model] -> [num_real_tokens, d_model]
        t_2d = L.gather_nd(t_3d, pad_idx)
        return t_2d

    def to_3d(t_2d):
        # restore the padded layout: [num_real_tokens, d_model] -> [batch, seq, d_model]
        t_3d = L.scatter_nd(
            pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
        return t_3d
enc_input = to_2d(enc_input)
all_hidden = []
all_attn = []
all_ffn = []
for i in range(n_layer):
enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
all_hidden.append(enc_output)
all_attn.append(ctx_multiheads_attn)
all_ffn.append(ffn_output)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output,
preprocess_cmd,
prepostprocess_dropout,
name="post_encoder")
enc_output = to_3d(enc_output)
#enc_output.desc.set_shape((-1, 1, final_dim))
return enc_output, all_hidden, all_attn, all_ffn
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
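# ErnieSageV1 in brief: ERNIE encodes each node's token sequence on its own
# (build_embedding below), and only the pooled sentence vectors flow through
# the GraphSAGE layers, i.e. text and graph structure are fused late.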
class ErnieSageV1(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV1, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def build_embedding(self, graph_wrappers, term_ids):
term_ids = L.unsqueeze(term_ids, [-1])
ernie_config = self.config.ernie_config
ernie = ErnieModel(
src_ids=term_ids,
sentence_ids=L.zeros_like(term_ids),
task_ids=None,
config=ernie_config,
use_fp16=False,
name="student_")
feature = ernie.get_pooled_output()
return feature
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = self.build_embedding(graph_wrappers, inputs[-1])
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV1(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV1(config)
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
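# ErnieSageV2 in brief: the send function concatenates the source and
# destination token sequences (with [CLS] and sentence ids) and runs ERNIE on
# the pair inside message passing, so text and graph structure are fused early.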
class ErnieSageV2(BaseNet):
def build_inputs(self):
inputs = super(ErnieSageV2, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_send(src_feat, dst_feat, edge_feat):
"""doc"""
cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1)
src_ids = L.concat([cls, src_feat["term_ids"]], 1)
dst_ids = dst_feat["term_ids"]
sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
term_ids = L.concat([src_ids, dst_ids], 1)
term_ids.stop_gradient = True
sent_ids.stop_gradient = True
ernie = ErnieModel(
term_ids, sent_ids,
config=self.config.ernie_config)
feature = ernie.get_pooled_output()
return feature
def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
feature = L.unsqueeze(feature, [-1])
msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)])
neigh_feature = gw.recv(msg, lambda feat: F.layers.sequence_pool(feat, pool_type="sum"))
term_ids = feature
cls = L.fill_constant_batch_size_like(term_ids, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, term_ids], 1)
term_ids.stop_gradient = True
ernie = ErnieModel(
term_ids, L.zeros_like(term_ids),
config=self.config.ernie_config)
self_feature = ernie.get_pooled_output()
self_feature = L.fc(self_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=learning_rate),
)
neigh_feature = L.fc(neigh_feature,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + "_r",
learning_rate=learning_rate),
)
output = L.concat([self_feature, neigh_feature], axis=1)
output = L.l2_normalize(output, axis=1)
return output
return erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % ("erniesage_v2", i))
features.append(feature)
return features
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV2(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV2(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pgl
import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
from models.message_passing import copy_send
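# ErnieSageV3 in brief: message passing only gathers raw neighbor token ids;
# a single ErnieGraphModel with slot-wise attention (see graph_encoder above)
# then encodes the center node together with all of its neighbors in one pass.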
class ErnieSageV3(BaseNet):
def __init__(self, config):
super(ErnieSageV3, self).__init__(config)
self.config.layer_type = "ernie_recv_sum"
def build_inputs(self):
inputs = super(ErnieSageV3, self).build_inputs()
term_ids = L.data(
"term_ids", shape=[None, self.config.max_seqlen], dtype="int64", append_batch_size=False)
return inputs + [term_ids]
def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
def ernie_recv(feat):
"""doc"""
            # TODO: maxlen (10) and the flattened width (400 = 10 * max_seqlen 40) are hard-coded below
pad_value = L.zeros([1], "int64")
out, _ = L.sequence_pad(feat, pad_value=pad_value, maxlen=10)
out = L.reshape(out, [0, 400])
return out
def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name):
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, ernie_recv)
neigh_feature = L.cast(L.unsqueeze(neigh_feature, [-1]), "int64")
feature = L.unsqueeze(feature, [-1])
cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
term_ids = L.concat([cls, feature[:, :-1], neigh_feature], 1)
term_ids.stop_gradient = True
return term_ids
return erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer, learning_rate, name)
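    # Editor's note: ernie_recv pads each node's neighbor set to at most 10
    # sequences (maxlen=10) and flattens them into a fixed 400-wide row
    # (10 * max_seqlen 40); erniesage_v3_aggregator then prepends the node's
    # own tokens plus a [CLS] so one ERNIE pass can attend over all of them.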
def gnn_layers(self, graph_wrappers, feature):
features = [feature]
initializer = None
fc_lr = self.config.lr / 0.001
for i in range(self.config.num_layers):
if i == self.config.num_layers - 1:
act = None
else:
act = "leaky_relu"
feature = self.gnn_layer(
graph_wrappers[i],
feature,
self.config.hidden_size,
act,
initializer,
learning_rate=fc_lr,
name="%s_%s" % (self.config.layer_type, i))
features.append(feature)
return features
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
ernie_config = self.config.ernie_config
ernie = ErnieGraphModel(
src_ids=feat,
config=ernie_config,
slot_seqlen=self.config.max_seqlen,
name="student_")
feat = ernie.get_pooled_output()
fc_lr = self.config.lr / 0.001
        feat = L.fc(feat,
self.config.hidden_size,
act="relu",
param_attr=F.ParamAttr(name=name + "_l",
learning_rate=fc_lr),
)
feat = L.l2_normalize(feat, axis=1)
if self.config.final_fc:
feat = L.fc(feat,
self.config.hidden_size,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
if self.config.final_l2_norm:
feat = L.l2_normalize(feat, axis=1)
return feat
def __call__(self, graph_wrappers):
inputs = self.build_inputs()
feature = inputs[-1]
features = self.gnn_layers(graph_wrappers, feature)
outputs = [self.take_final_feature(features[-1], i, "final_fc") for i in inputs[:-1]]
src_real_index = L.gather(graph_wrappers[0].node_feat['index'], inputs[0])
outputs.append(src_real_index)
return inputs, outputs
class ErnieSageModelV3(BaseGNNModel):
def gen_net_fn(self, config):
return ErnieSageV3(config)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as L
def copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"]
def weighted_copy_send(src_feat, dst_feat, edge_feat):
"""doc"""
return src_feat["h"] * edge_feat["weight"]
def mean_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="average")
def sum_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="sum")
def max_recv(feat):
"""doc"""
return fluid.layers.sequence_pool(feat, pool_type="max")
def lstm_recv(feat):
"""doc"""
hidden_dim = 128
forward, _ = fluid.layers.dynamic_lstm(
input=feat, size=hidden_dim * 4, use_peepholes=False)
output = fluid.layers.sequence_last_step(forward)
return output
def graphsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def graphsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(copy_send, nfeat_list=[("h", feature)])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_mean(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, mean_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
def pinsage_sum(gw, feature, hidden_size, act, initializer, learning_rate, name):
"""doc"""
msg = gw.send(weighted_copy_send, nfeat_list=[("h", feature)], efeat_list=["weight"])
neigh_feature = gw.recv(msg, sum_recv)
self_feature = feature
self_feature = fluid.layers.fc(self_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_l", initializer=initializer,
learning_rate=learning_rate),
)
neigh_feature = fluid.layers.fc(neigh_feature,
hidden_size,
act=act,
param_attr=fluid.ParamAttr(name=name + "_r", initializer=initializer,
learning_rate=learning_rate),
)
output = fluid.layers.concat([self_feature, neigh_feature], axis=1)
output = fluid.layers.l2_normalize(output, axis=1)
return output
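# Editor's sketch (standalone, not part of the source): what copy_send plus
# sum_recv compute, written with plain numpy over an explicit edge list
# instead of a PGL graph wrapper.
import numpy as np

_edges = [(0, 2), (1, 2)]                        # messages flow src -> dst
_feat = np.array([[1., 0.], [0., 1.], [5., 5.]])
_agg = np.zeros_like(_feat)
for _src, _dst in _edges:
    _agg[_dst] += _feat[_src]                    # sum_recv over incoming messages
assert (_agg[2] == np.array([1., 1.])).all()     # node 2 sums its two neighbors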
from models.base import BaseGNNModel
from models.ernie import ErnieModel
from models.erniesage_v1 import ErnieSageModelV1
from models.erniesage_v2 import ErnieSageModelV2
from models.erniesage_v3 import ErnieSageModelV3
class Model(object):
@classmethod
def factory(cls, config):
name = config.model_type
if name == "BaseGNNModel":
return BaseGNNModel(config)
if name == "ErnieModel":
return ErnieModel(config)
if name == "ErnieSageModelV1":
return ErnieSageModelV1(config)
if name == "ErnieSageModelV2":
return ErnieSageModelV2(config)
if name == "ErnieSageModelV3":
return ErnieSageModelV3(config)
else:
raise ValueError
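# Hypothetical usage (the config fields mirror the YAML configs above):
#
#   import yaml
#   from easydict import EasyDict as edict
#   config = edict(yaml.load(open("./config.yaml"), Loader=yaml.FullLoader))
#   model = Model.factory(config)   # dispatches on config.model_type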
#!/usr/bin/env python
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
# File: dump_graph.py
# Author: suweiyue(suweiyue@baidu.com)
# Date: 2020/03/01 22:17:13
#
########################################################################
"""
Comment.
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
#from __future__ import unicode_literals
import io
import os
import sys
import argparse
import logging
import multiprocessing
from functools import partial
from io import open
import numpy as np
import tqdm
import pgl
from pgl.graph_kernel import alias_sample_build_table
from pgl.utils.logger import log
from tokenization import FullTokenizer
def term2id(string, tokenizer, max_seqlen):
string = string.split("\t")[1]
tokens = tokenizer.tokenize(string)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids = ids[:max_seqlen-1]
    ids = ids + [2]  # append [SEP] (token id 2)
ids = ids + [0] * (max_seqlen - len(ids))
return ids
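# Editor's note on term2id's output contract: token ids are truncated to
# max_seqlen - 1, a [SEP] (id 2) is appended, and the row is zero-padded, so
# every returned list has length exactly max_seqlen. For example, with
# max_seqlen=5 and three tokens the result looks like [t1, t2, t3, 2, 0].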
def dump_graph(args):
if not os.path.exists(args.outpath):
os.makedirs(args.outpath)
neg_samples = []
str2id = dict()
term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding)
terms = []
count = 0
with io.open(args.inpath, encoding=args.encoding) as f:
edges = []
for idx, line in enumerate(f):
if idx % 100000 == 0:
log.info("%s readed %s lines" % (args.inpath, idx))
slots = []
for col_idx, col in enumerate(line.strip("\n").split("\t")):
s = col[:args.max_seqlen]
if s not in str2id:
str2id[s] = count
count += 1
term_file.write(str(col_idx) + "\t" + col + "\n")
slots.append(str2id[s])
src = slots[0]
dst = slots[1]
neg_samples.append(slots[2:])
edges.append((src, dst))
edges.append((dst, src))
term_file.close()
edges = np.array(edges, dtype="int64")
num_nodes = len(str2id)
str2id.clear()
log.info("building graph...")
graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
indegree = graph.indegree()
graph.outdegree()
graph.dump(args.outpath)
# dump alias sample table
sqrt_indegree = np.sqrt(indegree)
distribution = 1. * sqrt_indegree / sqrt_indegree.sum()
alias, events = alias_sample_build_table(distribution)
np.save(os.path.join(args.outpath, "alias.npy"), alias)
np.save(os.path.join(args.outpath, "events.npy"), events)
np.save(os.path.join(args.outpath, "neg_samples.npy"), np.array(neg_samples))
log.info("End Build Graph")
def dump_id2str_map(args):
log.info("Dump id2str map starting...")
id2str = np.array([line.strip("\n") for line in open(os.path.join(args.outpath, "terms.txt"), "r", encoding=args.encoding)])
np.save(os.path.join(args.outpath, "id2str.npy"), id2str)
log.info("Dump id2str map done.")
def dump_node_feat(args):
log.info("Dump node feat starting...")
id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
pool = multiprocessing.Pool()
tokenizer = FullTokenizer(args.vocab_file)
term_ids = pool.map(partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen), id2str)
np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
log.info("Dump node feat done.")
pool.terminate()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("-i", "--inpath", type=str, default=None)
parser.add_argument("-l", "--max_seqlen", type=int, default=30)
parser.add_argument("--vocab_file", type=str, default="./vocab.txt")
parser.add_argument("--encoding", type=str, default="utf8")
parser.add_argument("-o", "--outpath", type=str, default=None)
args = parser.parse_args()
dump_graph(args)
dump_id2str_map(args)
dump_node_feat(args)
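# Typical invocation (paths taken from the configs above):
#   python dump_graph.py -i ./data.txt -o ./workdir --vocab_file ./vocab.txt --encoding utf8 -l 40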
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import sentencepiece as sp
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
    # These functions want `str` for both Python 2 and Python 3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file, 'rb')
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids_include_unk(vocab, tokens, unk_token="[UNK]"):
output = []
for token in tokens:
if token in vocab:
output.append(vocab[token])
else:
output.append(vocab[unk_token])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class SentencepieceTokenizer(object):
"""Runs SentencePiece tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.do_lower_case = do_lower_case
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.sp_unk_token = "<unk>"
self.unk_token = unk_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text.replace("\1", " "))
tokens = self.tokenizer.EncodeAsPieces(text)
output_tokens = []
for token in tokens:
if token == self.sp_unk_token:
token = self.unk_token
if token in self.vocab:
output_tokens.append(token)
else:
output_tokens.append(self.unk_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordsegTokenizer(object):
"""Runs Wordseg tokenziation."""
def __init__(self, vocab_file, do_lower_case=True, unk_token="[UNK]",
split_token="\1"):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.tokenizer = sp.SentencePieceProcessor()
self.tokenizer.Load(vocab_file + ".model")
self.do_lower_case = do_lower_case
self.unk_token = unk_token
self.split_token = split_token
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
Returns:
A list of wordpiece tokens.
"""
text = text.lower() if self.do_lower_case else text
text = convert_to_unicode(text)
output_tokens = []
for token in text.split(self.split_token):
if token in self.vocab:
output_tokens.append(token)
else:
sp_tokens = self.tokenizer.EncodeAsPieces(token)
for sp_token in sp_tokens:
if sp_token in self.vocab:
output_tokens.append(sp_token)
return output_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
        already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
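# Editor's sketch (standalone, not part of the source): the greedy
# longest-match-first loop above, in miniature, against a toy vocab.
def _demo_wordpiece(token, vocab, unk="[UNK]"):
    start, pieces = 0, []
    while start < len(token):
        end = len(token)
        while start < end:
            sub = ("##" if start > 0 else "") + token[start:end]
            if sub in vocab:
                pieces.append(sub)
                break
            end -= 1
        else:                       # no prefix of the remainder is in vocab
            return [unk]
        start = end
    return pieces

# _demo_wordpiece("unaffable", {"un", "##aff", "##able"}) -> ["un", "##aff", "##able"]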
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import traceback
import yaml
import numpy as np
from easydict import EasyDict as edict
from pgl.utils.logger import log
from pgl.utils import paddle_helper
from learner import Learner
from models.model_factory import Model
from dataset.graph_reader import GraphGenerator
class TrainData(object):
def __init__(self, graph_path):
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
log.info("trainer_id: %s, trainer_count: %s." % (trainer_id, trainer_count))
edges = np.load(os.path.join(graph_path, "edges.npy"), allow_pickle=True)
# edges is bidirectional.
train_usr = edges[trainer_id::trainer_count, 0]
train_ad = edges[trainer_id::trainer_count, 1]
returns = {
"train_data": [train_usr, train_ad]
}
if os.path.exists(os.path.join(graph_path, "neg_samples.npy")):
neg_samples = np.load(os.path.join(graph_path, "neg_samples.npy"), allow_pickle=True)
if neg_samples.size != 0:
train_negs = neg_samples[trainer_id::trainer_count]
returns["train_data"].append(train_negs)
log.info("Load train_data done.")
self.data = returns
def __getitem__(self, index):
        return [data[index] for data in self.data["train_data"]]
def __len__(self):
return len(self.data["train_data"][0])
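# Editor's note: edges[trainer_id::trainer_count] shards the bidirectional
# edge list across trainers by striding; with two trainers, trainer 0 takes
# rows 0, 2, 4, ... and trainer 1 takes rows 1, 3, 5, ...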
def main(config):
# Select Model
model = Model.factory(config)
# Build Train Edges
data = TrainData(config.graph_path)
# Build Train Data
train_iter = GraphGenerator(
graph_wrappers=model.graph_wrappers,
batch_size=config.batch_size,
data=data,
samples=config.samples,
num_workers=config.sample_workers,
feed_name_list=[var.name for var in model.feed_list],
use_pyreader=config.use_pyreader,
phase="train",
graph_data_path=config.graph_path,
shuffle=True)
log.info("build graph reader done.")
learner = Learner.factory(config.learner_type)
learner.build(model, train_iter, config)
learner.start()
learner.stop()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--conf", type=str, default="./config.yaml")
args = parser.parse_args()
config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
print(config)
main(config)
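# Typical invocation (script name assumed): python train.py --conf ./config.yaml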