Unverified commit 37a209b1, authored by Huang Zhengjie, committed by GitHub

Merge pull request #10 from PaddlePaddle/master

Pull from master
pgl.contrib.heter\_graph module: Heterogeneous Graph Storage
pgl.heter\_graph module: Heterogeneous Graph Storage
============================================================
.. automodule:: pgl.contrib.heter_graph
.. automodule:: pgl.heter_graph
   :members:
   :undoc-members:
   :show-inheritance:
pgl.contrib.heter\_graph\_wrapper module: Heterogeneous Graph data holders for Paddle GNN.
pgl.heter\_graph\_wrapper module: Heterogeneous Graph data holders for Paddle GNN.
==========================================================================================
.. automodule:: pgl.contrib.heter_graph_wrapper
.. automodule:: pgl.heter_graph_wrapper
   :members:
   :undoc-members:
   :show-inheritance:
......@@ -9,5 +9,5 @@ API Reference
pgl.data_loader
pgl.utils.paddle_helper
pgl.utils.mp_reader
pgl.contrib.heter_graph
pgl.contrib.heter_graph_wrapper
pgl.heter_graph
pgl.heter_graph_wrapper
......@@ -58,8 +58,8 @@ Now, we can build a heterogeneous graph by using PGL.
import paddle.fluid as fluid
import paddle.fluid.layers as fl
import pgl
from pgl.contrib import heter_graph
from pgl.contrib import heter_graph_wrapper
from pgl import heter_graph
from pgl import heter_graph_wrapper
g = heter_graph.HeterGraph(num_nodes=num_nodes,
                           edges=edges,
......@@ -160,8 +160,3 @@ for epoch in range(30):
    train_loss = exe.run(fluid.default_main_program(), feed=feed_dict, fetch_list=[loss], return_numpy=True)
    print('Epoch %d | Loss: %f' % (epoch, train_loss[0]))
```
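For reference, here is a self-contained toy sketch of the constructor used above (made-up user/item data; it assumes the pgl>=1.1 API in which `heter_graph` is a top-level module and `HeterGraph` accepts an edge dict keyed by edge type):

```python
import numpy as np
from pgl import heter_graph

# Toy heterogeneous graph: 4 "user" nodes and 3 "item" nodes, two edge types.
num_nodes = 7
node_types = [(i, 'user') for i in range(4)] + [(i, 'item') for i in range(4, 7)]
edges = {
    'click': [(0, 4), (1, 5), (2, 6), (3, 6)],  # user -> item
    'buy': [(0, 5), (3, 4)],
}
node_feat = {'feature': np.random.randn(num_nodes, 8).astype('float32')}

g = heter_graph.HeterGraph(num_nodes=num_nodes,
                           edges=edges,
                           node_types=node_types,
                           node_feat=node_feat)
```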
# ERNIESage in PGL
[Chinese README](./README.md)
## Introduction
In many industrial applications, a special kind of graph, shown below, often appears: the Text Graph. As the name implies, the node attributes of such a graph consist of text, and the edges provide structural information. Take the search scenario as an example: nodes can be represented by search queries, web page titles, and web page content, while the edges are constructed from user feedback or hyperlink information.
<img src="./docs/source/_static/text_graph.png" alt="Text Graph" width="800">
**ERNIESage** (abbreviation of ERNIE SAmple aggreGatE), a model proposed by the PGL team, effectively improves performance on text graphs by simultaneously modeling text semantics and graph structure information. It is worth mentioning that [**ERNIE**](https://github.com/PaddlePaddle/ERNIE) in **ERNIESage** is a continual pre-training framework for language understanding launched by Baidu.
**ERNIESage** combines ERNIE and GraphSAGE; its structure is shown in the figure below. The main idea is to use ERNIE as the aggregation function (Aggregators) to model the semantic and structural relationships between center nodes and their neighbor nodes. In addition, to account for the position-independent nature of neighbor nodes, a neighbor-blind attention mask and an independent position embedding mechanism are designed.
<img src="./docs/source/_static/ernie_aggregator.png" alt="ERNIESage" width="800">
GraphSAGE with ID features can only model the graph structure information, while ERNIE alone can only deal with text. With the help of PGL, the proposed **ERNIESage** model combines the advantages of both. In the following text-graph recommendation example, **ERNIESage** achieves the best performance compared with a standalone ERNIE or GraphSAGE model.
<img src="./docs/source/_static/ERNIESage_result.png" alt="ERNIESage_result" width="800">
Thanks to the flexibility and usability of PGL, **ERNIESage** can be quickly implemented under PGL's Message Passing paradigm. There are currently four PGL versions of ERNIESage (a toy sketch contrasting v1 and v2 follows the figure below):
- **ERNIESage v1**: ERNIE is applied to the NODE of the text graph;
- **ERNIESage v2**: ERNIE is applied to the EDGE of the text graph;
- **ERNIESage v3**: ERNIE is applied to the first-order neighbors and the center node;
- **ERNIESage v4**: ERNIE is applied to the N-th-order neighbors and the center node.
<img src="./docs/source/_static/ERNIESage_v1_4.png" alt="ERNIESage_v1_4" width="800">
## Dependencies
- paddlepaddle>=1.7
- pgl>=1.1
## Data format
In the example data ```data.txt```, part of NLPCC2016-DBQA is used; each line has the format "query \t answer".
```text
NLPCC2016-DBQA is a sub-task of the NLPCC-ICCPOL 2016 Shared Task hosted by NLPCC (Natural Language Processing and Chinese Computing); the task targets selecting documents from the candidates to answer the questions. [url: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf]
```
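A line in ```data.txt``` therefore looks like the following made-up pair (the real corpus is Chinese; the two fields are separated by a tab):

```text
who invented the telephone	Alexander Graham Bell patented the telephone in 1876.
```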
## How to run
We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. The files ```config/*.yaml``` are example configuration files for the hyperparameters. Among them, the ERNIE model checkpoint ```ckpt_path``` and the vocabulary ```ernie_vocab_file``` can be downloaded from the [ERNIE](https://github.com/PaddlePaddle/ERNIE) page.
```sh
# train ERNIESage in distributed GPU mode.
sh local_run.sh config/erniesage_v1_gpu.yaml
# train ERNIESage in distributed CPU mode.
sh local_run.sh config/erniesage_v1_cpu.yaml
```
## Hyperparameters
- learner_type: `gpu` or `cpu`; `gpu` uses the Fleet Collective mode, while `cpu` uses the Fleet Transpiler mode (see the sketch below).
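As a sketch of what this switch implies (a hypothetical selector written for illustration, not the repository code; the import paths are the ones `learner.py` uses):

```python
# Hypothetical sketch: choose the fleet variant from learner_type in the YAML config.
from paddle.fluid.incubate.fleet.collective import fleet as cfleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as tfleet

def select_fleet(learner_type):
    # "gpu" -> Collective mode (all-reduce across GPU trainers);
    # "cpu" -> Transpiler mode (parameter-server training).
    return cfleet if learner_type == "gpu" else tfleet
```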
## Citation
```
@misc{ERNIESage,
  author = {PGL Team},
  title = {ERNIESage: ERNIE SAmple aggreGatE},
  year = {2020},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/PaddlePaddle/PGL/tree/master/examples/erniesage}},
}
```
# Implementing ERNIESage with PGL
[ENG Readme](./README.en.md)
## Background
In many industrial applications, a special kind of graph, shown below, often appears: the Text Graph. As the name implies, the node attributes of such a graph consist of text, while the construction of edges provides structural information. In a Text Graph for the search scenario, for example, nodes can be represented by search queries, web page titles, and web page body text, while user feedback and hyperlink information form the edge relations.
<img src="./docs/source/_static/text_graph.png" alt="Text Graph" width="800">
**ERNIESage**, proposed by the PGL team, is short for ERNIE SAmple aggreGatE. The model jointly captures text semantics and graph structure information, effectively improving results in Text Graph applications. Here, [**ERNIE**](https://github.com/PaddlePaddle/ERNIE) is Baidu's knowledge-enhanced continual pre-training framework for semantic understanding.
**ERNIESage** is the result of combining ERNIE with GraphSAGE; its structure is shown in the figure below. The main idea is to use ERNIE as the aggregation function (Aggregators) to model the semantic and structural relationships between a node and its neighbors. ERNIESage models text at the neighbor-aggregation stage: the center node's text is concatenated with the text of each of its neighbor nodes; the pre-trained ERNIE model then performs message aggregation, capturing the interactions between the center node and its neighbors; finally, with its distinctive neighbor-blind Attention Mask and independent Position Embedding scheme, ERNIESage can easily model the relationships between sentences and between words in a Text Graph.
<img src="./docs/source/_static/ernie_aggregator.png" alt="ERNIESage" width="800">
GraphSAGE with ID features can only model graph structure, while ERNIE alone can only process text. With PGL bridging graphs and text, **ERNIESage** can easily combine the strengths of both GraphSAGE and ERNIE. In the Text Graph scenario below, **ERNIESage** outperforms both the standalone ERNIE and GraphSAGE models.
<img src="./docs/source/_static/ERNIESage_result.png" alt="ERNIESage_result" width="800">
**ERNIESage** can be implemented easily under PGL's message-passing paradigm. PGL currently provides four versions of the ERNIESage model:
- **ERNIESage v1**: ERNIE is applied to the nodes of the text graph;
- **ERNIESage v2**: ERNIE is applied to the edges of the text graph;
- **ERNIESage v3**: ERNIE is applied to the first-order neighbors and their edges;
- **ERNIESage v4**: ERNIE is applied to the N-th-order neighbors and their edges.
<img src="./docs/source/_static/ERNIESage_v1_4.png" alt="ERNIESage_v1_4" width="800">
## Dependencies
- paddlepaddle>=1.7
- pgl>=1.1
## Data format
The example data ```data.txt``` uses part of NLPCC2016-DBQA; each line has the format "query \t answer".
```text
NLPCC2016-DBQA is an evaluation task organized in 2016 by NLPCC (Natural Language Processing and Chinese Computing); its goal is to select, from the candidates, suitable documents to serve as answers to the questions. [link: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf]
```
## How to run
We adopt [PaddlePaddle Fleet](https://github.com/PaddlePaddle/Fleet) as our distributed training framework. The files ```config/*.yaml``` contain example configurations for training ERNIESage; the ERNIE model checkpoint ```ckpt_path``` and the vocabulary ```ernie_vocab_file``` can be downloaded from the [ERNIE](https://github.com/PaddlePaddle/ERNIE) page.
```sh
# train ERNIESage in distributed GPU mode or single-machine mode.
sh local_run.sh config/erniesage_v2_gpu.yaml
# train ERNIESage in distributed CPU mode.
sh local_run.sh config/erniesage_v2_cpu.yaml
```
## Hyperparameters
- learner_type: `gpu` or `cpu`; `gpu` uses the Fleet Collective mode, while `cpu` uses the Fleet Transpiler mode.
## Citation
```
@misc{ERNIESage,
  author = {PGL Team},
  title = {ERNIESage: ERNIE SAmple aggreGatE},
  year = {2020},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/PaddlePaddle/PGL/tree/master/examples/erniesage}},
}
```
......@@ -4,9 +4,9 @@
learner_type: "cpu"
optimizer_type: "adam"
lr: 0.00005
batch_size: 2
CPU_NUM: 10
epoch: 20
batch_size: 4
CPU_NUM: 16
epoch: 3
log_per_step: 1
save_per_step: 100
output_path: "./output"
......@@ -31,6 +31,7 @@ final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
neg_type: "random_neg"
# infer config ------
infer_model: "./output/last"
......
......@@ -6,9 +6,9 @@ optimizer_type: "adam"
lr: 0.00005
batch_size: 32
CPU_NUM: 10
epoch: 20
log_per_step: 1
save_per_step: 100
epoch: 3
log_per_step: 10
save_per_step: 1000
output_path: "./output"
ckpt_path: "./ernie_base_ckpt"
......@@ -31,6 +31,7 @@ final_fc: true
final_l2_norm: true
loss_type: "hinge"
margin: 0.3
neg_type: "random_neg"
# infer config ------
infer_model: "./output/last"
......
......@@ -86,6 +86,7 @@ class GraphGenerator(BaseDataGenerator):
        nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
        subgraphs = graphsage_sample(self.graph, nodes, self.samples, ignore_edges=ignore_edges)
        #subgraphs[0].reindex_to_parrent_nodes(subgraphs[0].nodes)
        feed_dict = {}
        for i in range(self.num_layers):
            feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))
......@@ -97,7 +98,7 @@ class GraphGenerator(BaseDataGenerator):
feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
#feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
feed_dict["term_ids"] = self.term_ids[subgraphs[0].node_feat["index"]]
return feed_dict
......
......@@ -72,7 +72,7 @@ def run_predict(py_reader,
    for batch_feed_dict in py_reader():
        batch += 1
        batch_usr_feat, batch_ad_feat, batch_src_real_index = exe.run(
        batch_usr_feat, batch_ad_feat, _, batch_src_real_index = exe.run(
            program,
            feed=batch_feed_dict,
            fetch_list=model_dict.outputs)
......@@ -183,5 +183,6 @@ if __name__ == "__main__":
    parser.add_argument("--conf", type=str, default="./config.yaml")
    args = parser.parse_args()
    config = edict(yaml.load(open(args.conf), Loader=yaml.FullLoader))
    config.loss_type = "hinge"
    print(config)
    main(config)
unset http_proxy https_proxy
set -x
mode=${1:-local}
config=${2:-"./config.yaml"}
# Flatten a YAML file into shell variable assignments; nested keys are joined with "_".
function parse_yaml {
    local prefix=$2
    local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
    sed -ne "s|^\($s\):|\1|" \
        -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
        -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
    awk -F$fs '{
        indent = length($1)/2;
        vname[indent] = $2;
        for (i in vname) {if (i > indent) {delete vname[i]}}
        if (length($3) > 0) {
            vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
            printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
        }
    }'
}
eval $(parse_yaml $config)
export CPU_NUM=$CPU_NUM
export FLAGS_rpc_deadline=3000000
export FLAGS_rpc_retry_times=1000
if [[ $async_mode == "True" ]];then
    echo "async_mode is True"
else
    export FLAGS_communicator_send_queue_size=1
    export FLAGS_communicator_min_send_grad_num_before_recv=0
    export FLAGS_communicator_max_merge_var_num=1 # important!
    export FLAGS_communicator_merge_sparse_grad=0
fi
export FLAGS_communicator_recv_wait_times=5000000
mkdir -p output
python ./train.py --conf $config
if [[ $TRAINING_ROLE == "TRAINER" ]];then
    python ./infer.py --conf $config
fi
......@@ -26,6 +26,17 @@ from paddle.fluid.incubate.fleet.collective import fleet as cfleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as tfleet
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from tensorboardX import SummaryWriter
from paddle.fluid.transpiler.distribute_transpiler import DistributedMode
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig
# Hack: monkey-patch TrainerRuntimeConfig so the communicator is forced to
# merge at most one variable and keep a send queue of size one.
base_get_communicator_flags = TrainerRuntimeConfig.get_communicator_flags
def get_communicator_flags(self):
    flag_dict = base_get_communicator_flags(self)
    flag_dict['communicator_max_merge_var_num'] = str(1)
    flag_dict['communicator_send_queue_size'] = str(1)
    return flag_dict
TrainerRuntimeConfig.get_communicator_flags = get_communicator_flags
class Learner(object):
......@@ -132,8 +143,6 @@ class TranspilerLearner(Learner):
        self.model = model

    def optimize(self, loss, optimizer_type, lr):
        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        log.info('learning rate:%f' % lr)
        if optimizer_type == "sgd":
            optimizer = F.optimizer.SGD(learning_rate=lr)
......@@ -143,7 +152,8 @@ class TranspilerLearner(Learner):
        else:
            raise ValueError("Unknown Optimizer %s" % optimizer_type)
        #create the DistributeTranspiler configure
        optimizer = tfleet.distributed_optimizer(optimizer, strategy)
        self.strategy = StrategyFactory.create_sync_strategy()
        optimizer = tfleet.distributed_optimizer(optimizer, self.strategy)
        optimizer.minimize(loss)

    def init_and_run_ps_worker(self, ckpt_path):
......@@ -193,6 +203,7 @@ class CollectiveLearner(Learner):
    def optimize(self, loss, optimizer_type, lr):
        optimizer = F.optimizer.Adam(learning_rate=lr)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_sequential_execution = True
        optimizer = cfleet.distributed_optimizer(optimizer, strategy=dist_strategy)
        _, param_grads = optimizer.minimize(loss, F.default_startup_program())
......
......@@ -36,7 +36,7 @@ transpiler_local_train(){
    for((i=0;i<${PADDLE_PSERVERS_NUM};i++))
    do
        echo "start ps server: ${i}"
        TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} sh job.sh local $config \
        TRAINING_ROLE="PSERVER" PADDLE_TRAINER_ID=${i} python ./train.py --conf $config \
            &> $BASE/pserver.$i.log &
        echo $! >> job_id
    done
......@@ -44,23 +44,20 @@ transpiler_local_train(){
    for((j=0;j<${PADDLE_TRAINERS_NUM};j++))
    do
        echo "start ps work: ${j}"
        TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} sh job.sh local $config \
        echo $! >> job_id
        TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} python ./train.py --conf $config
        TRAINING_ROLE="TRAINER" PADDLE_TRAINER_ID=${j} python ./infer.py --conf $config
    done
}
collective_local_train(){
    export PATH=./python27-gcc482-gpu/bin/:$PATH
    echo `which python`
    python -m paddle.distributed.launch train.py --conf $config
    python -m paddle.distributed.launch infer.py --conf $config
}
eval $(parse_yaml $config)
unalias python
python3 ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding \
-l $max_seqlen --vocab_file $ernie_vocab_file
python ./preprocessing/dump_graph.py -i $input_data -o $graph_path --encoding $encoding -l $max_seqlen --vocab_file $ernie_vocab_file
if [[ $learner_type == "cpu" ]];then
    transpiler_local_train
......
......@@ -40,7 +40,7 @@ class BaseGraphWrapperBuilder(object):
        # all graphs have the same node_feat_info
        graph_wrappers.append(
            pgl.graph_wrapper.GraphWrapper(
                "layer_%s" % i, place, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
                "layer_%s" % i, node_feat=self.node_feature_info, edge_feat=self.edge_feature_info))
        return graph_wrappers
......@@ -129,7 +129,9 @@ class BaseNet(object):
"user_index", shape=[None], dtype="int64", append_batch_size=False)
item_index = L.data(
"item_index", shape=[None], dtype="int64", append_batch_size=False)
return [user_index, item_index]
neg_item_index = L.data(
"neg_item_index", shape=[None], dtype="int64", append_batch_size=False)
return [user_index, item_index, neg_item_index]
    def build_embedding(self, graph_wrappers, inputs=None):
        num_embed = int(np.load(os.path.join(self.config.graph_path, "num_nodes.npy")))
......@@ -177,18 +179,58 @@ class BaseNet(object):
        outputs.append(src_real_index)
        return inputs, outputs
def all_gather(X):
    """Gather a stop-gradient copy of X from every trainer and concat along dim 0."""
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    trainer_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    if trainer_num == 1:
        copy_X = X * 1
        copy_X.stop_gradient = True
        return copy_X

    Xs = []
    for i in range(trainer_num):
        copy_X = X * 1
        copy_X = L.collective._broadcast(copy_X, i, True)
        copy_X.stop_gradient = True
        Xs.append(copy_X)

    if len(Xs) > 1:
        Xs = L.concat(Xs, 0)
        Xs.stop_gradient = True
    else:
        Xs = Xs[0]
    return Xs
class BaseLoss(object):
    def __init__(self, config):
        self.config = config

    def __call__(self, outputs):
        user_feat, item_feat = outputs[0], outputs[1]
        user_feat, item_feat, neg_item_feat = outputs[0], outputs[1], outputs[2]
        loss_type = self.config.loss_type
        if self.config.neg_type == "batch_neg":
            neg_item_feat = item_feat
        # Calc Loss
        if self.config.loss_type == "hinge":
            pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True)  # [B, 1]
            neg = L.matmul(user_feat, item_feat, transpose_y=True)  # [B, B]
            neg = L.matmul(user_feat, neg_item_feat, transpose_y=True)  # [B, B]
            loss = L.reduce_mean(L.relu(neg - pos + self.config.margin))
        elif self.config.loss_type == "all_hinge":
            pos = L.reduce_sum(user_feat * item_feat, -1, keep_dim=True)  # [B, 1]
            all_pos = all_gather(pos)  # [B * n, 1]
            all_neg_item_feat = all_gather(neg_item_feat)  # [B * n, d]
            all_user_feat = all_gather(user_feat)  # [B * n, d]
            neg1 = L.matmul(user_feat, all_neg_item_feat, transpose_y=True)  # [B, B * n]
            neg2 = L.matmul(all_user_feat, neg_item_feat, transpose_y=True)  # [B * n, B]
            loss1 = L.reduce_mean(L.relu(neg1 - pos + self.config.margin))
            loss2 = L.reduce_mean(L.relu(neg2 - all_pos + self.config.margin))
            #loss = (loss1 + loss2) / 2
            loss = loss1 + loss2
        elif self.config.loss_type == "softmax":
            pass
            # TODO
......
......@@ -59,6 +59,8 @@ class ErnieModel(object):
    def __init__(self,
                 src_ids,
                 sentence_ids,
                 position_ids=None,
                 input_mask=None,
                 task_ids=None,
                 config=None,
                 weight_sharing=True,
......@@ -66,8 +68,10 @@ class ErnieModel(object):
name=""):
self._set_config(config, name, weight_sharing)
input_mask = self._build_input_mask(src_ids)
position_ids = self._build_position_ids(src_ids)
if position_ids is None:
position_ids = self._build_position_ids(src_ids)
if input_mask is None:
input_mask = self._build_input_mask(src_ids)
self._build_model(src_ids, position_ids, sentence_ids, task_ids,
input_mask)
self._debug_summary(input_mask)
......
......@@ -19,8 +19,6 @@ from contextlib import contextmanager
import paddle.fluid as fluid
import paddle.fluid.layers as L
import paddle.fluid.layers as layers
#import propeller.paddle as propeller
#from propeller import log
# determine this at the beginning
to_3d = lambda a: a  # will change later
......@@ -85,7 +83,7 @@ def multi_head_attention(queries,
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
    reshaped = layers.reshape(
        x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
    # permute the dimensions into:
    # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
......@@ -262,7 +260,6 @@ def encoder_layer(enc_input,
    with the post_process_layer to add residual connection, layer normalization
    and dropout.
    """
#L.Print(L.reduce_mean(enc_input), message='1')
    attn_output, ctx_multiheads_attn = multi_head_attention(
        pre_process_layer(
            enc_input,
......@@ -279,7 +276,6 @@ def encoder_layer(enc_input,
        attention_dropout,
        param_initializer=param_initializer,
        name=name + '_multi_head_att')
#L.Print(L.reduce_mean(attn_output), message='1')
    attn_output = post_process_layer(
        enc_input,
        attn_output,
......@@ -287,7 +283,6 @@ def encoder_layer(enc_input,
        prepostprocess_dropout,
        name=name + '_post_att')
#L.Print(L.reduce_mean(attn_output), message='2')
    ffd_output = positionwise_feed_forward(
        pre_process_layer(
            attn_output,
......@@ -300,14 +295,12 @@ def encoder_layer(enc_input,
        hidden_act,
        param_initializer=param_initializer,
        name=name + '_ffn')
#L.Print(L.reduce_mean(ffd_output), message='3')
    ret = post_process_layer(
        attn_output,
        ffd_output,
        postprocess_cmd,
        prepostprocess_dropout,
        name=name + '_post_ffn')
    #L.Print(L.reduce_mean(ret), message='4')
    return ret, ctx_multiheads_attn, ffd_output
......@@ -374,7 +367,7 @@ def encoder(enc_input,
    encoder_layer.
    """
    #global to_2d, to_3d #, batch, seqlen, dynamic_dim
    global to_2d, to_3d  #, batch, seqlen, dynamic_dim
    d_shape = L.shape(input_mask)
    pad_idx = build_pad_idx(input_mask)
    attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype)
......@@ -391,14 +384,14 @@ def encoder(enc_input,
# if attn_bias.dtype != enc_input.dtype:
# attn_bias = L.cast(attn_bias, enc_input.dtype)
    # def to_2d(t_3d):
    #     t_2d = L.gather_nd(t_3d, pad_idx)
    #     return t_2d
    def to_2d(t_3d):
        t_2d = L.gather_nd(t_3d, pad_idx)
        return t_2d

    # def to_3d(t_2d):
    #     t_3d = L.scatter_nd(
    #         pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
    #     return t_3d
    def to_3d(t_2d):
        t_3d = L.scatter_nd(
            pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
        return t_3d

    enc_input = to_2d(enc_input)
    all_hidden = []
......@@ -456,7 +449,7 @@ def graph_encoder(enc_input,
    encoder_layer.
    """
    #global to_2d, to_3d #, batch, seqlen, dynamic_dim
    global to_2d, to_3d  #, batch, seqlen, dynamic_dim
    d_shape = L.shape(input_mask)
    pad_idx = build_pad_idx(input_mask)
    attn_bias = build_graph_attn_bias(input_mask, n_head, enc_input.dtype, slot_seqlen)
......@@ -474,14 +467,14 @@ def graph_encoder(enc_input,
# if attn_bias.dtype != enc_input.dtype:
# attn_bias = L.cast(attn_bias, enc_input.dtype)
    # def to_2d(t_3d):
    #     t_2d = L.gather_nd(t_3d, pad_idx)
    #     return t_2d
    def to_2d(t_3d):
        t_2d = L.gather_nd(t_3d, pad_idx)
        return t_2d

    # def to_3d(t_2d):
    #     t_3d = L.scatter_nd(
    #         pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
    #     return t_3d
    def to_3d(t_2d):
        t_3d = L.scatter_nd(
            pad_idx, t_2d, shape=[d_shape[0], d_shape[1], d_model])
        return t_3d

    enc_input = to_2d(enc_input)
    all_hidden = []
......
......@@ -3,8 +3,6 @@ import paddle.fluid as F
import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
class ErnieSageV2(BaseNet):
......@@ -16,19 +14,52 @@ class ErnieSageV2(BaseNet):
        return inputs + [term_ids]

    def gnn_layer(self, gw, feature, hidden_size, act, initializer, learning_rate, name):
        def build_position_ids(src_ids, dst_ids):
            src_shape = L.shape(src_ids)
            src_batch = src_shape[0]
            src_seqlen = src_shape[1]
            dst_seqlen = src_seqlen - 1  # without cls

            src_position_ids = L.reshape(
                L.range(
                    0, src_seqlen, 1, dtype='int32'), [1, src_seqlen, 1],
                inplace=True)  # [1, slot_seqlen, 1]
            src_position_ids = L.expand(src_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen * num_b, 1]
            zero = L.fill_constant([1], dtype='int64', value=0)
            input_mask = L.cast(L.equal(src_ids, zero), "int32")  # assume pad id == 0 [B, slot_seqlen, 1]
            src_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

            dst_position_ids = L.reshape(
                L.range(
                    src_seqlen, src_seqlen + dst_seqlen, 1, dtype='int32'), [1, dst_seqlen, 1],
                inplace=True)  # [1, slot_seqlen, 1]
            dst_position_ids = L.expand(dst_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen, 1]
            dst_position_ids = dst_position_ids - src_pad_len  # [B, slot_seqlen, 1]

            position_ids = L.concat([src_position_ids, dst_position_ids], 1)
            position_ids = L.cast(position_ids, 'int64')
            position_ids.stop_gradient = True
            return position_ids
        def ernie_send(src_feat, dst_feat, edge_feat):
            """doc"""
            # input_ids
            cls = L.fill_constant_batch_size_like(src_feat["term_ids"], [-1, 1, 1], "int64", 1)
            src_ids = L.concat([cls, src_feat["term_ids"]], 1)
            dst_ids = dst_feat["term_ids"]

            # sent_ids
            sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
            term_ids = L.concat([src_ids, dst_ids], 1)

            # position_ids
            position_ids = build_position_ids(src_ids, dst_ids)

            term_ids.stop_gradient = True
            sent_ids.stop_gradient = True
            ernie = ErnieModel(
                term_ids, sent_ids,
                term_ids, sent_ids, position_ids,
                config=self.config.ernie_config)
            feature = ernie.get_pooled_output()
            return feature
......
......@@ -18,7 +18,6 @@ import paddle.fluid.layers as L
from models.base import BaseNet, BaseGNNModel
from models.ernie_model.ernie import ErnieModel
from models.ernie_model.ernie import ErnieGraphModel
from models.ernie_model.ernie import ErnieConfig
from models.message_passing import copy_send
......
......@@ -53,6 +53,7 @@ def dump_graph(args):
    term_file = io.open(os.path.join(args.outpath, "terms.txt"), "w", encoding=args.encoding)
    terms = []
    count = 0
    item_distribution = []

    with io.open(args.inpath, encoding=args.encoding) as f:
        edges = []
......@@ -66,6 +67,7 @@ def dump_graph(args):
                str2id[s] = count
                count += 1
                term_file.write(str(col_idx) + "\t" + col + "\n")
                item_distribution.append(0)
            slots.append(str2id[s])
......@@ -74,6 +76,7 @@ def dump_graph(args):
            neg_samples.append(slots[2:])
            edges.append((src, dst))
            edges.append((dst, src))
            item_distribution[dst] += 1

    term_file.close()
    edges = np.array(edges, dtype="int64")
......@@ -82,12 +85,14 @@ def dump_graph(args):
log.info("building graph...")
graph = pgl.graph.Graph(num_nodes=num_nodes, edges=edges)
indegree = graph.indegree()
graph.indegree()
graph.outdegree()
graph.dump(args.outpath)
# dump alias sample table
sqrt_indegree = np.sqrt(indegree)
distribution = 1. * sqrt_indegree / sqrt_indegree.sum()
item_distribution = np.array(item_distribution)
item_distribution = np.sqrt(item_distribution)
distribution = 1. * item_distribution / item_distribution.sum()
alias, events = alias_sample_build_table(distribution)
np.save(os.path.join(args.outpath, "alias.npy"), alias)
np.save(os.path.join(args.outpath, "events.npy"), events)
......
......@@ -13,7 +13,7 @@
# limitations under the License.
"""Generate pgl apis
"""
__version__ = "1.0.2"
__version__ = "1.1.0"
from pgl import layers
from pgl import graph_wrapper
from pgl import graph
......