Unverified commit 8a9d286c authored by wuzhihua, committed by GitHub

Merge branch 'master' into doc_v7

......@@ -149,11 +149,13 @@ class Model(object):
return optimizer_i
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = envs.get_global_env("hyper_parameters.optimizer", None,
self._namespace)
return self._build_optimizer(optimizer, learning_rate)
opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
opt_lr = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
opt_strategy = envs.get_global_env(
"hyper_parameters.optimizer.strategy")
return self._build_optimizer(opt_name, opt_lr, opt_strategy)
def input_data(self, is_infer=False, **kwargs):
name = "dataset." + kwargs.get("dataset_name") + "."
......
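For reference, the refactored optimizer() above now reads its settings nested under hyper_parameters.optimizer in config.yaml; a minimal sketch matching the keys it queries (values are illustrative, taken from the config diffs later in this commit):

```yaml
hyper_parameters:
  optimizer:
    class: Adagrad        # read as hyper_parameters.optimizer.class
    learning_rate: 0.001  # read as hyper_parameters.optimizer.learning_rate
    strategy: async       # read as hyper_parameters.optimizer.strategy
```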
......@@ -167,6 +167,7 @@ class SingleInfer(TranspileTrainer):
model = envs.lazy_instance_by_fliename(
model_path, "Model")(self._env)
model._infer_data_var = model.input_data(
is_infer=True,
dataset_name=model_dict["dataset_name"])
if envs.get_global_env("dataset." + dataset_name +
".type") == "DataLoader":
......
......@@ -147,11 +147,6 @@ class SingleTrainer(TranspileTrainer):
startup_program = fluid.Program()
scope = fluid.Scope()
dataset_name = model_dict["dataset_name"]
opt_name = envs.get_global_env("hyper_parameters.optimizer.class")
opt_lr = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
opt_strategy = envs.get_global_env(
"hyper_parameters.optimizer.strategy")
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
......@@ -168,8 +163,7 @@ class SingleTrainer(TranspileTrainer):
self._get_dataloader(dataset_name,
model._data_loader)
model.net(model._data_var, False)
optimizer = model._build_optimizer(opt_name, opt_lr,
opt_strategy)
optimizer = model.optimizer()
optimizer.minimize(model._cost)
self._model[model_dict["name"]][0] = train_program
self._model[model_dict["name"]][1] = startup_program
......@@ -234,10 +228,12 @@ class SingleTrainer(TranspileTrainer):
scope = self._model[model_name][2]
program = self._model[model_name][0]
reader = self._dataset[reader_name]
threads = model_dict.get("thread_num", 1)
with fluid.scope_guard(scope):
self._exe.train_from_dataset(
program=program,
dataset=reader,
thread=threads,
fetch_list=fetch_vars,
fetch_info=fetch_alias,
print_period=fetch_period)
......@@ -247,8 +243,23 @@ class SingleTrainer(TranspileTrainer):
model_name = model_dict["name"]
model_class = self._model[model_name][3]
program = self._model[model_name][0].clone()
_build_strategy = fluid.BuildStrategy()
_exe_strategy = fluid.ExecutionStrategy()
# 0: kCoeffNumDevice; 1: One; 2: Customized
_build_strategy.gradient_scale_strategy = model_dict.get(
"gradient_scale_strategy", 0)
if "thread_num" in model_dict and model_dict["thread_num"] > 1:
_build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
_exe_strategy.num_threads = model_dict["thread_num"]
os.environ['CPU_NUM'] = str(_exe_strategy.num_threads)
program = fluid.compiler.CompiledProgram(program).with_data_parallel(
loss_name=model_class.get_avg_cost().name)
loss_name=model_class.get_avg_cost().name,
build_strategy=_build_strategy,
exec_strategy=_exe_strategy)
fetch_vars = []
fetch_alias = []
fetch_period = int(
......
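The thread_num and gradient_scale_strategy values consumed above come from the phase entry (model_dict) in config.yaml; a minimal sketch, with field names taken from the fasttext config later in this commit and illustrative values:

```yaml
phase:
  - name: phase1
    model: "{workspace}/model.py"   # user-defined model
    dataset_name: dataset_train     # select dataset by name
    thread_num: 2                   # >1 switches to the Reduce strategy and sets CPU_NUM
    gradient_scale_strategy: 1      # 0: kCoeffNumDevice; 1: One; 2: Customized
```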
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,28 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.classification"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.classification"
dataset:
- name: data1
batch_size: 5
type: DataLoader
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
is_sparse: False
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
mode: runner1
model:
models: "{workspace}/model.py"
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: data1
thread_num: 1
......@@ -27,19 +27,27 @@ class Model(ModelBase):
self.emb_dim = 8
self.hid_dim = 128
self.class_dim = 2
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False)
def train_net(self):
""" network definition """
def input_data(self, is_infer=False, **kwargs):
data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [data, label, seq_len]
self._data_var = [data, label, seq_len]
def net(self, input, is_infer=False):
""" network definition """
data = input[0]
label = input[1]
seq_len = input[2]
# embedding layer
emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim])
emb = fluid.embedding(
input=data,
size=[self.dict_dim, self.emb_dim],
is_sparse=self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=seq_len)
# convolution layer
conv = fluid.nets.sequence_conv_pool(
......@@ -59,19 +67,8 @@ class Model(ModelBase):
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
self.cost = avg_cost
self._metrics["acc"] = acc
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self._metrics
def optimizer(self):
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
def infer_net(self):
self.train_net()
self._cost = avg_cost
if is_infer:
self._infer_results["acc"] = acc
else:
self._metrics["acc"] = acc
......@@ -22,7 +22,7 @@ class TrainReader(Reader):
pass
def _process_line(self, l):
l = l.strip().split(" ")
l = l.strip().split()
data = l[0:10]
seq_len = l[10:11]
label = l[11:]
......@@ -37,8 +37,6 @@ class TrainReader(Reader):
data = [int(i) for i in data]
label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter
......@@ -37,7 +37,18 @@
<img align="center" src="../../doc/imgs/cnn-ckim2014.png">
<p>
## Tutorial
## Tutorial (Quick Start)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
## Tutorial (Reproducing Paper Results)
### Note
To make it easy to run every model end to end, we provide sample data under each model directory. To reproduce the results reported in this README, please use the scripts below to download the corresponding datasets and run the data preprocessing.
### Data Preprocessing
**(1)TagSpace**
......@@ -64,20 +75,42 @@ mv test.csv raw_big_test_data
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
```
**(2)Classification**
### Training
```
cd models/contentunderstanding/tagspace
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to your own config file
```
### Inference
```
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml (see the sketch after this code block)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml
```
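A minimal sketch of what the infer-side runner in config.yaml could look like after these edits (field names follow the runner configs elsewhere in this commit; the exact values, e.g. the checkpoint path, are illustrative):

```yaml
mode: infer_runner
runner:
  - name: infer_runner
    class: single_infer
    epochs: 1
    device: cpu
    init_model_path: "increment/0"  # load a checkpoint saved by the train runner
```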
### Training
**(2)Classification**
### Training
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
cd models/contentunderstanding/classification
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to your own config file
```
### Inference
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml
```
## Performance Comparison
......
......@@ -12,38 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
train:
trainer:
# for cluster training
strategy: "async"
workspace: "paddlerec.models.contentunderstanding.tagspace"
epochs: 10
workspace: "paddlerec.models.contentunderstanding.tagspace"
dataset:
- name: sample_1
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
reader:
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
model:
models: "{workspace}/model.py"
hyper_parameters:
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
mode: runner1
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
......@@ -26,26 +26,30 @@ class Model(ModelBase):
ModelBase.__init__(self, config)
self.cost = None
self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None,
self._namespace)
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None,
self._namespace)
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace)
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace)
self.win_size = envs.get_global_env("win_size", None, self._namespace)
self.margin = envs.get_global_env("margin", None, self._namespace)
self.neg_size = envs.get_global_env("neg_size", None, self._namespace)
self.vocab_text_size = envs.get_global_env(
"hyper_parameters.vocab_text_size")
self.vocab_tag_size = envs.get_global_env(
"hyper_parameters.vocab_tag_size")
self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim")
self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim")
self.win_size = envs.get_global_env("hyper_parameters.win_size")
self.margin = envs.get_global_env("hyper_parameters.margin")
self.neg_size = envs.get_global_env("hyper_parameters.neg_size")
def train_net(self):
""" network"""
def input_data(self, is_infer=False, **kwargs):
text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data(
name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64')
return [text, pos_tag, neg_tag]
self._data_var = [text, pos_tag, neg_tag]
def net(self, input, is_infer=False):
""" network"""
text = input[0]
pos_tag = input[1]
neg_tag = input[2]
text_emb = fluid.embedding(
input=text,
......@@ -97,22 +101,11 @@ class Model(ModelBase):
avg_cost = nn.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less)
self.cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
self._cost = avg_cost
def infer_net(self, parameter_list):
self.train_net()
if is_infer:
self._infer_results["correct"] = correct
self._infer_results["cos_pos"] = cos_pos
else:
self._metrics["correct"] = correct
self._metrics["cos_pos"] = cos_pos
......@@ -11,44 +11,66 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
reader:
batch_size: 1
class: "{workspace}/synthetic_evaluate_reader.py"
test_data_path: "{workspace}/data/train"
train:
trainer:
# for cluster training
strategy: "async"
epochs: 4
workspace: "paddlerec.models.match.dssm"
reader:
batch_size: 4
class: "{workspace}/synthetic_reader.py"
train_data_path: "{workspace}/data/train"
workspace: "paddlerec.models.match.dssm"
dataset:
- name: dataset_train
batch_size: 4
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/synthetic_reader.py"
- name: dataset_infer
batch_size: 1
type: QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/synthetic_evaluate_reader.py"
model:
models: "{workspace}/model.py"
hyper_parameters:
TRIGRAM_D: 1000
NEG: 4
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
learning_rate: 0.01
optimizer: sgd
hyper_parameters:
optimizer:
class: sgd
learning_rate: 0.01
strategy: async
trigram_d: 1000
neg_num: 4
fc_sizes: [300, 300, 128]
fc_acts: ['tanh', 'tanh', 'tanh']
save:
increment:
dirname: "increment"
epoch_interval: 2
save_last: True
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 4
# device to run training or infer
device: cpu
save_checkpoint_interval: 2 # save model interval of epochs
save_inference_interval: 4 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 2
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/2" # load model path
inference:
dirname: "inference"
epoch_interval: 4
feed_varnames: ["query", "doc_pos"]
fetch_varnames: ["cos_sim_0.tmp_0"]
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -22,45 +22,39 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
Neg = envs.get_global_env("hyper_parameters.NEG", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
def _init_hyper_parameters(self):
self.trigram_d = envs.get_global_env("hyper_parameters.trigram_d")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
self.learning_rate = envs.get_global_env(
"hyper_parameters.learning_rate")
def input_data(self, is_infer=False, **kwargs):
query = fluid.data(
name="query",
shape=[-1, self.trigram_d],
dtype='float32',
lod_level=0)
doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
shape=[-1, self.trigram_d],
dtype='float32',
lod_level=0)
self.doc_negs = [
if is_infer:
return [query, doc_pos]
doc_negs = [
fluid.data(
name="doc_neg_" + str(i),
shape=[-1, TRIGRAM_D],
shape=[-1, self.trigram_d],
dtype="float32",
lod_level=0) for i in range(Neg)
lod_level=0) for i in range(self.neg_num)
]
self._data_var.append(self.query)
self._data_var.append(self.doc_pos)
for input in self.doc_negs:
self._data_var.append(input)
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def net(self, is_infer=False):
hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes", None,
self._namespace)
hidden_acts = envs.get_global_env("hyper_parameters.fc_acts", None,
self._namespace)
return [query, doc_pos] + doc_negs
def net(self, inputs, is_infer=False):
def fc(data, hidden_layers, hidden_acts, names):
fc_inputs = [data]
for i in range(len(hidden_layers)):
......@@ -77,71 +71,30 @@ class Model(ModelBase):
fc_inputs.append(out)
return fc_inputs[-1]
query_fc = fc(self.query, hidden_layers, hidden_acts,
query_fc = fc(inputs[0], self.hidden_layers, self.hidden_acts,
['query_l1', 'query_l2', 'query_l3'])
doc_pos_fc = fc(self.doc_pos, hidden_layers, hidden_acts,
doc_pos_fc = fc(inputs[1], self.hidden_layers, self.hidden_acts,
['doc_pos_l1', 'doc_pos_l2', 'doc_pos_l3'])
self.R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
R_Q_D_p = fluid.layers.cos_sim(query_fc, doc_pos_fc)
if is_infer:
self._infer_results["query_doc_sim"] = R_Q_D_p
return
R_Q_D_ns = []
for i, doc_neg in enumerate(self.doc_negs):
doc_neg_fc_i = fc(doc_neg, hidden_layers, hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
for i in range(len(inputs) - 2):
doc_neg_fc_i = fc(
inputs[i + 2], self.hidden_layers, self.hidden_acts, [
'doc_neg_l1_' + str(i), 'doc_neg_l2_' + str(i),
'doc_neg_l3_' + str(i)
])
R_Q_D_ns.append(fluid.layers.cos_sim(query_fc, doc_neg_fc_i))
concat_Rs = fluid.layers.concat(
input=[self.R_Q_D_p] + R_Q_D_ns, axis=-1)
concat_Rs = fluid.layers.concat(input=[R_Q_D_p] + R_Q_D_ns, axis=-1)
prob = fluid.layers.softmax(concat_Rs, axis=1)
hit_prob = fluid.layers.slice(
prob, axes=[0, 1], starts=[0, 0], ends=[4, 1])
loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
self.avg_cost = fluid.layers.mean(x=loss)
def infer_results(self):
self._infer_results['query_doc_sim'] = self.R_Q_D_p
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["LOSS"] = self.avg_cost
def train_net(self):
self.input()
self.net(is_infer=False)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.SGD(learning_rate)
return optimizer
def infer_input(self):
TRIGRAM_D = envs.get_global_env("hyper_parameters.TRIGRAM_D", None,
self._namespace)
self.query = fluid.data(
name="query", shape=[-1, TRIGRAM_D], dtype='float32', lod_level=0)
self.doc_pos = fluid.data(
name="doc_pos",
shape=[-1, TRIGRAM_D],
dtype='float32',
lod_level=0)
self._infer_data_var = [self.query, self.doc_pos]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(is_infer=True)
self.infer_results()
avg_cost = fluid.layers.mean(x=loss)
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
......@@ -16,7 +16,7 @@ from __future__ import print_function
from paddlerec.core.reader import Reader
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
pass
......
......@@ -11,49 +11,73 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.match.multiview-simnet"
reader:
batch_size: 2
class: "{workspace}/evaluate_reader.py"
test_data_path: "{workspace}/data/test"
train:
trainer:
# for cluster training
strategy: "async"
# workspace
workspace: "paddlerec.models.match.multiview-simnet"
epochs: 2
workspace: "paddlerec.models.match.multiview-simnet"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 2
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
sparse_slots: "1 2 3"
- name: dataset_infer # name
batch_size: 2
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
sparse_slots: "1 2"
reader:
batch_size: 2
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train"
dataset_class: "DataLoader"
# hyper parameters of user-defined network
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.0001
strategy: async
query_encoder: "bow"
title_encoder: "bow"
query_encode_dim: 128
title_encode_dim: 128
sparse_feature_dim: 1000001
embedding_dim: 128
hidden_size: 128
margin: 0.1
model:
models: "{workspace}/model.py"
hyper_parameters:
use_DataLoader: True
query_encoder: "bow"
title_encoder: "bow"
query_encode_dim: 128
title_encode_dim: 128
query_slots: 1
title_slots: 1
sparse_feature_dim: 1000001
embedding_dim: 128
hidden_size: 128
learning_rate: 0.0001
optimizer: adam
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 1
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/0" # load model path
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -99,143 +99,89 @@ class SimpleEncoderFactory(object):
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.init_config()
def init_config(self):
self._fetch_interval = 1
query_encoder = envs.get_global_env("hyper_parameters.query_encoder",
None, self._namespace)
title_encoder = envs.get_global_env("hyper_parameters.title_encoder",
None, self._namespace)
query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim", None, self._namespace)
title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim", None, self._namespace)
query_slots = envs.get_global_env("hyper_parameters.query_slots", None,
self._namespace)
title_slots = envs.get_global_env("hyper_parameters.title_slots", None,
self._namespace)
factory = SimpleEncoderFactory()
self.query_encoders = [
factory.create(query_encoder, query_encode_dim)
for i in range(query_slots)
]
self.title_encoders = [
factory.create(title_encoder, title_encode_dim)
for i in range(title_slots)
]
def _init_hyper_parameters(self):
self.query_encoder = envs.get_global_env(
"hyper_parameters.query_encoder")
self.title_encoder = envs.get_global_env(
"hyper_parameters.title_encoder")
self.query_encode_dim = envs.get_global_env(
"hyper_parameters.query_encode_dim")
self.title_encode_dim = envs.get_global_env(
"hyper_parameters.title_encode_dim")
self.emb_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim",
None, self._namespace)
"hyper_parameters.sparse_feature_dim")
self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim")
self.emb_shape = [self.emb_size, self.emb_dim]
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size",
None, self._namespace)
self.margin = 0.1
def input(self, is_train=True):
self.q_slots = [
fluid.data(
name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64')
for i in range(len(self.query_encoders))
]
self.pt_slots = [
fluid.data(
name="%d" % (i + len(self.query_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
]
if is_train == False:
return self.q_slots + self.pt_slots
self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size")
self.margin = envs.get_global_env("hyper_parameters.margin")
self.nt_slots = [
fluid.data(
name="%d" %
(i + len(self.query_encoders) + len(self.title_encoders)),
shape=[None, 1],
lod_level=1,
dtype='int64') for i in range(len(self.title_encoders))
def net(self, input, is_infer=False):
factory = SimpleEncoderFactory()
self.q_slots = self._sparse_data_var[0:1]
self.query_encoders = [
factory.create(self.query_encoder, self.query_encode_dim)
for _ in self.q_slots
]
return self.q_slots + self.pt_slots + self.nt_slots
def train_input(self):
res = self.input()
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
label_ones = fluid.layers.fill_constant_batch_size_like(
input=x, dtype='float32', shape=[-1, 1], value=1.0)
correct = fluid.layers.reduce_sum(less)
total = fluid.layers.reduce_sum(label_ones)
acc = fluid.layers.elementwise_div(correct, total)
return acc
def net(self):
q_embs = [
fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
nt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.nt_slots
]
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
nt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
pt_concat = fluid.layers.concat(pt_encodes)
nt_concat = fluid.layers.concat(nt_encodes)
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
self.pt_slots = self._sparse_data_var[1:2]
self.title_encoders = [
factory.create(self.title_encoder, self.title_encode_dim)
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
pt_concat = fluid.layers.concat(pt_encodes)
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
if is_infer:
self._infer_results['query_pt_sim'] = cos_pos
return
self.nt_slots = self._sparse_data_var[2:3]
nt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.nt_slots
]
nt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(nt_embs)
]
nt_concat = fluid.layers.concat(nt_encodes)
nt_hid = fluid.layers.fc(nt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = fluid.layers.cos_sim(q_hid, pt_hid)
cos_neg = fluid.layers.cos_sim(q_hid, nt_hid)
# pairwise hinge_loss
......@@ -254,72 +200,16 @@ class Model(ModelBase):
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
loss_part2)
self.avg_cost = fluid.layers.mean(loss_part3)
self._cost = fluid.layers.mean(loss_part3)
self.acc = self.get_acc(cos_neg, cos_pos)
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["loss"] = self.avg_cost
self._metrics["loss"] = self._cost
self._metrics["acc"] = self.acc
def train_net(self):
self.train_input()
self.net()
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
return optimizer
def infer_input(self):
res = self.input(is_train=False)
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
# lookup embedding for each slot
q_embs = [
fluid.embedding(
input=query, size=self.emb_shape, param_attr="emb")
for query in self.q_slots
]
pt_embs = [
fluid.embedding(
input=title, size=self.emb_shape, param_attr="emb")
for title in self.pt_slots
]
# encode each embedding field with encoder
q_encodes = [
self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
]
pt_encodes = [
self.title_encoders[i].forward(emb)
for i, emb in enumerate(pt_embs)
]
# concat multi view for query, pos_title, neg_title
q_concat = fluid.layers.concat(q_encodes)
pt_concat = fluid.layers.concat(pt_encodes)
# projection of hidden layer
q_hid = fluid.layers.fc(q_concat,
size=self.hidden_size,
param_attr='q_fc.w',
bias_attr='q_fc.b')
pt_hid = fluid.layers.fc(pt_concat,
size=self.hidden_size,
param_attr='t_fc.w',
bias_attr='t_fc.b')
# cosine of hidden layers
cos = fluid.layers.cos_sim(q_hid, pt_hid)
self._infer_results['query_pt_sim'] = cos
def get_acc(self, x, y):
less = tensor.cast(cf.less_than(x, y), dtype='float32')
label_ones = fluid.layers.fill_constant_batch_size_like(
input=x, dtype='float32', shape=[-1, 1], value=1.0)
correct = fluid.layers.reduce_sum(less)
total = fluid.layers.reduce_sum(label_ones)
acc = fluid.layers.elementwise_div(correct, total)
return acc
......@@ -31,9 +31,21 @@
<img align="center" src="../../doc/imgs/multiview-simnet.png">
<p>
## Tutorial
### Training & Inference
## Tutorial (Quick Start)
### Training
```shell
python -m paddlerec.run -m paddlerec.models.match.dssm # dssm
python -m paddlerec.run -m paddlerec.models.match.multiview-simnet # multiview-simnet
```
### Inference
```shell
# in the model's config.yaml, set workspace to the absolute path of the current directory
# in the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in the infer_runner entry, set class: single_infer
# switch the phase section to the infer configuration, following the comments in config.yaml (see the sketch after this code block)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml # taking dssm as an example
```
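Taking dssm as an example, a minimal sketch of the config.yaml changes this amounts to (based on the dssm config shown earlier in this commit; the phase entry mirrors the commented-out phase2 block there):

```yaml
mode: infer_runner   # was: mode: train_runner
phase:
  - name: phase2
    model: "{workspace}/model.py"   # user-defined model
    dataset_name: dataset_infer     # select dataset by name
    thread_num: 1
```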
......@@ -59,7 +59,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.cat_feat_idx_dict_list = [{} for _ in range(26)]
# TODO: set vocabulary dictionary
vocab_dir = "./vocab/"
vocab_dir = "./sample_data/vocab/"
for i in range(26):
lookup_idx = 1 # remain 0 for default value
for line in open(
......
......@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import paddle.fluid.incubate.data_generator as dg
try:
import cPickle as pickle
except ImportError:
......@@ -44,7 +46,7 @@ class TrainReader(dg.MultiSlotDataGenerator):
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_name = "sample_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# global settings
debug: false
workspace: "paddlerec.models.rank.deepfm"
dataset:
- name: train_sample
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
- name: infer_sample
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/sample_data/train"
sparse_slots: "label feat_idx"
dense_slots: "feat_value:39"
hyper_parameters:
optimizer:
class: SGD
learning_rate: 0.0001
sparse_feature_number: 1086460
sparse_feature_dim: 9
num_field: 39
reg: 0.001
mode: train_runner
# if infer, change mode to "infer_runner" and change phase to "infer_phase"
runner:
- name: train_runner
trainer_class: single_train
epochs: 2
device: cpu
init_model_path: ""
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
print_interval: 1
- name: infer_runner
trainer_class: single_infer
epochs: 1
device: cpu
init_model_path: "increment/0"
print_interval: 1
phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: train_sample
thread_num: 1
#- name: infer_phase
# model: "{workspace}/model.py"
# dataset_name: infer_sample
# thread_num: 1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import sys
LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)
from paddlerec.tools.tools import download_file_and_uncompress, download_file
if __name__ == '__main__':
url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"
print("download and extract starting...")
download_file_and_uncompress(url)
download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
print("download and extract finished")
print("preprocessing...")
os.system("python preprocess.py")
print("preprocess done")
shutil.rmtree("raw_data")
print("done")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
import paddle.fluid.incubate.data_generator as dg
try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46,
231, 4008, 7393
]
self.cont_diff_ = [
self.cont_max_[i] - self.cont_min_[i]
for i in range(len(self.cont_min_))
]
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "sample_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
feat_idx = []
feat_value = []
for idx in self.continuous_range_:
if features[idx] == '':
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[idx])
feat_value.append(
(float(features[idx]) - self.cont_min_[idx - 1]) /
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
if features[idx] == '' or features[idx] not in self.feat_dict_:
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[features[idx]])
feat_value.append(1.0)
label = [int(features[0])]
return feat_idx, feat_value, label
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value),
('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print(s.strip())
yield None
return data_iter
reader = TrainReader(
"../config.yaml") # run this file in original folder to find config.yaml
reader.init()
reader.run_from_stdin()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy
from collections import Counter
import shutil
import pickle
def get_raw_data():
if not os.path.isdir('raw_data'):
os.mkdir('raw_data')
fin = open('train.txt', 'r')
fout = open('raw_data/part-0', 'w')
for line_idx, line in enumerate(fin):
if line_idx % 200000 == 0 and line_idx != 0:
fout.close()
cur_part_idx = int(line_idx / 200000)
fout = open('raw_data/part-' + str(cur_part_idx), 'w')
fout.write(line)
fout.close()
fin.close()
def split_data():
split_rate_ = 0.9
dir_train_file_idx_ = 'aid_data/train_file_idx.txt'
filelist_ = [
'raw_data/part-%d' % x for x in range(len(os.listdir('raw_data')))
]
if not os.path.exists(dir_train_file_idx_):
train_file_idx = list(
numpy.random.choice(
len(filelist_), int(len(filelist_) * split_rate_), False))
with open(dir_train_file_idx_, 'w') as fout:
fout.write(str(train_file_idx))
else:
with open(dir_train_file_idx_, 'r') as fin:
train_file_idx = eval(fin.read())
for idx in range(len(filelist_)):
if idx in train_file_idx:
shutil.move(filelist_[idx], 'train_data')
else:
shutil.move(filelist_[idx], 'test_data')
def get_feat_dict():
freq_ = 10
dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2'
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
if not os.path.exists(dir_feat_dict_):
# Count the number of occurrences of discrete features
feat_cnt = Counter()
with open('train.txt', 'r') as fin:
for line_idx, line in enumerate(fin):
if line_idx % 100000 == 0:
print('generating feature dict', line_idx / 45000000)
features = line.rstrip('\n').split('\t')
for idx in categorical_range_:
if features[idx] == '': continue
feat_cnt.update([features[idx]])
# Only retain discrete features with high frequency
dis_feat_set = set()
for feat, ot in feat_cnt.items():
if ot >= freq_:
dis_feat_set.add(feat)
# Create a dictionary for continuous and discrete features
feat_dict = {}
tc = 1
# Continuous features
for idx in continuous_range_:
feat_dict[idx] = tc
tc += 1
for feat in dis_feat_set:
feat_dict[feat] = tc
tc += 1
# Save dictionary
with open(dir_feat_dict_, 'wb') as fout:
pickle.dump(feat_dict, fout, protocol=2)
print('args.num_feat ', len(feat_dict) + 1)
if __name__ == '__main__':
if not os.path.isdir('train_data'):
os.mkdir('train_data')
if not os.path.isdir('test_data'):
os.mkdir('test_data')
if not os.path.isdir('aid_data'):
os.mkdir('aid_data')
get_raw_data()
split_data()
get_feat_dict()
print('Done!')
python download_preprocess.py
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None)
self.num_field = envs.get_global_env("hyper_parameters.num_field",
None)
self.reg = envs.get_global_env("hyper_parameters.reg", 1e-4)
def net(self, inputs, is_infer=False):
init_value_ = 0.1
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
# ------------------------- network input --------------------------
raw_feat_idx = self._sparse_data_var[1]
raw_feat_value = self._dense_data_var[0]
self.label = self._sparse_data_var[0]
feat_idx = raw_feat_idx
feat_value = fluid.layers.reshape(
raw_feat_value, [-1, self.num_field]) # None * num_field * 1
first_weights_re = fluid.embedding(
input=feat_idx,
is_sparse=True,
is_distributed=is_distributed,
dtype='float32',
size=[self.sparse_feature_number + 1, 1],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.TruncatedNormalInitializer(
loc=0.0, scale=init_value_),
regularizer=fluid.regularizer.L1DecayRegularizer(self.reg)))
first_weights = fluid.layers.reshape(
first_weights_re,
shape=[-1, self.num_field]) # None * num_field * 1
y_first_order = fluid.layers.reduce_sum(
first_weights * feat_value, 1, keep_dim=True)
b_linear = fluid.layers.create_parameter(
shape=[1],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(value=0))
self.predict = fluid.layers.sigmoid(y_first_order + b_linear)
cost = fluid.layers.log_loss(
input=self.predict, label=fluid.layers.cast(self.label, "float32"))
avg_cost = fluid.layers.reduce_sum(cost)
self._cost = avg_cost
predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1)
label_int = fluid.layers.cast(self.label, 'int64')
auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d,
label=label_int,
slide_steps=0)
self._metrics["AUC"] = auc_var
self._metrics["BATCH_AUC"] = batch_auc_var
if is_infer:
self._infer_results["AUC"] = auc_var
......@@ -11,7 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import yaml, os
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.fasttext"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
word_ngrams_path: "{workspace}/data/dict/word_ngrams_id.txt"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/evaluate_reader.py"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 227915
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
min_n: 3
max_n: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
gradient_scale_strategy: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain
Athens Greece Moscow Russia
Athens Greece Oslo Norway
Athens Greece Ottawa Canada
Athens Greece Paris France
Athens Greece Rome Italy
Athens Greece Stockholm Sweden
Athens Greece Tehran Iran
Athens Greece Tokyo Japan
Baghdad Iraq Bangkok Thailand
Baghdad Iraq Beijing China
Baghdad Iraq Berlin Germany
Baghdad Iraq Bern Switzerland
Baghdad Iraq Cairo Egypt
Baghdad Iraq Canberra Australia
Baghdad Iraq Hanoi Vietnam
Baghdad Iraq Havana Cuba
Baghdad Iraq Helsinki Finland
Baghdad Iraq Islamabad Pakistan
Baghdad Iraq Kabul Afghanistan
Baghdad Iraq London England
Baghdad Iraq Madrid Spain
Baghdad Iraq Moscow Russia
Baghdad Iraq Oslo Norway
Baghdad Iraq Ottawa Canada
Baghdad Iraq Paris France
Baghdad Iraq Rome Italy
Baghdad Iraq Stockholm Sweden
Baghdad Iraq Tehran Iran
Baghdad Iraq Tokyo Japan
Baghdad Iraq Athens Greece
Bangkok Thailand Beijing China
Bangkok Thailand Berlin Germany
Bangkok Thailand Bern Switzerland
Bangkok Thailand Cairo Egypt
Bangkok Thailand Canberra Australia
Bangkok Thailand Hanoi Vietnam
Bangkok Thailand Havana Cuba
Bangkok Thailand Helsinki Finland
Bangkok Thailand Islamabad Pakistan
Bangkok Thailand Kabul Afghanistan
Bangkok Thailand London England
Bangkok Thailand Madrid Spain
Bangkok Thailand Moscow Russia
Bangkok Thailand Oslo Norway
Bangkok Thailand Ottawa Canada
Bangkok Thailand Paris France
Bangkok Thailand Rome Italy
Bangkok Thailand Stockholm Sweden
Bangkok Thailand Tehran Iran
Bangkok Thailand Tokyo Japan
Bangkok Thailand Athens Greece
Bangkok Thailand Baghdad Iraq
Beijing China Berlin Germany
Beijing China Bern Switzerland
Beijing China Cairo Egypt
Beijing China Canberra Australia
Beijing China Hanoi Vietnam
Beijing China Havana Cuba
Beijing China Helsinki Finland
Beijing China Islamabad Pakistan
Beijing China Kabul Afghanistan
Beijing China London England
Beijing China Madrid Spain
Beijing China Moscow Russia
Beijing China Oslo Norway
Beijing China Ottawa Canada
Beijing China Paris France
Beijing China Rome Italy
Beijing China Stockholm Sweden
Beijing China Tehran Iran
Beijing China Tokyo Japan
Beijing China Athens Greece
Beijing China Baghdad Iraq
Beijing China Bangkok Thailand
Berlin Germany Bern Switzerland
Berlin Germany Cairo Egypt
Berlin Germany Canberra Australia
Berlin Germany Hanoi Vietnam
Berlin Germany Havana Cuba
Berlin Germany Helsinki Finland
Berlin Germany Islamabad Pakistan
Berlin Germany Kabul Afghanistan
Berlin Germany London England
Berlin Germany Madrid Spain
Berlin Germany Moscow Russia
Berlin Germany Oslo Norway
Berlin Germany Ottawa Canada
Berlin Germany Paris France
Berlin Germany Rome Italy
Berlin Germany Stockholm Sweden
Berlin Germany Tehran Iran
Berlin Germany Tokyo Japan
Berlin Germany Athens Greece
Berlin Germany Baghdad Iraq
Berlin Germany Bangkok Thailand
Berlin Germany Beijing China
Bern Switzerland Cairo Egypt
Bern Switzerland Canberra Australia
Bern Switzerland Hanoi Vietnam
Bern Switzerland Havana Cuba
Bern Switzerland Helsinki Finland
Bern Switzerland Islamabad Pakistan
Bern Switzerland Kabul Afghanistan
Bern Switzerland London England
Bern Switzerland Madrid Spain
Bern Switzerland Moscow Russia
Bern Switzerland Oslo Norway
Bern Switzerland Ottawa Canada
Bern Switzerland Paris France
Bern Switzerland Rome Italy
Bern Switzerland Stockholm Sweden
Bern Switzerland Tehran Iran
Bern Switzerland Tokyo Japan
Bern Switzerland Athens Greece
Bern Switzerland Baghdad Iraq
Bern Switzerland Bangkok Thailand
Bern Switzerland Beijing China
Bern Switzerland Berlin Germany
Cairo Egypt Canberra Australia
Cairo Egypt Hanoi Vietnam
Cairo Egypt Havana Cuba
Cairo Egypt Helsinki Finland
Cairo Egypt Islamabad Pakistan
Cairo Egypt Kabul Afghanistan
Cairo Egypt London England
Cairo Egypt Madrid Spain
Cairo Egypt Moscow Russia
Cairo Egypt Oslo Norway
Cairo Egypt Ottawa Canada
Cairo Egypt Paris France
Cairo Egypt Rome Italy
Cairo Egypt Stockholm Sweden
Cairo Egypt Tehran Iran
Cairo Egypt Tokyo Japan
Cairo Egypt Athens Greece
Cairo Egypt Baghdad Iraq
Cairo Egypt Bangkok Thailand
Cairo Egypt Beijing China
Cairo Egypt Berlin Germany
Cairo Egypt Bern Switzerland
Canberra Australia Hanoi Vietnam
Canberra Australia Havana Cuba
Canberra Australia Helsinki Finland
Canberra Australia Islamabad Pakistan
Canberra Australia Kabul Afghanistan
Canberra Australia London England
Canberra Australia Madrid Spain
Canberra Australia Moscow Russia
Canberra Australia Oslo Norway
Canberra Australia Ottawa Canada
Canberra Australia Paris France
Canberra Australia Rome Italy
Canberra Australia Stockholm Sweden
Canberra Australia Tehran Iran
Canberra Australia Tokyo Japan
Canberra Australia Athens Greece
Canberra Australia Baghdad Iraq
Canberra Australia Bangkok Thailand
Canberra Australia Beijing China
Canberra Australia Berlin Germany
Canberra Australia Bern Switzerland
Canberra Australia Cairo Egypt
Hanoi Vietnam Havana Cuba
Hanoi Vietnam Helsinki Finland
Hanoi Vietnam Islamabad Pakistan
Hanoi Vietnam Kabul Afghanistan
Hanoi Vietnam London England
Hanoi Vietnam Madrid Spain
Hanoi Vietnam Moscow Russia
Hanoi Vietnam Oslo Norway
Hanoi Vietnam Ottawa Canada
Hanoi Vietnam Paris France
Hanoi Vietnam Rome Italy
Hanoi Vietnam Stockholm Sweden
Hanoi Vietnam Tehran Iran
Hanoi Vietnam Tokyo Japan
Hanoi Vietnam Athens Greece
Hanoi Vietnam Baghdad Iraq
Hanoi Vietnam Bangkok Thailand
Hanoi Vietnam Beijing China
Hanoi Vietnam Berlin Germany
Hanoi Vietnam Bern Switzerland
Hanoi Vietnam Cairo Egypt
Hanoi Vietnam Canberra Australia
Havana Cuba Helsinki Finland
Havana Cuba Islamabad Pakistan
183648 183648 183648 183648 65918 183648 94834 93002 71149 183648 89518 183648 68731 183648 183648 63766 183648 183648 63766 183648 183648 63690 63766 83291 183648 183648 63766 183648 183648 183648 63766 65918 183648 73068 71149 183648 183648 183648 65370 183648 183648 67665 63945 93281 71149 183648 139630 183648 183648 63766 183648 183648 183648 183648 65062 110843 175871 94491 183648 183648 183648 183648 89277 183648 78014 183648 183648 63766 69529 183648 183648 183648 102013 63766 139449 183648 183648 113280 87363 64479 183648 183648 183648 183648 183648 183648 63766 183648 93281 183648 64068 183648 63690 183648 183648 183648 183648 71353 64068 183648 183648 183648 102334 97824 139630 183648 110843 183648 183648 183648 183648 183648 93281 183648 63766 183648 183648 183648 183648 183648 63766 67946 63766 69529 183648 183648 74700 183648 64292 183648 97584 64890 112995 125717 183648 183648 183648 65927 183648 183648 183648 64366 183648 183648 183648 68450 183648 102334 64068 183648 183648 183648 183648 183648 63690 183648 183648 183648 183648 183648 183648 183648 129534 92829 183648 183648 183648 183648 183648 183648 66719 63690 78014 183648 97719 183648 183648 74700 183648 183648 183648 102334 183648 183648 183648 183648 110843 80622 183648 183648 183648 102334 118096 183648 183648 183648 183648 208852 67073 183648 183648 125996 95715 66971 183648 183648 183648 97660 65663 98873 183648 183648 70581 183648 183648 183648 71353 183648 110843 183648 95715 66971 183648 67073 183648 97660 183648 183648 95715 183648 63690 183648 183648 63690 183648 99530 93281 183648 183648 183648 70654 183648 68437 183648 183648 183648 183648 74700 183648 183648 183648 63690 183648 183648 183648 93281 183648 183648 106973 75457 183648 63690 183648 183648 183648 183648 183648 183648 183648 183648 67438 63690 66087 76115 183648 183648 183648 183648 183648 183648 183648 63766 63766 183648 183648 63690 183648 80719 183648 183648 183648 183648 102013 183648 89431 183648 71932 183648 125996 183648 99901 213612 183648 183648 183648 183648 183648 213612 183648 63766 183648 183648 63766 67946 65113 183648 63690 183648 63690 183648 63766 183648 183648 63766 75094 67438 183648 63766 183648 63766 183648 183648 63856 79145 183648 183648 183648 64068 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 64649 183648 183648 183648 183648 183648 93281 63766 183648 183648 63689 67438 105276 118096 82908 93281 63766 72124 63702 68692 183648 183648 183648 88510
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# download train_data
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt --ngrams_path raw_data/word_ngrams.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --word_id_path raw_data/word_id_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --ngrams_id_path raw_data/word_ngrams_id.txt --ngrams_path raw_data/word_ngrams.txt --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
mv raw_data/word_ngrams.txt data/dict/
mv raw_data/word_ngrams_id.txt data/dict/
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --file_nums 24 --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import six
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
self.word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
self.id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
self.dict_size = len(self.word_to_id)
def computeSubwords(self, word):
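        # e.g. with min_n=3 and max_n=5, computeSubwords("<the>") yields the
        # character n-grams {"<th", "<the", "<the>", "the", "the>", "he>"} (as a list, order unspecified)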
ngrams = set()
for i in range(len(word) - self.min_n + 1):
for j in range(self.min_n, self.max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def native_to_unicode(self, s):
if self._is_unicode(s):
return s
try:
return self._to_unicode(s)
except UnicodeDecodeError:
res = self._to_unicode(s, ignore_errors=True)
return res
def _is_unicode(self, s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
"<" + word + ">"
if "<" + word + ">" in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
inputs = []
for item in features:
if item == "<UNK>":
inputs.append([self.word_to_id[item]])
else:
ngrams = self.computeSubwords(item)
res = []
res.append(self.word_to_id[item])
for _ in ngrams:
res.append(self.word_to_id[_])
inputs.append(res)
yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
return reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None, 1], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
true_word = fluid.data(
name='true_label', shape=[None, 1], lod_level=1, dtype='int64')
if self.with_shuffle_batch:
return [input_word, true_word]
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
init_width = 1.0 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
true_logits = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(neg_logits)[0], 1],
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def infer_net(self, inputs):
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(inputs[0], "emb", sequence_pool=True)
emb_b = embedding_layer(inputs[1], "emb", sequence_pool=True)
emb_c = embedding_layer(inputs[2], "emb", sequence_pool=True)
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
fluid.layers.assign(tmp1, global_right_cnt)
tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
fluid.layers.assign(tmp2, global_total_cnt)
acc = fluid.layers.elementwise_div(
global_right_cnt, global_total_cnt, name="total_acc")
self._infer_results['acc'] = acc
# -*- coding: utf-8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import random
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--build_dict_corpus_dir', type=str, help="The dir of corpus")
parser.add_argument(
'--input_corpus_dir', type=str, help="The dir of input corpus")
parser.add_argument(
'--output_corpus_dir', type=str, help="The dir of output corpus")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of dictionary ")
parser.add_argument(
'--word_id_path',
type=str,
default='./word_id',
help="The path of word_id ")
parser.add_argument(
'--ngrams_path',
type=str,
default='./word_ngrams',
help="The path of word_ngrams ")
parser.add_argument(
'--ngrams_id_path',
type=str,
default='./word_ngrams_id',
help="The path of word_ngrams_id ")
parser.add_argument(
'--min_count',
type=int,
default=5,
help="If the word count is less then min_count, it will be removed from dict"
)
parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
parser.add_argument(
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
default=0.001,
help="filter word by downsample")
parser.add_argument(
'--filter_corpus',
action='store_true',
default=False,
help='Filter corpus')
parser.add_argument(
'--build_dict',
action='store_true',
default=False,
help='Build dict from corpus')
parser.add_argument(
'--data_resplit',
action='store_true',
default=False,
help='re-split input corpus files')
return parser.parse_args()
def text_strip(text):
# English Preprocess Rule
return prog.sub("", text.lower())
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
if _is_unicode(s):
return s
try:
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
return res
def _is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(s, ignore_errors=False):
if _is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def filter_corpus(args):
"""
filter corpus and convert id.
"""
word_count = dict()
word_to_id_ = dict()
word_all_count = 0
id_counts = []
word_id = 0
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
word_count[word] = count
word_to_id_[word] = word_id
word_id += 1
id_counts.append(count)
word_all_count += count
word_ngrams = dict()
with io.open(args.ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
word, ngrams = line.rstrip().split(':')
ngrams = ngrams.split()
ngrams = [str(word_to_id_[_]) for _ in ngrams]
word_ngrams[word_to_id_[word]] = ' '.join(ngrams)
with io.open(args.ngrams_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_ngrams.items():
fid.write(u'{} {}\n'.format(k, v))
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(args.word_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
line = text_strip(line)
words = line.split()
write_line = ""
for item in words:
if item in word_count:
idx = word_to_id_[item]
else:
idx = word_to_id_[native_to_unicode('<UNK>')]
count_w = id_counts[idx]
corpus_size = word_all_count
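                        # word2vec-style subsampling: the more frequent a word,
                        # the smaller keep_prob and the more often it is dropped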
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
wf.write(_to_unicode(write_line))
def computeSubwords(word, min_n, max_n):
ngrams = set()
for i in range(len(word) - min_n + 1):
for j in range(min_n, max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def build_dict(args):
"""
    preprocess the data, generate dictionary and save into dict_path.
:param corpus_dir: the input data dir.
:param dict_path: the generated dict path. the data in dict is "word count"
:param min_count:
:return:
"""
# word to count
word_count = dict()
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
words = line.split()
for item in words:
item = '<' + item + '>'
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.min_count:
item_to_remove.append(item)
unk_sum = 0
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_ngrams = dict()
ngrams_count = dict()
for item in word_count:
ngrams = computeSubwords(item, args.min_n, args.max_n)
word_ngrams[item] = ngrams
for sub_word in ngrams:
if sub_word not in ngrams_count:
ngrams_count[sub_word] = 1
else:
ngrams_count[sub_word] = ngrams_count[sub_word] + 1
ngrams_count = sorted(
ngrams_count.items(), key=lambda ngrams_count: -ngrams_count[1])
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count:
f.write(k + " " + str(v) + '\n')
for k, v in ngrams_count:
f.write(k + " " + str(v) + '\n')
with io.open(args.ngrams_path, 'w+', encoding='utf-8') as f:
for key in word_ngrams:
f.write(key + ":")
f.write(" ".join(word_ngrams[key]))
f.write(u'\n')
def data_split(args):
raw_data_dir = args.input_corpus_dir
new_data_dir = args.output_corpus_dir
if not os.path.exists(new_data_dir):
os.mkdir(new_data_dir)
files = os.listdir(raw_data_dir)
print(files)
index = 0
contents = []
for file_ in files:
with open(os.path.join(raw_data_dir, file_), 'r') as f:
contents.extend(f.readlines())
num = int(args.file_nums)
    lines_per_file = len(contents) // num  # integer division: slice indices must be ints under Python 3
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
build_dict(args)
elif args.filter_corpus:
filter_corpus(args)
elif args.data_resplit:
data_split(args)
else:
print(
"error command line, please choose --build_dict or --filter_corpus")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env(
"dataset.dataset_train.word_ngrams_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.word_ngrams = dict()
with io.open(word_ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.rstrip().split()
                self.word_ngrams[str(line[0])] = list(
                    map(int, line[1:]))  # materialize the iterator so the n-grams can be reused (map is lazy in Python 3)
self.cs = None
if not self.with_shuffle_batch:
id_counts = []
word_all_count = 0
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
id_counts.append(count)
word_all_count += count
id_frequencys = [
float(count) / word_all_count for count in id_counts
]
np_power = np.power(np.array(id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
self.cs = np.array(id_frequencys_pow).cumsum()
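            # cumulative distribution of unigram frequencies raised to the 3/4
            # power; negative samples are drawn from it via searchsorted in generate_sample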
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
words: the words of the current line
idx: input word index
window_size: window size
"""
target_window = self.random_generator()
start_point = idx - target_window # if (idx - target_window) > 0 else 0
if start_point < 0:
start_point = 0
end_point = idx + target_window
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return targets
def generate_sample(self, line):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
input_word = [int(target_id)]
if target_id in self.word_ngrams:
input_word += self.word_ngrams[target_id]
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', input_word),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader
......@@ -11,46 +11,71 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.recall.gnn"
reader:
batch_size: 50
class: "{workspace}/evaluate_reader.py"
test_data_path: "{workspace}/data/test"
train:
trainer:
# for cluster training
strategy: "async"
# workspace
workspace: "paddlerec.models.recall.gnn"
epochs: 2
workspace: "paddlerec.models.recall.gnn"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 100
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 50
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
data_converter: "{workspace}/evaluate_reader.py"
reader:
batch_size: 100
class: "{workspace}/reader.py"
train_data_path: "{workspace}/data/train"
dataset_class: "DataLoader"
# hyper parameters of user-defined network
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.001
decay_steps: 3
decay_rate: 0.1
l2: 0.00001
sparse_feature_number: 43098
sparse_feature_dim: 100
corpus_size: 719470
gnn_propogation_steps: 1
model:
models: "{workspace}/model.py"
hyper_parameters:
use_DataLoader: True
config_path: "{workspace}/data/config.txt"
sparse_feature_dim: 100
gnn_propogation_steps: 1
learning_rate: 0.001
l2: 0.00001
decay_steps: 3
decay_rate: 0.1
optimizer: adam
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
fetch_period: 1
init_model_path: "increment/0" # load model path
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -17,7 +17,7 @@
set -e
echo "begin to download data"
cd raw_data && python download.py
cd data && python download.py
mkdir diginetica
python preprocess.py --dataset diginetica
......@@ -26,8 +26,10 @@ python convert_data.py --data_dir diginetica
cat diginetica/train.txt | wc -l >> diginetica/config.txt
mkdir train_data
mv diginetica/train.txt train_data
rm -rf train && mkdir train
mv diginetica/train.txt train
mkdir test_data
mv diginetica/test.txt test_data
rm -rf test && mkdir test
mv diginetica/test.txt test
mv diginetica/config.txt ./config.txt
......@@ -21,10 +21,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
self.input = []
self.length = None
......
......@@ -25,74 +25,65 @@ from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
self.init_config()
def init_config(self):
self._fetch_interval = 1
self.items_num, self.ins_num = self.config_read(
envs.get_global_env("hyper_parameters.config_path", None,
self._namespace))
self.train_batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.evaluate_batch_size = envs.get_global_env("batch_size", None,
"evaluate.reader")
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps", None, self._namespace)
def _init_hyper_parameters(self):
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
self.l2 = envs.get_global_env("hyper_parameters.optimizer.l2")
self.dict_size = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.corpus_size = envs.get_global_env("hyper_parameters.corpus_size")
def config_read(self, config_path=None):
if config_path is None:
raise ValueError(
"please set train.model.hyper_parameters.config_path at first")
with open(config_path, "r") as fin:
item_nums = int(fin.readline().strip())
ins_nums = int(fin.readline().strip())
return item_nums, ins_nums
self.train_batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.evaluate_batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size")
def input(self, bs):
self.items = fluid.data(
self.hidden_size = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.step = envs.get_global_env(
"hyper_parameters.gnn_propogation_steps")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
bs = self.evaluate_batch_size
else:
bs = self.train_batch_size
items = fluid.data(
name="items", shape=[bs, -1],
dtype="int64") # [batch_size, uniq_max]
self.seq_index = fluid.data(
seq_index = fluid.data(
name="seq_index", shape=[bs, -1, 2],
dtype="int32") # [batch_size, seq_max, 2]
self.last_index = fluid.data(
last_index = fluid.data(
name="last_index", shape=[bs, 2], dtype="int32") # [batch_size, 2]
self.adj_in = fluid.data(
adj_in = fluid.data(
name="adj_in", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.adj_out = fluid.data(
adj_out = fluid.data(
name="adj_out", shape=[bs, -1, -1],
dtype="float32") # [batch_size, seq_max, seq_max]
self.mask = fluid.data(
mask = fluid.data(
name="mask", shape=[bs, -1, 1],
dtype="float32") # [batch_size, seq_max, 1]
self.label = fluid.data(
label = fluid.data(
name="label", shape=[bs, 1], dtype="int64") # [batch_size, 1]
res = [
self.items, self.seq_index, self.last_index, self.adj_in,
self.adj_out, self.mask, self.label
]
res = [items, seq_index, last_index, adj_in, adj_out, mask, label]
return res
def train_input(self):
res = self.input(self.train_batch_size)
self._data_var = res
use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader",
False, self._namespace)
def net(self, inputs, is_infer=False):
if is_infer:
bs = self.evaluate_batch_size
else:
bs = self.train_batch_size
if self._platform != "LINUX" or use_dataloader:
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=256,
use_double_buffer=False,
iterable=False)
def net(self, items_num, hidden_size, step, bs):
stdv = 1.0 / math.sqrt(hidden_size)
stdv = 1.0 / math.sqrt(self.hidden_size)
def embedding_layer(input,
table_name,
......@@ -100,22 +91,22 @@ class Model(ModelBase):
initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[items_num, emb_dim],
size=[self.dict_size, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
name=table_name, initializer=initializer_instance))
return emb
sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
items_emb = embedding_layer(self.items, "emb", hidden_size,
items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
sparse_initializer)
pre_state = items_emb
for i in range(step):
for i in range(self.step):
pre_state = layers.reshape(
x=pre_state, shape=[bs, -1, hidden_size])
x=pre_state, shape=[bs, -1, self.hidden_size])
state_in = layers.fc(
input=pre_state,
name="state_in",
size=hidden_size,
size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
......@@ -127,7 +118,7 @@ class Model(ModelBase):
state_out = layers.fc(
input=pre_state,
name="state_out",
size=hidden_size,
size=self.hidden_size,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
......@@ -137,33 +128,34 @@ class Model(ModelBase):
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, uniq_max, h]
state_adj_in = layers.matmul(self.adj_in,
state_adj_in = layers.matmul(inputs[3],
state_in) # [batch_size, uniq_max, h]
state_adj_out = layers.matmul(
self.adj_out, state_out) # [batch_size, uniq_max, h]
inputs[4], state_out) # [batch_size, uniq_max, h]
gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
gru_input = layers.reshape(
x=gru_input, shape=[-1, hidden_size * 2])
x=gru_input, shape=[-1, self.hidden_size * 2])
gru_fc = layers.fc(input=gru_input,
name="gru_fc",
size=3 * hidden_size,
size=3 * self.hidden_size,
bias_attr=False)
pre_state, _, _ = fluid.layers.gru_unit(
input=gru_fc,
hidden=layers.reshape(
x=pre_state, shape=[-1, hidden_size]),
size=3 * hidden_size)
x=pre_state, shape=[-1, self.hidden_size]),
size=3 * self.hidden_size)
final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
seq = layers.gather_nd(final_state, self.seq_index)
last = layers.gather_nd(final_state, self.last_index)
final_state = layers.reshape(
pre_state, shape=[bs, -1, self.hidden_size])
seq = layers.gather_nd(final_state, inputs[1])
last = layers.gather_nd(final_state, inputs[2])
seq_fc = layers.fc(
input=seq,
name="seq_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=2,
......@@ -171,7 +163,7 @@ class Model(ModelBase):
low=-stdv, high=stdv))) # [batch_size, seq_max, h]
last_fc = layers.fc(input=last,
name="last_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
num_flatten_dims=1,
......@@ -184,7 +176,7 @@ class Model(ModelBase):
add = layers.elementwise_add(seq_fc_t,
last_fc) # [seq_max, batch_size, h]
b = layers.create_parameter(
shape=[hidden_size],
shape=[self.hidden_size],
dtype='float32',
default_initializer=fluid.initializer.Constant(value=0.0)) # [h]
add = layers.elementwise_add(add, b) # [seq_max, batch_size, h]
......@@ -202,7 +194,7 @@ class Model(ModelBase):
bias_attr=False,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv))) # [batch_size, seq_max, 1]
weight *= self.mask
weight *= inputs[5]
weight_mask = layers.elementwise_mul(
seq, weight, axis=0) # [batch_size, seq_max, h]
global_attention = layers.reduce_sum(
......@@ -213,7 +205,7 @@ class Model(ModelBase):
final_attention_fc = layers.fc(
input=final_attention,
name="final_attention_fc",
size=hidden_size,
size=self.hidden_size,
bias_attr=False,
act=None,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
......@@ -225,7 +217,7 @@ class Model(ModelBase):
# dtype="int64",
# persistable=True,
# name="all_vocab")
all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
all_vocab = fluid.layers.cast(
x=fluid.layers.assign(all_vocab), dtype='int64')
......@@ -235,63 +227,32 @@ class Model(ModelBase):
name="emb",
initializer=fluid.initializer.Uniform(
low=-stdv, high=stdv)),
size=[items_num, hidden_size]) # [all_vocab, h]
size=[self.dict_size, self.hidden_size]) # [all_vocab, h]
logits = layers.matmul(
x=final_attention_fc, y=all_emb,
transpose_y=True) # [batch_size, all_vocab]
softmax = layers.softmax_with_cross_entropy(
logits=logits, label=self.label) # [batch_size, 1]
logits=logits, label=inputs[6]) # [batch_size, 1]
self.loss = layers.reduce_mean(softmax) # [1]
self.acc = layers.accuracy(input=logits, label=self.label, k=20)
self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)
def avg_loss(self):
self._cost = self.loss
if is_infer:
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
return
def metrics(self):
self._metrics["LOSS"] = self.loss
self._metrics["train_acc"] = self.acc
def train_net(self):
self.train_input()
self.net(self.items_num, self.hidden_size, self.step,
self.train_batch_size)
self.avg_loss()
self.metrics()
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
step_per_epoch = self.ins_num // self.train_batch_size
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
l2 = envs.get_global_env("hyper_parameters.l2", None, self._namespace)
step_per_epoch = self.corpus_size // self.train_batch_size
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
decay_steps=decay_steps * step_per_epoch,
decay_rate=decay_rate),
learning_rate=self.learning_rate,
decay_steps=self.decay_steps * step_per_epoch,
decay_rate=self.decay_rate),
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=l2))
regularization_coeff=self.l2))
return optimizer
def infer_input(self):
self._reader_namespace = "evaluate.reader"
res = self.input(self.evaluate_batch_size)
self._infer_data_var = res
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
self.infer_input()
self.net(self.items_num, self.hidden_size, self.step,
self.evaluate_batch_size)
self._infer_results['acc'] = self.acc
self._infer_results['loss'] = self.loss
......@@ -23,9 +23,8 @@ from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
self.batch_size = envs.get_global_env("batch_size", None,
"train.reader")
self.batch_size = envs.get_global_env(
"dataset.dataset_train.batch_size")
self.input = []
self.length = None
......
......@@ -57,8 +57,8 @@
<img align="center" src="../../doc/imgs/gnn.png">
<p>
## Tutorial
### Training & Inference
## Tutorial (Quick Start)
###
```shell
python -m paddlerec.run -m paddlerec.models.recall.word2vec # word2vec
python -m paddlerec.run -m paddlerec.models.recall.ssr # ssr
......@@ -67,6 +67,40 @@ python -m paddlerec.run -m paddlerec.models.recall.gnn # gnn
python -m paddlerec.run -m paddlerec.models.recall.ncf # ncf
python -m paddlerec.run -m paddlerec.models.recall.youtube_dnn # youtube_dnn
```
## Tutorial (Reproducing the Paper Results)
To make it easy to run every model end to end, we ship sample data with each model and tune hyperparameters such as batch_size so that the training & test logs read well on the sample data. To reproduce the results reported in this README, adjust batch_size and the other hyperparameters according to the table below (a short config sketch after the table shows where these keys live), and use the provided scripts to download and preprocess the corresponding dataset.
| Model | batch_size | thread_num | epoch_num |
| :---: | :---: | :---: | :---: |
| Word2Vec | 100 | 5 | 5 |
| GNN | 100 | 1 | 30 |
| GRU4REC | 500 | 1 | 10 |
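The values in the table map onto different sections of each model's `config.yaml`. Below is a minimal sketch, not a complete config, using the GNN row as an example; the key names follow the config format shown earlier on this page and everything else is omitted:

```yaml
dataset:
  - name: dataset_train
    batch_size: 100        # batch_size from the table
    type: DataLoader
runner:
  - name: train_runner
    class: single_train
    epochs: 30             # epoch_num from the table
phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataset_train
    thread_num: 1          # thread_num from the table
```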
### Data Preparation
See the data download & preprocessing script under each model directory.
```bash
sh data_prepare.sh
```
### Training
```bash
cd models/recall/gnn # enter the directory of the chosen recall model, taking gnn as an example
python -m paddlerec.run -m ./config.yaml # after customizing hyperparameters, point to the config file to run with your own settings
```
### Inference
```
# In the model's config.yaml, set workspace to the absolute path of the current directory
# In the model's config.yaml, set mode to infer_runner
# example: mode: train_runner -> mode: infer_runner
# in infer_runner, set class: single_infer
# switch the phase section to the infer configuration (see the comments in config.yaml)
# after editing config.yaml, run:
python -m paddlerec.run -m ./config.yaml # taking gnn as an example
```
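For reference, a minimal sketch of the entries touched by this switch is shown below; it assumes the default checkpoint path `increment/0` produced by the training run, and only the relevant keys are listed:

```yaml
mode: infer_runner
runner:
  - name: infer_runner
    class: single_infer
    device: cpu
    epochs: 1
    init_model_path: "increment/0"   # checkpoint saved by the training run
phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataset_infer      # point the phase at the infer dataset
    thread_num: 1
```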
## Results Comparison
### Model Results
......
......@@ -11,51 +11,70 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
evaluate:
workspace: "paddlerec.models.recall.word2vec"
workspace: "paddlerec.models.recall.word2vec"
evaluate_only: False
evaluate_model_path: ""
reader:
batch_size: 50
class: "{workspace}/w2v_evaluate_reader.py"
test_data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 100
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
data_converter: "{workspace}/w2v_reader.py"
- name: dataset_infer # name
batch_size: 50
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/w2v_evaluate_reader.py"
train:
trainer:
# for cluster training
strategy: "async"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 354051
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
workspace: "paddlerec.models.recall.word2vec"
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
reader:
batch_size: 100
class: "{workspace}/w2v_reader.py"
train_data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
model:
models: "{workspace}/model.py"
hyper_parameters:
sparse_feature_number: 85
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
optimizer: sgd
save:
increment:
dirname: "increment"
epoch_interval: 1
save_last: True
inference:
dirname: "inference"
epoch_interval: 1
save_last: True
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
......@@ -22,16 +22,17 @@ tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/test_build_dict
python preprocess.py --filter_corpus --dict_path raw_data/test_build_dict --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
mkdir thirdparty
mv raw_data/test_build_dict thirdparty/
mv raw_data/test_build_dict_word_to_id_ thirdparty/
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=train_data
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir test_data/
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
......@@ -23,45 +23,50 @@ class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def input(self):
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
self.input_word = fluid.data(
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None], dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None], dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None], dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], dtype='int64')
self.true_word = fluid.data(
true_word = fluid.data(
name='true_label', shape=[None, 1], dtype='int64')
self._data_var.append(self.input_word)
self._data_var.append(self.true_word)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
if not with_shuffle_batch:
self.neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
self._data_var.append(self.neg_word)
if self.with_shuffle_batch:
return [input_word, true_word]
if self._platform != "LINUX":
self._data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self):
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
neg_num = int(
envs.get_global_env("hyper_parameters.neg_num", None,
self._namespace))
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
with_shuffle_batch = bool(
int(
envs.get_global_env("hyper_parameters.with_shuffle_batch",
None, self._namespace)))
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
......@@ -71,8 +76,8 @@ class Model(ModelBase):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=is_distributed,
size=[sparse_feature_number, emb_dim],
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, emb_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if squeeze:
......@@ -80,44 +85,44 @@ class Model(ModelBase):
else:
return emb
init_width = 0.5 / sparse_feature_dim
init_width = 0.5 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(self.input_word, "emb", sparse_feature_dim,
input_emb = embedding_layer(inputs[0], "emb", self.sparse_feature_dim,
emb_initializer, True)
true_emb_w = embedding_layer(self.true_word, "emb_w",
sparse_feature_dim, emb_w_initializer,
True)
true_emb_b = embedding_layer(self.true_word, "emb_b", 1,
true_emb_w = embedding_layer(inputs[1], "emb_w",
self.sparse_feature_dim,
emb_w_initializer, True)
true_emb_b = embedding_layer(inputs[1], "emb_b", 1, emb_w_initializer,
True)
if with_shuffle_batch:
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(neg_num):
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat, shape=[-1, neg_num, sparse_feature_dim])
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
neg_emb_b_list = []
for i in range(neg_num):
for i in range(self.neg_num):
neg_emb_b_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_b)) # shuffle true_word
neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
neg_emb_b, shape=[-1, self.neg_num])
else:
neg_emb_w = embedding_layer(self.neg_word, "emb_w",
sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(self.neg_word, "emb_b", 1,
neg_emb_w = embedding_layer(
inputs[2], "emb_w", self.sparse_feature_dim, emb_w_initializer)
neg_emb_b = embedding_layer(inputs[2], "emb_b", 1,
emb_w_initializer)
neg_emb_b_vec = fluid.layers.reshape(
neg_emb_b, shape=[-1, neg_num])
neg_emb_b, shape=[-1, self.neg_num])
true_logits = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
......@@ -127,18 +132,22 @@ class Model(ModelBase):
true_emb_b)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, sparse_feature_dim])
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.elementwise_add(
fluid.layers.reshape(
neg_matmul, shape=[-1, neg_num]),
neg_emb_b_vec)
label_ones = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, 1], value=1.0, dtype='float32')
label_zeros = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
neg_matmul_re = fluid.layers.reshape(
neg_matmul, shape=[-1, self.neg_num])
neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
#nce loss
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], self.neg_num],
value=0.0,
dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
label_ones)
......@@ -149,7 +158,9 @@ class Model(ModelBase):
true_xent, dim=1),
fluid.layers.reduce_sum(
neg_xent, dim=1))
self.avg_cost = fluid.layers.reduce_mean(cost)
avg_cost = fluid.layers.reduce_mean(cost)
self._cost = avg_cost
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
......@@ -164,77 +175,33 @@ class Model(ModelBase):
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
def avg_loss(self):
self._cost = self.avg_cost
def metrics(self):
self._metrics["LOSS"] = self.avg_cost
def train_net(self):
self.input()
self.net()
self.avg_loss()
self.metrics()
self._metrics["LOSS"] = avg_cost
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.learning_rate",
None, self._namespace)
decay_steps = envs.get_global_env("hyper_parameters.decay_steps", None,
self._namespace)
decay_rate = envs.get_global_env("hyper_parameters.decay_rate", None,
self._namespace)
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=learning_rate,
decay_steps=decay_steps,
decay_rate=decay_rate,
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def analogy_input(self):
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
self.analogy_a = fluid.data(
name="analogy_a", shape=[None], dtype='int64')
self.analogy_b = fluid.data(
name="analogy_b", shape=[None], dtype='int64')
self.analogy_c = fluid.data(
name="analogy_c", shape=[None], dtype='int64')
self.analogy_d = fluid.data(
name="analogy_d", shape=[None], dtype='int64')
self._infer_data_var = [
self.analogy_a, self.analogy_b, self.analogy_c, self.analogy_d
]
self._infer_data_loader = fluid.io.DataLoader.from_generator(
feed_list=self._infer_data_var,
capacity=64,
use_double_buffer=False,
iterable=False)
def infer_net(self):
sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim", None, self._namespace)
sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number", None, self._namespace)
def infer_net(self, inputs):
def embedding_layer(input, table_name, initializer_instance=None):
emb = fluid.embedding(
input=input,
size=[sparse_feature_number, sparse_feature_dim],
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
return emb
self.analogy_input()
all_label = np.arange(sparse_feature_number).reshape(
sparse_feature_number).astype('int32')
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(self.analogy_a, "emb")
emb_b = embedding_layer(self.analogy_b, "emb")
emb_c = embedding_layer(self.analogy_c, "emb")
emb_a = embedding_layer(inputs[0], "emb")
emb_b = embedding_layer(inputs[1], "emb")
emb_c = embedding_layer(inputs[2], "emb")
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
......@@ -245,8 +212,7 @@ class Model(ModelBase):
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(
fluid.layers.unsqueeze(
self.analogy_d, axes=[1]),
expand_times=[1, 4])
inputs[3], axes=[1]), expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
......
......@@ -162,7 +162,7 @@ def filter_corpus(args):
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += ","
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
......
......@@ -20,10 +20,10 @@ from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class EvaluateReader(Reader):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_id_dict_path", None,
"evaluate.reader")
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
......@@ -75,6 +75,8 @@ class EvaluateReader(Reader):
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
yield [('analogy_a', [self.word_to_id[features[0]]]),
......
......@@ -40,14 +40,12 @@ class NumpyRandomInt(object):
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env("word_count_dict_path", None,
"train.reader")
self.window_size = envs.get_global_env("hyper_parameters.window_size",
None, "train.model")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num", None,
"train.model")
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch", None, "train.model")
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.cs = None
......
......@@ -63,7 +63,7 @@ def build(dirname):
models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*',
'data/sample_data/infer/*'
'data/sample_data/infer/*', 'data/*/*.csv'
]
engine_copy = ['*/*.sh']
......