Unverified commit 3d0cffc3 authored by wuzhihua, committed by GitHub

Merge pull request #28 from 123malin/fasttext

add Fasttext
......@@ -228,10 +228,12 @@ class SingleTrainer(TranspileTrainer):
scope = self._model[model_name][2]
program = self._model[model_name][0]
reader = self._dataset[reader_name]
threads = model_dict.get("thread_num", 1)
with fluid.scope_guard(scope):
self._exe.train_from_dataset(
program=program,
dataset=reader,
thread=threads,
fetch_list=fetch_vars,
fetch_info=fetch_alias,
print_period=fetch_period)
......@@ -241,8 +243,23 @@ class SingleTrainer(TranspileTrainer):
model_name = model_dict["name"]
model_class = self._model[model_name][3]
program = self._model[model_name][0].clone()
_build_strategy = fluid.BuildStrategy()
_exe_strategy = fluid.ExecutionStrategy()
# 0: kCoeffNumDevice; 1: One; 2: Customized
_build_strategy.gradient_scale_strategy = model_dict.get(
"gradient_scale_strategy", 0)
if "thread_num" in model_dict and model_dict["thread_num"] > 1:
_build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
_exe_strategy.num_threads = model_dict["thread_num"]
os.environ['CPU_NUM'] = str(_exe_strategy.num_threads)
program = fluid.compiler.CompiledProgram(program).with_data_parallel(
loss_name=model_class.get_avg_cost().name)
loss_name=model_class.get_avg_cost().name,
build_strategy=_build_strategy,
exec_strategy=_exe_strategy)
fetch_vars = []
fetch_alias = []
fetch_period = int(
......
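# Standalone sketch (not part of this commit) of the Paddle 1.x fluid APIs the
# hunk above wires together: a BuildStrategy with an explicit gradient scale
# strategy plus an ExecutionStrategy thread count, fed into a data-parallel
# CompiledProgram. `train_program` and `loss` are hypothetical placeholders.
import os
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Same mapping as the diff comment: 0 -> CoeffNumDevice, 1 -> One, 2 -> Customized
build_strategy.gradient_scale_strategy = \
    fluid.BuildStrategy.GradientScaleStrategy.One
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 4
os.environ['CPU_NUM'] = str(exec_strategy.num_threads)

# compiled_prog = fluid.CompiledProgram(train_program).with_data_parallel(
#     loss_name=loss.name,
#     build_strategy=build_strategy,
#     exec_strategy=exec_strategy)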
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.fasttext"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
word_ngrams_path: "{workspace}/data/dict/word_ngrams_id.txt"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/evaluate_reader.py"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 227915
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
min_n: 3
max_n: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
gradient_scale_strategy: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
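# Sketch (not part of config.yaml above): at run time the values under
# hyper_parameters are read through paddlerec's envs helper, exactly as the
# model and reader code later in this diff does; outside a paddlerec run that
# has loaded this YAML these calls will not return the configured values.
from paddlerec.core.utils import envs

neg_num = envs.get_global_env("hyper_parameters.neg_num")  # 5 in this config
min_n = envs.get_global_env("hyper_parameters.min_n")      # 3
max_n = envs.get_global_env("hyper_parameters.max_n")      # 5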
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain
Athens Greece Moscow Russia
Athens Greece Oslo Norway
Athens Greece Ottawa Canada
Athens Greece Paris France
Athens Greece Rome Italy
Athens Greece Stockholm Sweden
Athens Greece Tehran Iran
Athens Greece Tokyo Japan
Baghdad Iraq Bangkok Thailand
Baghdad Iraq Beijing China
Baghdad Iraq Berlin Germany
Baghdad Iraq Bern Switzerland
Baghdad Iraq Cairo Egypt
Baghdad Iraq Canberra Australia
Baghdad Iraq Hanoi Vietnam
Baghdad Iraq Havana Cuba
Baghdad Iraq Helsinki Finland
Baghdad Iraq Islamabad Pakistan
Baghdad Iraq Kabul Afghanistan
Baghdad Iraq London England
Baghdad Iraq Madrid Spain
Baghdad Iraq Moscow Russia
Baghdad Iraq Oslo Norway
Baghdad Iraq Ottawa Canada
Baghdad Iraq Paris France
Baghdad Iraq Rome Italy
Baghdad Iraq Stockholm Sweden
Baghdad Iraq Tehran Iran
Baghdad Iraq Tokyo Japan
Baghdad Iraq Athens Greece
Bangkok Thailand Beijing China
Bangkok Thailand Berlin Germany
Bangkok Thailand Bern Switzerland
Bangkok Thailand Cairo Egypt
Bangkok Thailand Canberra Australia
Bangkok Thailand Hanoi Vietnam
Bangkok Thailand Havana Cuba
Bangkok Thailand Helsinki Finland
Bangkok Thailand Islamabad Pakistan
Bangkok Thailand Kabul Afghanistan
Bangkok Thailand London England
Bangkok Thailand Madrid Spain
Bangkok Thailand Moscow Russia
Bangkok Thailand Oslo Norway
Bangkok Thailand Ottawa Canada
Bangkok Thailand Paris France
Bangkok Thailand Rome Italy
Bangkok Thailand Stockholm Sweden
Bangkok Thailand Tehran Iran
Bangkok Thailand Tokyo Japan
Bangkok Thailand Athens Greece
Bangkok Thailand Baghdad Iraq
Beijing China Berlin Germany
Beijing China Bern Switzerland
Beijing China Cairo Egypt
Beijing China Canberra Australia
Beijing China Hanoi Vietnam
Beijing China Havana Cuba
Beijing China Helsinki Finland
Beijing China Islamabad Pakistan
Beijing China Kabul Afghanistan
Beijing China London England
Beijing China Madrid Spain
Beijing China Moscow Russia
Beijing China Oslo Norway
Beijing China Ottawa Canada
Beijing China Paris France
Beijing China Rome Italy
Beijing China Stockholm Sweden
Beijing China Tehran Iran
Beijing China Tokyo Japan
Beijing China Athens Greece
Beijing China Baghdad Iraq
Beijing China Bangkok Thailand
Berlin Germany Bern Switzerland
Berlin Germany Cairo Egypt
Berlin Germany Canberra Australia
Berlin Germany Hanoi Vietnam
Berlin Germany Havana Cuba
Berlin Germany Helsinki Finland
Berlin Germany Islamabad Pakistan
Berlin Germany Kabul Afghanistan
Berlin Germany London England
Berlin Germany Madrid Spain
Berlin Germany Moscow Russia
Berlin Germany Oslo Norway
Berlin Germany Ottawa Canada
Berlin Germany Paris France
Berlin Germany Rome Italy
Berlin Germany Stockholm Sweden
Berlin Germany Tehran Iran
Berlin Germany Tokyo Japan
Berlin Germany Athens Greece
Berlin Germany Baghdad Iraq
Berlin Germany Bangkok Thailand
Berlin Germany Beijing China
Bern Switzerland Cairo Egypt
Bern Switzerland Canberra Australia
Bern Switzerland Hanoi Vietnam
Bern Switzerland Havana Cuba
Bern Switzerland Helsinki Finland
Bern Switzerland Islamabad Pakistan
Bern Switzerland Kabul Afghanistan
Bern Switzerland London England
Bern Switzerland Madrid Spain
Bern Switzerland Moscow Russia
Bern Switzerland Oslo Norway
Bern Switzerland Ottawa Canada
Bern Switzerland Paris France
Bern Switzerland Rome Italy
Bern Switzerland Stockholm Sweden
Bern Switzerland Tehran Iran
Bern Switzerland Tokyo Japan
Bern Switzerland Athens Greece
Bern Switzerland Baghdad Iraq
Bern Switzerland Bangkok Thailand
Bern Switzerland Beijing China
Bern Switzerland Berlin Germany
Cairo Egypt Canberra Australia
Cairo Egypt Hanoi Vietnam
Cairo Egypt Havana Cuba
Cairo Egypt Helsinki Finland
Cairo Egypt Islamabad Pakistan
Cairo Egypt Kabul Afghanistan
Cairo Egypt London England
Cairo Egypt Madrid Spain
Cairo Egypt Moscow Russia
Cairo Egypt Oslo Norway
Cairo Egypt Ottawa Canada
Cairo Egypt Paris France
Cairo Egypt Rome Italy
Cairo Egypt Stockholm Sweden
Cairo Egypt Tehran Iran
Cairo Egypt Tokyo Japan
Cairo Egypt Athens Greece
Cairo Egypt Baghdad Iraq
Cairo Egypt Bangkok Thailand
Cairo Egypt Beijing China
Cairo Egypt Berlin Germany
Cairo Egypt Bern Switzerland
Canberra Australia Hanoi Vietnam
Canberra Australia Havana Cuba
Canberra Australia Helsinki Finland
Canberra Australia Islamabad Pakistan
Canberra Australia Kabul Afghanistan
Canberra Australia London England
Canberra Australia Madrid Spain
Canberra Australia Moscow Russia
Canberra Australia Oslo Norway
Canberra Australia Ottawa Canada
Canberra Australia Paris France
Canberra Australia Rome Italy
Canberra Australia Stockholm Sweden
Canberra Australia Tehran Iran
Canberra Australia Tokyo Japan
Canberra Australia Athens Greece
Canberra Australia Baghdad Iraq
Canberra Australia Bangkok Thailand
Canberra Australia Beijing China
Canberra Australia Berlin Germany
Canberra Australia Bern Switzerland
Canberra Australia Cairo Egypt
Hanoi Vietnam Havana Cuba
Hanoi Vietnam Helsinki Finland
Hanoi Vietnam Islamabad Pakistan
Hanoi Vietnam Kabul Afghanistan
Hanoi Vietnam London England
Hanoi Vietnam Madrid Spain
Hanoi Vietnam Moscow Russia
Hanoi Vietnam Oslo Norway
Hanoi Vietnam Ottawa Canada
Hanoi Vietnam Paris France
Hanoi Vietnam Rome Italy
Hanoi Vietnam Stockholm Sweden
Hanoi Vietnam Tehran Iran
Hanoi Vietnam Tokyo Japan
Hanoi Vietnam Athens Greece
Hanoi Vietnam Baghdad Iraq
Hanoi Vietnam Bangkok Thailand
Hanoi Vietnam Beijing China
Hanoi Vietnam Berlin Germany
Hanoi Vietnam Bern Switzerland
Hanoi Vietnam Cairo Egypt
Hanoi Vietnam Canberra Australia
Havana Cuba Helsinki Finland
Havana Cuba Islamabad Pakistan
183648 183648 183648 183648 65918 183648 94834 93002 71149 183648 89518 183648 68731 183648 183648 63766 183648 183648 63766 183648 183648 63690 63766 83291 183648 183648 63766 183648 183648 183648 63766 65918 183648 73068 71149 183648 183648 183648 65370 183648 183648 67665 63945 93281 71149 183648 139630 183648 183648 63766 183648 183648 183648 183648 65062 110843 175871 94491 183648 183648 183648 183648 89277 183648 78014 183648 183648 63766 69529 183648 183648 183648 102013 63766 139449 183648 183648 113280 87363 64479 183648 183648 183648 183648 183648 183648 63766 183648 93281 183648 64068 183648 63690 183648 183648 183648 183648 71353 64068 183648 183648 183648 102334 97824 139630 183648 110843 183648 183648 183648 183648 183648 93281 183648 63766 183648 183648 183648 183648 183648 63766 67946 63766 69529 183648 183648 74700 183648 64292 183648 97584 64890 112995 125717 183648 183648 183648 65927 183648 183648 183648 64366 183648 183648 183648 68450 183648 102334 64068 183648 183648 183648 183648 183648 63690 183648 183648 183648 183648 183648 183648 183648 129534 92829 183648 183648 183648 183648 183648 183648 66719 63690 78014 183648 97719 183648 183648 74700 183648 183648 183648 102334 183648 183648 183648 183648 110843 80622 183648 183648 183648 102334 118096 183648 183648 183648 183648 208852 67073 183648 183648 125996 95715 66971 183648 183648 183648 97660 65663 98873 183648 183648 70581 183648 183648 183648 71353 183648 110843 183648 95715 66971 183648 67073 183648 97660 183648 183648 95715 183648 63690 183648 183648 63690 183648 99530 93281 183648 183648 183648 70654 183648 68437 183648 183648 183648 183648 74700 183648 183648 183648 63690 183648 183648 183648 93281 183648 183648 106973 75457 183648 63690 183648 183648 183648 183648 183648 183648 183648 183648 67438 63690 66087 76115 183648 183648 183648 183648 183648 183648 183648 63766 63766 183648 183648 63690 183648 80719 183648 183648 183648 183648 102013 183648 89431 183648 71932 183648 125996 183648 99901 213612 183648 183648 183648 183648 183648 213612 183648 63766 183648 183648 63766 67946 65113 183648 63690 183648 63690 183648 63766 183648 183648 63766 75094 67438 183648 63766 183648 63766 183648 183648 63856 79145 183648 183648 183648 64068 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 64649 183648 183648 183648 183648 183648 93281 63766 183648 183648 63689 67438 105276 118096 82908 93281 63766 72124 63702 68692 183648 183648 183648 88510
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# download train_data
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt --ngrams_path raw_data/word_ngrams.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --word_id_path raw_data/word_id_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --ngrams_id_path raw_data/word_ngrams_id.txt --ngrams_path raw_data/word_ngrams.txt --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
mv raw_data/word_ngrams.txt data/dict/
mv raw_data/word_ngrams_id.txt data/dict/
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --file_nums 24 --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
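# After this script finishes, data/dict/ holds word_count_dict.txt, word_id_dict.txt,
# word_ngrams.txt and word_ngrams_id.txt, data/train/ holds the 24 re-split id files
# (part_1 ... part_24) and data/test/ holds the analogy test set, matching the paths
# referenced in config.yaml.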
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import six
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
self.word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
self.id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
self.dict_size = len(self.word_to_id)
def computeSubwords(self, word):
ngrams = set()
for i in range(len(word) - self.min_n + 1):
for j in range(self.min_n, self.max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def native_to_unicode(self, s):
if self._is_unicode(s):
return s
try:
return self._to_unicode(s)
except UnicodeDecodeError:
res = self._to_unicode(s, ignore_errors=True)
return res
def _is_unicode(self, s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
"<" + word + ">"
if "<" + word + ">" in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
inputs = []
for item in features:
if item == "<UNK>":
inputs.append([self.word_to_id[item]])
else:
ngrams = self.computeSubwords(item)
res = []
res.append(self.word_to_id[item])
for _ in ngrams:
res.append(self.word_to_id[_])
inputs.append(res)
yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
return reader
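# Standalone illustration (not part of evaluate_reader.py) of the character
# n-gram extraction that computeSubwords above performs, with min_n=3, max_n=5.
def _demo_subwords(word, min_n=3, max_n=5):
    ngrams = set()
    for i in range(len(word) - min_n + 1):
        for j in range(min_n, max_n + 1):
            ngrams.add(word[i:min(len(word), i + j)])
    return ngrams

# The bracketed token "<cat>" yields every n-gram of length 3..5 that fits:
assert _demo_subwords("<cat>") == {"<ca", "<cat", "<cat>", "cat", "cat>", "at>"}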
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None, 1], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
true_word = fluid.data(
name='true_label', shape=[None, 1], lod_level=1, dtype='int64')
if self.with_shuffle_batch:
return [input_word, true_word]
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
init_width = 1.0 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
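        # The positive logit below is the dot product of the pooled centre-word
        # embedding (the word plus its subword n-grams, averaged) with the true
        # context embedding; the matmul afterwards scores the same centre word
        # against every negative sample, and both go into a sigmoid log loss.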
true_logits = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(neg_logits)[0], 1],
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def infer_net(self, inputs):
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(inputs[0], "emb", sequence_pool=True)
emb_b = embedding_layer(inputs[1], "emb", sequence_pool=True)
emb_c = embedding_layer(inputs[2], "emb", sequence_pool=True)
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
fluid.layers.assign(tmp1, global_right_cnt)
tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
fluid.layers.assign(tmp2, global_total_cnt)
acc = fluid.layers.elementwise_div(
global_right_cnt, global_total_cnt, name="total_acc")
self._infer_results['acc'] = acc
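# Plain-numpy sketch (not part of model.py) of the analogy scoring infer_net
# performs above: predict d from a : b :: c : d as emb(b) - emb(a) + emb(c),
# score it against L2-normalised embeddings of the whole vocabulary and take
# the top-4 ids. The sizes below are toy values, not the real config.
import numpy as np

rng = np.random.RandomState(0)
vocab, dim = 100, 8
emb = rng.normal(size=(vocab, dim))
a, b, c = emb[1], emb[2], emb[3]
target = b - a + c
emb_norm = emb / np.linalg.norm(emb, axis=1, keepdims=True)
scores = emb_norm.dot(target)        # matmul(target, emb_all_label_l2^T)
top4 = np.argsort(-scores)[:4]       # fluid.layers.topk(dist, k=4)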
# -*- coding: utf-8 -*
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import random
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--build_dict_corpus_dir', type=str, help="The dir of corpus")
parser.add_argument(
'--input_corpus_dir', type=str, help="The dir of input corpus")
parser.add_argument(
'--output_corpus_dir', type=str, help="The dir of output corpus")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of dictionary ")
parser.add_argument(
'--word_id_path',
type=str,
default='./word_id',
help="The path of word_id ")
parser.add_argument(
'--ngrams_path',
type=str,
default='./word_ngrams',
help="The path of word_ngrams ")
parser.add_argument(
'--ngrams_id_path',
type=str,
default='./word_ngrams_id',
help="The path of word_ngrams_id ")
parser.add_argument(
'--min_count',
type=int,
default=5,
help="If the word count is less then min_count, it will be removed from dict"
)
parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
parser.add_argument(
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
default=0.001,
help="filter word by downsample")
parser.add_argument(
'--filter_corpus',
action='store_true',
default=False,
help='Filter corpus')
parser.add_argument(
'--build_dict',
action='store_true',
default=False,
help='Build dict from corpus')
parser.add_argument(
'--data_resplit',
action='store_true',
default=False,
help='re-split input corpus files')
return parser.parse_args()
def text_strip(text):
# English Preprocess Rule
return prog.sub("", text.lower())
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
if _is_unicode(s):
return s
try:
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
return res
def _is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(s, ignore_errors=False):
if _is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def filter_corpus(args):
"""
filter corpus and convert id.
"""
word_count = dict()
word_to_id_ = dict()
word_all_count = 0
id_counts = []
word_id = 0
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
word_count[word] = count
word_to_id_[word] = word_id
word_id += 1
id_counts.append(count)
word_all_count += count
word_ngrams = dict()
with io.open(args.ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
word, ngrams = line.rstrip().split(':')
ngrams = ngrams.split()
ngrams = [str(word_to_id_[_]) for _ in ngrams]
word_ngrams[word_to_id_[word]] = ' '.join(ngrams)
with io.open(args.ngrams_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_ngrams.items():
fid.write(u'{} {}\n'.format(k, v))
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(args.word_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
line = text_strip(line)
words = line.split()
write_line = ""
for item in words:
if item in word_count:
idx = word_to_id_[item]
else:
idx = word_to_id_[native_to_unicode('<UNK>')]
count_w = id_counts[idx]
corpus_size = word_all_count
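                        # word2vec-style subsampling: with t = args.downsample and
                        # f = count_w / corpus_size, keep_prob = (sqrt(f/t) + 1) * t / f,
                        # so very frequent words are randomly dropped from the corpus.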
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
wf.write(_to_unicode(write_line))
def computeSubwords(word, min_n, max_n):
ngrams = set()
for i in range(len(word) - min_n + 1):
for j in range(min_n, max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def build_dict(args):
"""
    preprocess the data, generate the dictionary and save it to dict_path.
:param corpus_dir: the input data dir.
:param dict_path: the generated dict path. the data in dict is "word count"
    :param min_count: words whose count is no more than min_count are merged into <UNK>.
:return:
"""
# word to count
word_count = dict()
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
words = line.split()
for item in words:
item = '<' + item + '>'
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.min_count:
item_to_remove.append(item)
unk_sum = 0
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_ngrams = dict()
ngrams_count = dict()
for item in word_count:
ngrams = computeSubwords(item, args.min_n, args.max_n)
word_ngrams[item] = ngrams
for sub_word in ngrams:
if sub_word not in ngrams_count:
ngrams_count[sub_word] = 1
else:
ngrams_count[sub_word] = ngrams_count[sub_word] + 1
ngrams_count = sorted(
ngrams_count.items(), key=lambda ngrams_count: -ngrams_count[1])
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count:
f.write(k + " " + str(v) + '\n')
for k, v in ngrams_count:
f.write(k + " " + str(v) + '\n')
with io.open(args.ngrams_path, 'w+', encoding='utf-8') as f:
for key in word_ngrams:
f.write(key + ":")
f.write(" ".join(word_ngrams[key]))
f.write(u'\n')
def data_split(args):
raw_data_dir = args.input_corpus_dir
new_data_dir = args.output_corpus_dir
if not os.path.exists(new_data_dir):
os.mkdir(new_data_dir)
files = os.listdir(raw_data_dir)
print(files)
index = 0
contents = []
for file_ in files:
with open(os.path.join(raw_data_dir, file_), 'r') as f:
contents.extend(f.readlines())
num = int(args.file_nums)
    lines_per_file = len(contents) // num  # integer division so the slice bounds below are ints (Python 3)
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
build_dict(args)
elif args.filter_corpus:
filter_corpus(args)
elif args.data_resplit:
data_split(args)
else:
print(
"error command line, please choose --build_dict or --filter_corpus")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
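# NumpyRandomInt below draws window sizes from a buffer of pre-generated random
# integers in [a, b] (inclusive), refilling the buffer only when it is exhausted,
# so the numpy RNG is not called once per training sample.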
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env(
"dataset.dataset_train.word_ngrams_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.word_ngrams = dict()
with io.open(word_ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.rstrip().split()
                self.word_ngrams[str(line[0])] = list(map(int, line[1:]))
self.cs = None
if not self.with_shuffle_batch:
id_counts = []
word_all_count = 0
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
id_counts.append(count)
word_all_count += count
id_frequencys = [
float(count) / word_all_count for count in id_counts
]
np_power = np.power(np.array(id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
self.cs = np.array(id_frequencys_pow).cumsum()
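            # self.cs is the cumulative sum of the unigram distribution raised to
            # the 0.75 power (word2vec's negative-sampling distribution); in
            # generate_sample, searchsorted over uniform samples maps it to word ids.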
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
words: the words of the current line
idx: input word index
window_size: window size
"""
target_window = self.random_generator()
start_point = idx - target_window # if (idx - target_window) > 0 else 0
if start_point < 0:
start_point = 0
end_point = idx + target_window
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return targets
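    # Example: if the sampled window width is 2 and idx = 3 in
    # ["5", "8", "2", "7", "9", "4"], get_context_words returns
    # ["8", "2", "9", "4"], i.e. up to two ids on each side of the centre word.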
def generate_sample(self, line):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
input_word = [int(target_id)]
if target_id in self.word_ngrams:
input_word += self.word_ngrams[target_id]
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', input_word),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader