add fasttext

03353075 · malin10 · 1125b796 · 03353075 · 03353075 · 03353075
7 changed file
--- a/models/recall/fasttext/__init__.py
+++ b/models/recall/fasttext/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/models/recall/fasttext/config.yaml
+++ b/models/recall/fasttext/config.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#workspace: "paddlerec.models.recall.fasttext" 
+workspace: "/home/malin10/code/paddlerec/models/recall/fasttext"
+
+# list of dataset
+dataset:
+- name: dataset_train # name of dataset to distinguish different datasets
+  batch_size: 10
+  type: DataLoader # or QueueDataset 
+  data_path: "{workspace}/data/train"
+  word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
+  word_ngrams_path: "{workspace}/data/dict/word_ngrams_id.txt"
+  data_converter: "{workspace}/reader.py"
+- name: dataset_infer # name
+  batch_size: 10
+  type: DataLoader # or QueueDataset
+  data_path: "{workspace}/data/test"
+  word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
+  data_converter: "{workspace}/evaluate_reader.py"
+
+hyper_parameters:
+  optimizer:
+    learning_rate: 1.0
+    decay_steps: 100000
+    decay_rate: 0.999
+    class: sgd
+    strategy: async
+  sparse_feature_number: 227915
+  sparse_feature_dim: 300
+  with_shuffle_batch: False
+  neg_num: 5
+  window_size: 5
+  min_n: 3
+  max_n: 5
+
+# select runner by name
+mode: train_runner
+# config of each runner.
+# runner is a kind of paddle training class, which wraps the train/infer process.
+runner:
+- name: train_runner
+  class: single_train
+  # num of epochs
+  epochs: 2
+  # device to run training or infer
+  device: cpu
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_inference_interval: 1 # save inference
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
+  save_inference_feed_varnames: [] # feed vars of save inference
+  save_inference_fetch_varnames: [] # fetch vars of save inference
+  init_model_path: "" # load model path
+  fetch_period: 10
+- name: infer_runner
+  class: single_infer
+  # num of epochs
+  epochs: 1
+  # device to run training or infer
+  device: cpu
+  init_model_path: "increment/0" # load model path
+
+# runner will run all the phase in each epoch
+phase:
+- name: phase1
+  model: "{workspace}/model.py" # user-defined model
+  dataset_name: dataset_train # select dataset by name
+  thread_num: 1
+#- name: phase2
+#  model: "{workspace}/model.py" # user-defined model
+#  dataset_name: dataset_infer # select dataset by name
+#  thread_num: 1
--- a/models/recall/fasttext/data_prepare.sh
+++ b/models/recall/fasttext/data_prepare.sh
+#! /bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# download train_data
+mkdir raw_data
+wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
+tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
+mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
+
+# preprocess data
+python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt --ngrams_path raw_data/word_ngrams.txt
+python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --word_id_path raw_data/word_id_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --ngrams_id_path raw_data/word_ngrams_id.txt --ngrams_path raw_data/word_ngrams.txt --min_count 5 --downsample 0.001
+mv raw_data/word_count_dict.txt data/dict/
+mv raw_data/word_id_dict.txt data/dict/
+mv raw_data/word_ngrams.txt data/dict/
+mv raw_data/word_ngrams_id.txt data/dict/
+
+rm -rf data/train/*
+rm -rf data/test/*
+python preprocess.py --data_resplit --file_nums 24 --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
+
+# download test data
+wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
+tar xzvf test_dir.tar -C raw_data
+mv raw_data/data/test_dir/* data/test/
+rm -rf raw_data
--- a/models/recall/fasttext/evaluate_reader.py
+++ b/models/recall/fasttext/evaluate_reader.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+
+import six
+
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+
+
+class TrainReader(Reader):
+    def init(self):
+        dict_path = envs.get_global_env(
+            "dataset.dataset_infer.word_id_dict_path")
+        self.min_n = envs.get_global_env("hyper_parameters.min_n")
+        self.max_n = envs.get_global_env("hyper_parameters.max_n")
+        self.word_to_id = dict()
+        self.id_to_word = dict()
+        with io.open(dict_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                self.word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
+                self.id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
+        self.dict_size = len(self.word_to_id)
+
+    def computeSubwords(self, word):
+        ngrams = set()
+        for i in range(len(word) - self.min_n + 1):
+            for j in range(self.min_n, self.max_n + 1):
+                end = min(len(word), i + j)
+                ngrams.add("".join(word[i:end]))
+        return list(ngrams)
+
+    def native_to_unicode(self, s):
+        if self._is_unicode(s):
+            return s
+        try:
+            return self._to_unicode(s)
+        except UnicodeDecodeError:
+            res = self._to_unicode(s, ignore_errors=True)
+            return res
+
+    def _is_unicode(self, s):
+        if six.PY2:
+            if isinstance(s, unicode):
+                return True
+        else:
+            if isinstance(s, str):
+                return True
+        return False
+
+    def _to_unicode(self, s, ignore_errors=False):
+        if self._is_unicode(s):
+            return s
+        error_mode = "ignore" if ignore_errors else "strict"
+        return s.decode("utf-8", errors=error_mode)
+
+    def strip_lines(self, line, vocab):
+        return self._replace_oov(vocab, self.native_to_unicode(line))
+
+    def _replace_oov(self, original_vocab, line):
+        """Replace out-of-vocab words with "<UNK>".
+      This maintains compatibility with published results.
+      Args:
+        original_vocab: a set of strings (The standard vocabulary for the dataset)
+        line: a unicode string - a space-delimited sequence of words.
+      Returns:
+        a unicode string - a space-delimited sequence of words.
+      """
+        return u" ".join([
+            "<" + word + ">"
+            if "<" + word + ">" in original_vocab else u"<UNK>"
+            for word in line.split()
+        ])
+
+    def generate_sample(self, line):
+        def reader():
+            if ':' in line:
+                pass
+            features = self.strip_lines(line.lower(), self.word_to_id)
+            features = features.split()
+            inputs = []
+            for item in features:
+                if item == "<UNK>":
+                    inputs.append([self.word_to_id[item]])
+                else:
+                    ngrams = self.computeSubwords(item)
+                    res = []
+                    res.append(self.word_to_id[item])
+                    for _ in ngrams:
+                        res.append(self.word_to_id[_])
+                    inputs.append(res)
+            yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
+                   ('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
+
+        return reader
--- a/models/recall/fasttext/model.py
+++ b/models/recall/fasttext/model.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import Model as ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.is_distributed = True if envs.get_trainer(
+        ) == "CtrTrainer" else False
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+        self.decay_steps = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_steps")
+        self.decay_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.decay_rate")
+
+    def input_data(self, is_infer=False, **kwargs):
+        if is_infer:
+            analogy_a = fluid.data(
+                name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
+            analogy_b = fluid.data(
+                name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
+            analogy_c = fluid.data(
+                name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
+            analogy_d = fluid.data(
+                name="analogy_d", shape=[None, 1], dtype='int64')
+            return [analogy_a, analogy_b, analogy_c, analogy_d]
+
+        input_word = fluid.data(
+            name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
+        true_word = fluid.data(
+            name='true_label', shape=[None, 1], lod_level=1, dtype='int64')
+        if self.with_shuffle_batch:
+            return [input_word, true_word]
+
+        neg_word = fluid.data(
+            name="neg_label", shape=[None, self.neg_num], dtype='int64')
+        return [input_word, true_word, neg_word]
+
+    def net(self, inputs, is_infer=False):
+        if is_infer:
+            self.infer_net(inputs)
+            return
+
+        def embedding_layer(input,
+                            table_name,
+                            initializer_instance=None,
+                            sequence_pool=False):
+            emb = fluid.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=fluid.ParamAttr(
+                    name=table_name, initializer=initializer_instance), )
+            if sequence_pool:
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
+            return emb
+
+        init_width = 1.0 / self.sparse_feature_dim
+        emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
+        emb_w_initializer = fluid.initializer.Constant(value=0.0)
+
+        input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
+        input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
+        true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
+                                     True)
+        true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
+
+        if self.with_shuffle_batch:
+            neg_emb_w_list = []
+            for i in range(self.neg_num):
+                neg_emb_w_list.append(
+                    fluid.contrib.layers.shuffle_batch(
+                        true_emb_w))  # shuffle true_word
+            neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
+            neg_emb_w = fluid.layers.reshape(
+                neg_emb_w_concat,
+                shape=[-1, self.neg_num, self.sparse_feature_dim])
+        else:
+            neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
+        true_logits = fluid.layers.reduce_sum(
+            fluid.layers.elementwise_mul(input_emb, true_emb_w),
+            dim=1,
+            keep_dim=True)
+
+        input_emb_re = fluid.layers.reshape(
+            input_emb, shape=[-1, 1, self.sparse_feature_dim])
+        neg_matmul = fluid.layers.matmul(
+            input_emb_re, neg_emb_w, transpose_y=True)
+        neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
+
+        logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
+        label_ones = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(true_logits)[0], 1],
+            value=1.0,
+            dtype='float32')
+        label_zeros = fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(neg_logits)[0], 1],
+            value=0.0,
+            dtype='float32')
+        label = fluid.layers.concat([label_ones, label_zeros], axis=0)
+
+        loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
+        avg_cost = fluid.layers.reduce_sum(loss)
+        
+	global_right_cnt = fluid.layers.create_global_var(
+            name="global_right_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_total_cnt = fluid.layers.create_global_var(
+            name="global_total_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_right_cnt.stop_gradient = True
+        global_total_cnt.stop_gradient = True 
+        self._cost = avg_cost
+        self._metrics["LOSS"] = avg_cost
+
+    def optimizer(self):
+        optimizer = fluid.optimizer.SGD(
+            learning_rate=fluid.layers.exponential_decay(
+                learning_rate=self.learning_rate,
+                decay_steps=self.decay_steps,
+                decay_rate=self.decay_rate,
+                staircase=True))
+        return optimizer
+
+    def infer_net(self, inputs):
+        def embedding_layer(input,
+                            table_name,
+                            initializer_instance=None,
+                            sequence_pool=False):
+            emb = fluid.embedding(
+                input=input,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=table_name)
+            if sequence_pool:
+                emb = fluid.layers.sequence_pool(
+                    input=emb, pool_type='average')
+            return emb
+
+        all_label = np.arange(self.sparse_feature_number).reshape(
+            self.sparse_feature_number).astype('int32')
+        self.all_label = fluid.layers.cast(
+            x=fluid.layers.assign(all_label), dtype='int64')
+        emb_all_label = embedding_layer(self.all_label, "emb")
+        emb_a = embedding_layer(inputs[0], "emb", sequence_pool=True)
+        emb_b = embedding_layer(inputs[1], "emb", sequence_pool=True)
+        emb_c = embedding_layer(inputs[2], "emb", sequence_pool=True)
+
+        target = fluid.layers.elementwise_add(
+            fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
+
+        emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
+        dist = fluid.layers.matmul(
+            x=target, y=emb_all_label_l2, transpose_y=True)
+        values, pred_idx = fluid.layers.topk(input=dist, k=4)
+        label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
+        label_ones = fluid.layers.fill_constant_batch_size_like(
+            label, shape=[-1, 1], value=1.0, dtype='float32')
+        right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
+            fluid.layers.equal(pred_idx, label), dtype='float32'))
+        total_cnt = fluid.layers.reduce_sum(label_ones)
+
+        global_right_cnt = fluid.layers.create_global_var(
+            name="global_right_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_total_cnt = fluid.layers.create_global_var(
+            name="global_total_cnt",
+            persistable=True,
+            dtype='float32',
+            shape=[1],
+            value=0)
+        global_right_cnt.stop_gradient = True
+        global_total_cnt.stop_gradient = True
+
+        global_total_cnt = fluid.layers.Print(global_total_cnt)
+
+        tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
+        fluid.layers.assign(tmp1, global_right_cnt)
+        
+        total_cnt = fluid.layers.Print(total_cnt)
+        tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
+        fluid.layers.assign(tmp2, global_total_cnt)
+
+        global_right_cnt = fluid.layers.Print(global_right_cnt)
+        global_total_cnt = fluid.layers.Print(global_total_cnt)
+
+        acc = fluid.layers.elementwise_div(
+            global_right_cnt, global_total_cnt, name="total_acc")
+        self._infer_results['acc'] = acc
--- a/models/recall/fasttext/preprocess.py
+++ b/models/recall/fasttext/preprocess.py
+# -*- coding: utf-8 -*
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import math
+import os
+import random
+import re
+import six
+
+import argparse
+
+prog = re.compile("[^a-z ]", flags=0)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle Fluid word2 vector preprocess")
+    parser.add_argument(
+        '--build_dict_corpus_dir', type=str, help="The dir of corpus")
+    parser.add_argument(
+        '--input_corpus_dir', type=str, help="The dir of input corpus")
+    parser.add_argument(
+        '--output_corpus_dir', type=str, help="The dir of output corpus")
+    parser.add_argument(
+        '--dict_path',
+        type=str,
+        default='./dict',
+        help="The path of dictionary ")
+    parser.add_argument(
+        '--word_id_path',
+        type=str,
+        default='./word_id',
+        help="The path of word_id ")
+    parser.add_argument(
+        '--ngrams_path',
+        type=str,
+        default='./word_ngrams',
+        help="The path of word_ngrams ")
+    parser.add_argument(
+	'--ngrams_id_path',
+	type=str,
+        default='./word_ngrams_id',
+        help="The path of word_ngrams_id ")
+    parser.add_argument(
+        '--min_count',
+        type=int,
+        default=5,
+        help="If the word count is less then min_count, it will be removed from dict"
+    )
+    parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
+    parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
+    parser.add_argument(
+        '--file_nums',
+        type=int,
+        default=1024,
+        help="re-split input corpus file nums")
+    parser.add_argument(
+        '--downsample',
+        type=float,
+        default=0.001,
+        help="filter word by downsample")
+    parser.add_argument(
+        '--filter_corpus',
+        action='store_true',
+        default=False,
+        help='Filter corpus')
+    parser.add_argument(
+        '--build_dict',
+        action='store_true',
+        default=False,
+        help='Build dict from corpus')
+    parser.add_argument(
+        '--data_resplit',
+        action='store_true',
+        default=False,
+        help='re-split input corpus files')
+    return parser.parse_args()
+
+
+def text_strip(text):
+    # English Preprocess Rule
+    return prog.sub("", text.lower())
+
+
+# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
+# Unicode utility functions that work with Python 2 and 3
+def native_to_unicode(s):
+    if _is_unicode(s):
+        return s
+    try:
+        return _to_unicode(s)
+    except UnicodeDecodeError:
+        res = _to_unicode(s, ignore_errors=True)
+        return res
+
+
+def _is_unicode(s):
+    if six.PY2:
+        if isinstance(s, unicode):
+            return True
+    else:
+        if isinstance(s, str):
+            return True
+    return False
+
+
+def _to_unicode(s, ignore_errors=False):
+    if _is_unicode(s):
+        return s
+    error_mode = "ignore" if ignore_errors else "strict"
+    return s.decode("utf-8", errors=error_mode)
+
+
+def filter_corpus(args):
+    """
+    filter corpus and convert id.
+    """
+    word_count = dict()
+    word_to_id_ = dict()
+    word_all_count = 0
+    id_counts = []
+    word_id = 0
+    # read dict
+    with io.open(args.dict_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            word, count = line.split()[0], int(line.split()[1])
+            word_count[word] = count
+            word_to_id_[word] = word_id
+            word_id += 1
+            id_counts.append(count)
+            word_all_count += count
+
+    word_ngrams = dict()
+    with io.open(args.ngrams_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            word, ngrams = line.rstrip().split(':')
+            ngrams = ngrams.split()
+            ngrams = [str(word_to_id_[_]) for _ in ngrams]
+            word_ngrams[word_to_id_[word]] = ' '.join(ngrams)
+
+    with io.open(args.ngrams_id_path, 'w+', encoding='utf-8') as fid:
+        for k, v in word_ngrams.items():
+            fid.write(u'{} {}\n'.format(k, v))
+
+    # write word2id file
+    print("write word2id file to : " + args.dict_path + "_word_to_id_")
+    with io.open(
+            args.word_id_path, 'w+', encoding='utf-8') as fid:
+        for k, v in word_to_id_.items():
+            fid.write(k + " " + str(v) + '\n')
+    # filter corpus and convert id
+    if not os.path.exists(args.output_corpus_dir):
+        os.makedirs(args.output_corpus_dir)
+    for file in os.listdir(args.input_corpus_dir):
+        with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
+                     "w") as wf:
+            with io.open(
+                    args.input_corpus_dir + '/' + file,
+                    encoding='utf-8') as rf:
+                print(args.input_corpus_dir + '/' + file)
+                for line in rf:
+                    signal = False
+                    line = text_strip(line)
+                    words = line.split()
+                    write_line = ""
+                    for item in words:
+                        if item in word_count:
+                            idx = word_to_id_[item]
+                        else:
+                            idx = word_to_id_[native_to_unicode('<UNK>')]
+                        count_w = id_counts[idx]
+                        corpus_size = word_all_count
+                        keep_prob = (
+                            math.sqrt(count_w /
+                                      (args.downsample * corpus_size)) + 1
+                        ) * (args.downsample * corpus_size) / count_w
+                        r_value = random.random()
+                        if r_value > keep_prob:
+                            continue
+                        write_line += str(idx)
+                        write_line += ","
+                        signal = True
+                    if signal:
+                        write_line = write_line[:-1] + "\n"
+                        wf.write(_to_unicode(write_line))
+
+
+def computeSubwords(word, min_n, max_n):
+    ngrams = set()
+    for i in range(len(word) - min_n + 1):
+        for j in range(min_n, max_n + 1):
+            end = min(len(word), i + j)
+            ngrams.add("".join(word[i:end]))
+    return list(ngrams)
+
+
+def build_dict(args):
+    """
+    proprocess the data, generate dictionary and save into dict_path.
+    :param corpus_dir: the input data dir.
+    :param dict_path: the generated dict path. the data in dict is "word count"
+    :param min_count:
+    :return:
+    """
+    # word to count
+
+    word_count = dict()
+
+    for file in os.listdir(args.build_dict_corpus_dir):
+        with io.open(
+                args.build_dict_corpus_dir + "/" + file,
+                encoding='utf-8') as f:
+            print("build dict : ", args.build_dict_corpus_dir + "/" + file)
+            for line in f:
+                line = text_strip(line)
+                words = line.split()
+                for item in words:
+                    item = '<' + item + '>'
+                    if item in word_count:
+                        word_count[item] = word_count[item] + 1
+                    else:
+                        word_count[item] = 1
+
+    item_to_remove = []
+    for item in word_count:
+        if word_count[item] <= args.min_count:
+            item_to_remove.append(item)
+
+    unk_sum = 0
+    for item in item_to_remove:
+        unk_sum += word_count[item]
+        del word_count[item]
+    # sort by count
+    word_count[native_to_unicode('<UNK>')] = unk_sum
+
+    word_ngrams = dict()
+    ngrams_count = dict()
+    for item in word_count:
+        ngrams = computeSubwords(item, args.min_n, args.max_n)
+        word_ngrams[item] = ngrams
+        for sub_word in ngrams:
+            if sub_word not in ngrams_count:
+                ngrams_count[sub_word] = 1
+            else:
+                ngrams_count[sub_word] = ngrams_count[sub_word] + 1
+    ngrams_count = sorted(
+        ngrams_count.items(), key=lambda ngrams_count: -ngrams_count[1])
+
+    word_count = sorted(
+        word_count.items(), key=lambda word_count: -word_count[1])
+    with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
+        for k, v in word_count:
+            f.write(k + " " + str(v) + '\n')
+        for k, v in ngrams_count:
+            f.write(k + " " + str(v) + '\n')
+
+    with io.open(args.ngrams_path, 'w+', encoding='utf-8') as f:
+        for key in word_ngrams:
+            f.write(key + ":")
+            f.write(" ".join(word_ngrams[key]))
+            f.write(u'\n')
+
+
+def data_split(args):
+    raw_data_dir = args.input_corpus_dir
+    new_data_dir = args.output_corpus_dir
+    if not os.path.exists(new_data_dir):
+        os.mkdir(new_data_dir)
+    files = os.listdir(raw_data_dir)
+    print(files)
+    index = 0
+    contents = []
+    for file_ in files:
+        with open(os.path.join(raw_data_dir, file_), 'r') as f:
+            contents.extend(f.readlines())
+
+    num = int(args.file_nums)
+    lines_per_file = len(contents) / num
+    print("contents: ", str(len(contents)))
+    print("lines_per_file: ", str(lines_per_file))
+
+    for i in range(1, num + 1):
+        with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
+            data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
+                                                         len(contents))]
+            for line in data:
+                fout.write(line)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    if args.build_dict:
+        build_dict(args)
+    elif args.filter_corpus:
+        filter_corpus(args)
+    elif args.data_resplit:
+        data_split(args)
+    else:
+        print(
+            "error command line, please choose --build_dict or --filter_corpus")
--- a/models/recall/fasttext/reader.py
+++ b/models/recall/fasttext/reader.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+
+import numpy as np
+
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+
+
+class NumpyRandomInt(object):
+    def __init__(self, a, b, buf_size=1000):
+        self.idx = 0
+        self.buffer = np.random.random_integers(a, b, buf_size)
+        self.a = a
+        self.b = b
+
+    def __call__(self):
+        if self.idx == len(self.buffer):
+            self.buffer = np.random.random_integers(self.a, self.b,
+                                                    len(self.buffer))
+            self.idx = 0
+
+        result = self.buffer[self.idx]
+        self.idx += 1
+        return result
+
+
+class TrainReader(Reader):
+    def init(self):
+        dict_path = envs.get_global_env(
+            "dataset.dataset_train.word_count_dict_path")
+        word_ngrams_path = envs.get_global_env(
+            "dataset.dataset_train.word_ngrams_path")
+        self.window_size = envs.get_global_env("hyper_parameters.window_size")
+        self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
+        self.with_shuffle_batch = envs.get_global_env(
+            "hyper_parameters.with_shuffle_batch")
+        self.random_generator = NumpyRandomInt(1, self.window_size + 1)
+
+        self.word_ngrams = dict()
+        with io.open(word_ngrams_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.rstrip().split()
+                self.word_ngrams[str(line[0])] = map(int, line[1:])
+
+        self.cs = None
+        if not self.with_shuffle_batch:
+            id_counts = []
+            word_all_count = 0
+            with io.open(dict_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    word, count = line.split()[0], int(line.split()[1])
+                    id_counts.append(count)
+                    word_all_count += count
+            id_frequencys = [
+                float(count) / word_all_count for count in id_counts
+            ]
+            np_power = np.power(np.array(id_frequencys), 0.75)
+            id_frequencys_pow = np_power / np_power.sum()
+            self.cs = np.array(id_frequencys_pow).cumsum()
+
+    def get_context_words(self, words, idx):
+        """
+        Get the context word list of target word.
+        words: the words of the current line
+        idx: input word index
+        window_size: window size
+        """
+        target_window = self.random_generator()
+        start_point = idx - target_window  # if (idx - target_window) > 0 else 0
+        if start_point < 0:
+            start_point = 0
+        end_point = idx + target_window
+        targets = words[start_point:idx] + words[idx + 1:end_point + 1]
+        return targets
+
+    def generate_sample(self, line):
+        def reader():
+            word_ids = [w for w in line.split()]
+            for idx, target_id in enumerate(word_ids):
+                input_word = [int(target_id)]
+                if target_id in self.word_ngrams:
+                    input_word += self.word_ngrams[target_id]
+                context_word_ids = self.get_context_words(word_ids, idx)
+                for context_id in context_word_ids:
+                    output = [('input_word', input_word),
+                              ('true_label', [int(context_id)])]
+                    if not self.with_shuffle_batch:
+                        neg_array = self.cs.searchsorted(
+                            np.random.sample(self.neg_num))
+                        output += [('neg_label',
+                                    [int(str(i)) for i in neg_array])]
+                    yield output
+
+        return reader