Commit b8555a93 authored by J JiabinYang

fix preprocess

Parent 28b8943b
...@@ -45,12 +45,15 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Use the async executor
```bash
python async_train.py --train_data_path ./async_data/ \
--dict_path data/1-billion_dict --with_nce --with_hs \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you keep the vocabulary you want to use, and turn on the --with_other_dict flag to use it.
### Distributed Training
Launch a distributed training job with 2 trainers and 2 pservers on the local machine. In the distributed setting, the training data is sharded by trainer id, so the shards do not overlap across trainers, which improves training efficiency.
......
...@@ -53,6 +53,7 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Use the async executor
```bash
python async_train.py --train_data_path ./async_data/ \
...@@ -60,6 +61,10 @@ python async_train.py --train_data_path ./async_data/ \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you keep the vocabulary you want to use, and turn on the --with_other_dict flag to use it.
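For orientation, the flag only switches how each raw training line is preprocessed inside the reader (see the reader.py hunk further down in this commit): with a third-party vocabulary the strip_lines path that takes the word-count dict is used, otherwise the plain text_strip cleanup. A minimal sketch of the switch, mirroring that hunk:

```python
# Inside Word2VecReader.train()'s line loop (mirrors the reader.py change in this commit).
if with_other_dict:
    line = preprocess.strip_lines(line, self.word_count)  # third-party vocab: uses the word-count dict
else:
    line = preprocess.text_strip(line)  # default 1-billion dict: plain text cleanup
```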
### Distributed Train
Run a 2 pserver 2 trainer distributed training job on a single machine.
In the distributed training setting, training data is split by trainer_id, so that training data
......
...@@ -174,7 +174,8 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train((args.with_hs or (not args.with_nce)),
args.with_other_dict),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
......
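For readers unfamiliar with the fluid-era reader pipeline, paddle.reader.shuffle and paddle.batch are decorators over reader creators (functions that return sample generators). A standalone sketch with a dummy reader, assuming the paddle 1.x API this repo targets:

```python
import paddle

def dummy_reader():
    # Stand-in for reader.train(with_hs, with_other_dict): yields one sample at a time.
    for i in range(1000):
        yield [i], [i + 1]

# Shuffle within a 500-sample buffer, then group samples into batches of 100.
batched = paddle.batch(
    paddle.reader.shuffle(dummy_reader, buf_size=500), batch_size=100)

for batch in batched():
    pass  # each batch is a list of up to 100 (target, label) samples
```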
...@@ -105,7 +105,7 @@ class Word2VecReader(object):
return set(targets)
def train(self, with_hs, with_other_dict):
def _reader():
for file in self.filelist:
with io.open(
...@@ -116,7 +116,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
...@@ -140,7 +144,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......
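The two preprocess helpers are not part of this diff; judging only from their call signatures, strip_lines consults the word-frequency dict while text_strip does not. A hypothetical sketch of that distinction (the function names are real, the bodies below are assumptions):

```python
import re

def text_strip(line):
    # Plain cleanup only: lowercase and strip non-alphanumeric characters
    # (assumed behaviour; the real implementation lives in preprocess.py).
    return re.sub(r"[^a-z0-9 ]", " ", line.lower())

def strip_lines(line, word_count):
    # Cleanup plus filtering against the supplied frequency dict, so only words
    # known to the third-party vocabulary survive (again, an assumption).
    words = text_strip(line).split()
    return " ".join(w for w in words if w in word_count)
```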
...@@ -116,6 +116,13 @@ def parse_args():
default=False,
help='Do inference every 100 batches , (default: False)')
parser.add_argument(
'--with_other_dict',
action='store_true',
required=False,
default=False,
help='if use other dict , (default: False)')
parser.add_argument(
'--rank_num',
type=int,
...@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce)), args.with_other_dict),
(args.with_hs or (not args.with_nce))))
place = fluid.CPUPlace()
......
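Putting the pieces together, the flag is parsed once and has to be forwarded to every reader.train() call site. A minimal smoke-test sketch; the module paths and the Word2VecReader constructor arguments are assumptions inferred from this diff, not confirmed by it:

```python
# Hypothetical smoke test for the new flag; module and constructor details are assumed.
from reader import Word2VecReader
from train import parse_args

args = parse_args()                       # now understands --with_other_dict
filelist = ["data/part-0"]                # placeholder training shard

reader = Word2VecReader(args.dict_path, args.train_data_path, filelist,
                        trainer_id=0, trainer_num=1)

# Same boolean pair train_loop() passes: hierarchical-softmax samples are
# generated whenever NCE is not used alone; preprocessing follows the new flag.
sample_reader = reader.train(args.with_hs or (not args.with_nce),
                             args.with_other_dict)

print(next(sample_reader()))              # train() returns a generator function
```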