From b8555a93b4d5aca679891e842c58978d617d37b6 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Wed, 9 Jan 2019 07:28:13 +0000
Subject: [PATCH] fix preprocess

---
 fluid/PaddleRec/word2vec/README.cn.md   |  3 +++
 fluid/PaddleRec/word2vec/README.md      |  5 +++++
 fluid/PaddleRec/word2vec/async_train.py |  3 ++-
 fluid/PaddleRec/word2vec/reader.py      | 14 +++++++++++---
 fluid/PaddleRec/word2vec/train.py       | 11 +++++++++--
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fluid/PaddleRec/word2vec/README.cn.md b/fluid/PaddleRec/word2vec/README.cn.md
index 37d3fd6d..f83444c1 100644
--- a/fluid/PaddleRec/word2vec/README.cn.md
+++ b/fluid/PaddleRec/word2vec/README.cn.md
@@ -45,12 +45,15 @@ python train.py \
     --with_nce --is_local \
     2>&1 | tee train.log
 ```
+
 With the async executor:
 ```bash
 python async_train.py --train_data_path ./async_data/ \
     --dict_path data/1-billion_dict --with_nce --with_hs \
     --epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
 ```
+
+If you want to use a supported third-party vocabulary, set --other_dict_path to the directory where the vocabulary you will use is stored, and set --with_other_dict to use it.
 ### Distributed Training
 Launch a local distributed training job with 2 trainers and 2 pservers. In the distributed setting, the training data is split by trainer id so that the data does not overlap between trainers, which improves training efficiency.
diff --git a/fluid/PaddleRec/word2vec/README.md b/fluid/PaddleRec/word2vec/README.md
index a823d822..aca7002a 100644
--- a/fluid/PaddleRec/word2vec/README.md
+++ b/fluid/PaddleRec/word2vec/README.md
@@ -53,6 +53,7 @@ python train.py \
     --with_nce --is_local \
     2>&1 | tee train.log
 ```
+
 With the async executor:
 ```bash
 python async_train.py --train_data_path ./async_data/ \
@@ -60,6 +61,10 @@ python async_train.py --train_data_path ./async_data/ \
     --dict_path data/1-billion_dict --with_nce --with_hs \
     --epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
 ```
+If you would like to use our supported third-party vocab, set --other_dict_path to the directory where the
+vocab you will use is saved, and turn on --with_other_dict to use it.
+
+
 ### Distributed Train
 Run a distributed training job with 2 pservers and 2 trainers on a single machine.
 In the distributed training setting, the training data is split by trainer_id so that the data does not overlap between trainers.
diff --git a/fluid/PaddleRec/word2vec/async_train.py b/fluid/PaddleRec/word2vec/async_train.py
index 29094d9a..91301088 100644
--- a/fluid/PaddleRec/word2vec/async_train.py
+++ b/fluid/PaddleRec/word2vec/async_train.py
@@ -174,7 +174,8 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
 def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            reader.train((args.with_hs or (not args.with_nce))),
+            reader.train((args.with_hs or (not args.with_nce)),
+                         args.with_other_dict),
             buf_size=args.batch_size * 100),
         batch_size=args.batch_size)
diff --git a/fluid/PaddleRec/word2vec/reader.py b/fluid/PaddleRec/word2vec/reader.py
index 1b1b6a9d..69ae84c8 100644
--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -105,7 +105,7 @@ class Word2VecReader(object):

         return set(targets)

-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
         def _reader():
             for file in self.filelist:
                 with io.open(
@@ -116,7 +116,11 @@ class Word2VecReader(object):
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
@@ -140,7 +144,11 @@ class Word2VecReader(object):
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py
index df32e0b4..6fbe7a79 100644
--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -116,6 +116,13 @@ def parse_args():
         default=False,
         help='Do inference every 100 batches , (default: False)')

+    parser.add_argument(
+        '--with_other_dict',
+        action='store_true',
+        required=False,
+        default=False,
+        help='Whether to use a third-party dict (default: False)')
+
     parser.add_argument(
         '--rank_num',
         type=int,
@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     py_reader.decorate_tensor_provider(
         convert_python_to_tensor(args.batch_size,
                                  reader.train((args.with_hs or (
-                                     not args.with_nce))), (args.with_hs or (
-                                     not args.with_nce))))
+                                     not args.with_nce)), args.with_other_dict),
+                                 (args.with_hs or (not args.with_nce))))

     place = fluid.CPUPlace()
--
GitLab
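To make the new code path concrete, here is a minimal sketch of the preprocessing branch this patch adds to Word2VecReader. Only the call signatures of text_strip(line) and strip_lines(line, word_count) come from the diff; their bodies below are assumed stand-ins for the real helpers in preprocess.py, which this patch does not show.

```python
import re


def text_strip(line):
    # Assumed behavior: basic cleanup that keeps lowercase words and spaces.
    return re.sub("[^a-z ]", " ", line.lower())


def strip_lines(line, word_count):
    # Assumed behavior: the same cleanup, plus dropping words that are
    # missing from the vocabulary's word_count table (useful when the
    # vocabulary comes from a third party and may not cover the corpus).
    words = text_strip(line).split()
    return " ".join(w for w in words if w in word_count)


def preprocess_line(line, word_count, with_other_dict):
    # Mirrors the branch added to Word2VecReader.train() in this patch:
    # a third-party dict takes the stricter strip_lines() path, while the
    # default dict only needs text_strip().
    if with_other_dict:
        return strip_lines(line, word_count)
    return text_strip(line)
```

The design point is that the choice is made once per line at read time, keyed off the same with_other_dict flag that train.py now exposes on the command line.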
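And here is a sketch of how the flag travels from the command line into the updated reader.train(with_hs, with_other_dict) signature, matching the call sites the patch touches in train.py and async_train.py. The argparse defaults are illustrative, and word2vec_reader stands in for a constructed reader.Word2VecReader instance.

```python
import argparse

import paddle


def parse_args():
    # Only the flags relevant to this patch; names match train.py.
    parser = argparse.ArgumentParser()
    parser.add_argument('--with_other_dict', action='store_true', default=False)
    parser.add_argument('--with_hs', action='store_true', default=False)
    parser.add_argument('--with_nce', action='store_true', default=False)
    parser.add_argument('--batch_size', type=int, default=100)
    return parser.parse_args()


def build_train_reader(args, word2vec_reader):
    # word2vec_reader is a reader.Word2VecReader; after this patch its
    # train() takes with_other_dict as a second positional argument.
    return paddle.batch(
        paddle.reader.shuffle(
            word2vec_reader.train(args.with_hs or (not args.with_nce),
                                  args.with_other_dict),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
```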