Unverified commit aea40b82, authored by Jiabin Yang, committed by GitHub

Merge pull request #1624 from JiabinYang/fix_preprocess_without_3rd_part_dict

fix_preprocess_without_3rd_part_dict
@@ -25,7 +25,14 @@ cd data && ./download.sh && cd ..
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
-If you would like to use the supported third-party vocab, please set --other_dict_path to the directory where the vocab you will use is stored, and set --with_other_dict to use it.
+If you would like to use a custom dict in a format like:
+```bash
+<UNK>
+a
+b
+c
+```
+please set --other_dict_path to the directory where the dict you will use is stored, and set --with_other_dict to use it.
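For example, a full preprocessing command with a third-party dict might look like the sketch below (`./data/your_dict` is a placeholder path, not a file shipped with this repo; the other flags mirror the command above):
```bash
python preprocess.py \
    --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_other_dict \
    --other_dict_path ./data/your_dict
```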
## Train
The command-line options for training can be listed with `python train.py -h`.
@@ -40,6 +47,14 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
+If you would like to use a custom dict in a format like:
+```bash
+<UNK>
+a
+b
+c
+```
+please set --other_dict_path to the directory where the dict you will use is stored, and set --with_other_dict to use it.
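For example, a training command that reads such a dict might look like this sketch (the dict path is a placeholder, and the remaining flags mirror the command above):
```bash
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_hs --with_nce --is_local \
    --with_other_dict \
    --other_dict_path ./data/your_dict \
    2>&1 | tee train.log
```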
### Distributed Training
......
@@ -29,9 +29,16 @@ This model implements the skip-gram model of word2vec.
Preprocess the training data to generate a word dict.
```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
-if you would like to use our supported third party vocab, please set --other_dict_path as the directory of where you
+if you would like to use your own vocab, follow the format below:
+```bash
+<UNK>
+a
+b
+c
+```
+Then, please set --other_dict_path to the directory where you
save the vocab you will use, and set the --with_other_dict flag to use it.
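As a concrete illustration, the flags could be combined like this (a sketch only; `./data/your_dict` stands in for wherever you saved the dict):
```bash
python preprocess.py \
    --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_other_dict \
    --other_dict_path ./data/your_dict
```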
## Train
@@ -47,7 +54,8 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
if you would like to use our supported third-party vocab, please set --other_dict_path to the directory where you
save the vocab you will use, and set the --with_other_dict flag to use it.
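For instance (a sketch; the dict path is a placeholder and the other flags mirror the command above):
```bash
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_hs --with_nce --is_local \
    --with_other_dict \
    --other_dict_path ./data/your_dict \
    2>&1 | tee train.log
```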
### Distributed Training
Run a distributed training job with 2 pservers and 2 trainers on a single machine.
......
@@ -27,12 +27,6 @@ def parse_args():
         type=int,
         default=5,
         help="If the word count is less than freq, it will be removed from dict")
-    parser.add_argument(
-        '--is_local',
-        action='store_true',
-        required=False,
-        default=False,
-        help='Local train or not, (default: False)')
parser.add_argument(
'--with_other_dict',
@@ -203,26 +197,27 @@ def preprocess(args):
             for line in f:
                 word_count[native_to_unicode(line.strip())] = 1
-    if args.is_local:
-        for i in range(1, 100):
-            with io.open(
-                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
-                    encoding='utf-8') as f:
-                for line in f:
-                    line = strip_lines(line)
-                    words = line.split()
-                    if args.with_other_dict:
-                        for item in words:
-                            if item in word_count:
-                                word_count[item] = word_count[item] + 1
-                            else:
-                                word_count[native_to_unicode('<UNK>')] += 1
-                    else:
-                        for item in words:
-                            if item in word_count:
-                                word_count[item] = word_count[item] + 1
-                            else:
-                                word_count[item] = 1
+    for i in range(1, 100):
+        with io.open(
+                args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                encoding='utf-8') as f:
+            for line in f:
+                if args.with_other_dict:
+                    line = strip_lines(line)
+                    words = line.split()
+                    for item in words:
+                        if item in word_count:
+                            word_count[item] = word_count[item] + 1
+                        else:
+                            word_count[native_to_unicode('<UNK>')] += 1
+                else:
+                    line = text_strip(line)
+                    words = line.split()
+                    for item in words:
+                        if item in word_count:
+                            word_count[item] = word_count[item] + 1
+                        else:
+                            word_count[item] = 1
     item_to_remove = []
     for item in word_count:
         if word_count[item] <= args.freq:
......
@@ -105,7 +105,7 @@ class Word2VecReader(object):
return set(targets)
-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
def _reader():
for file in self.filelist:
with io.open(
@@ -116,7 +116,11 @@ class Word2VecReader(object):
                 count = 1
                 for line in f:
                     if self.trainer_id == count % self.trainer_num:
-                        line = preprocess.strip_lines(line, self.word_count)
+                        if with_other_dict:
+                            line = preprocess.strip_lines(line,
+                                                          self.word_count)
+                        else:
+                            line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
@@ -140,7 +144,11 @@
                 count = 1
                 for line in f:
                     if self.trainer_id == count % self.trainer_num:
-                        line = preprocess.strip_lines(line, self.word_count)
+                        if with_other_dict:
+                            line = preprocess.strip_lines(line,
+                                                          self.word_count)
+                        else:
+                            line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......
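For intuition, the sketch below mimics what the two branches above do to a line of text. It is illustrative only: the real strip_lines and text_strip live in preprocess.py, and the normalization assumed here is inferred from how the word counts are gathered in that file.
```python
# Illustrative stand-ins for preprocess.text_strip and preprocess.strip_lines.
word_count = {'<UNK>': 1, 'a': 3, 'b': 2}  # toy third-party dict

def text_strip_sketch(line):
    # Default path: only normalize the raw text; tokens are matched
    # against the model's own dict later.
    return line.lower().strip()

def strip_lines_sketch(line, word_count):
    # Third-party-dict path: tokens absent from the external dict are
    # mapped to '<UNK>' so every token still hits a valid dict entry.
    tokens = line.lower().strip().split()
    return ' '.join(t if t in word_count else '<UNK>' for t in tokens)

print(strip_lines_sketch('a b zzz', word_count))  # -> 'a b <UNK>'
```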
@@ -116,6 +116,13 @@ def parse_args():
         default=False,
         help='Do inference every 100 batches (default: False)')
+    parser.add_argument(
+        '--with_other_dict',
+        action='store_true',
+        required=False,
+        default=False,
+        help='Whether to use a third-party dict (default: False)')
parser.add_argument(
'--rank_num',
type=int,
@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     py_reader.decorate_tensor_provider(
         convert_python_to_tensor(args.batch_size,
-                                  reader.train((args.with_hs or (
-                                      not args.with_nce))), (args.with_hs or (
-                                          not args.with_nce))))
+                                  reader.train((args.with_hs or (
+                                      not args.with_nce)), args.with_other_dict),
+                                  (args.with_hs or (not args.with_nce))))
place = fluid.CPUPlace()
@@ -261,7 +268,7 @@
             args.dict_path, args.train_data_path, filelist, 0, 1)
     else:
         trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
-        trainers = int(os.environ["PADDLE_TRAINERS"])
+        trainer_num = int(os.environ["PADDLE_TRAINERS"])
word2vec_reader = reader.Word2VecReader(args.dict_path,
args.train_data_path, filelist,
trainer_id, trainer_num)
......
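The non-local branch above reads the trainer topology from environment variables. A minimal sketch of launching one trainer by hand follows; it sets only the two variables the code shown actually reads, and a real pserver/trainer launcher would export more than this:
```bash
# Sketch: start trainer 0 of 2 (PADDLE_TRAINER_ID and PADDLE_TRAINERS are
# exactly the variables read in train() above).
export PADDLE_TRAINER_ID=0
export PADDLE_TRAINERS=2
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    2>&1 | tee trainer0.log
```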