fix_preprocess_without_3rd_part_dict

075d74cf · JiabinYang · 62148b78 · 075d74cf · 075d74cf · 075d74cf
5 changed files
--- a/fluid/PaddleRec/word2vec/README.cn.md
+++ b/fluid/PaddleRec/word2vec/README.cn.md
@@ -23,7 +23,7 @@ cd data && ./download.sh && cd ..
 对数据进行预处理以生成一个词典。

 ```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict --is_local
 ```
 如果您想使用我们支持的第三方词汇表，请将--other_dict_path设置为您存放将使用的词汇表的目录，并设置--with_other_dict使用它

@@ -40,6 +40,7 @@ python train.py \
        --with_hs --with_nce --is_local \
        2>&1 | tee train.log
 ```
+如果您想使用我们支持的第三方词汇表，请将--other_dict_path设置为您存放将使用的词汇表的目录，并设置--with_other_dict使用它

 ### 分布式训练


--- a/fluid/PaddleRec/word2vec/README.md
+++ b/fluid/PaddleRec/word2vec/README.md
@@ -47,7 +47,8 @@ python train.py \
        --with_hs --with_nce --is_local \
        2>&1 | tee train.log
 ```
-
+If you would like to use one of our supported third-party vocabularies, set --other_dict_path to the directory where
+the vocabulary is saved, and pass the --with_other_dict flag to enable it.

 ### Distributed Train
 Run a 2 pserver 2 trainer distribute training on a single machine.

--- a/fluid/PaddleRec/word2vec/preprocess.py
+++ b/fluid/PaddleRec/word2vec/preprocess.py
@@ -209,15 +209,17 @@ def preprocess(args):
                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
                    encoding='utf-8') as f:
                for line in f:
-                    line = strip_lines(line)
-                    words = line.split()
                    if args.with_other_dict:
+                        line = strip_lines(line)
+                        words = line.split()
                        for item in words:
                            if item in word_count:
                                word_count[item] = word_count[item] + 1
                            else:
                                word_count[native_to_unicode('<UNK>')] += 1
                    else:
+                        line = text_strip(line)
+                        words = line.split()
                        for item in words:
                            if item in word_count:
                                word_count[item] = word_count[item] + 1

--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -105,7 +105,7 @@ class Word2VecReader(object):

        return set(targets)

-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
        def _reader():
            for file in self.filelist:
                with io.open(
@@ -116,7 +116,11 @@ class Word2VecReader(object):
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_
@@ -140,7 +144,11 @@ class Word2VecReader(object):
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_

--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -116,6 +116,13 @@ def parse_args():
        default=False,
        help='Do inference every 100 batches , (default: False)')

+    parser.add_argument(
+        '--with_other_dict',
+        action='store_true',
+        required=False,
+        default=False,
+        help='if use other dict , (default: False)')
+
    parser.add_argument(
        '--rank_num',
        type=int,
@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(args.batch_size,
                                 reader.train((args.with_hs or (
-                                     not args.with_nce))), (args.with_hs or (
-                                         not args.with_nce))))
+                                     not args.with_nce)), args.with_other_dict),
+                                 (args.with_hs or (not args.with_nce))))

    place = fluid.CPUPlace()

@@ -261,7 +268,7 @@ def train(args):
            args.dict_path, args.train_data_path, filelist, 0, 1)
    else:
        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
-        trainers = int(os.environ["PADDLE_TRAINERS"])
+        trainer_num = int(os.environ["PADDLE_TRAINERS"])
        word2vec_reader = reader.Word2VecReader(args.dict_path,
                                                args.train_data_path, filelist,
                                                trainer_id, trainer_num)