remove is_local for preprocess

18a0bbe8 · JiabinYang · fc4fe627 · 18a0bbe8 · 18a0bbe8 · 18a0bbe8
3 changed files
--- a/fluid/PaddleRec/word2vec/README.cn.md
+++ b/fluid/PaddleRec/word2vec/README.cn.md
@@ -23,7 +23,7 @@ cd data && ./download.sh && cd ..
 对数据进行预处理以生成一个词典。
 ```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict --is_local
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
 ```
 如果您想使用自定义的词典形如：
 ```bash

--- a/fluid/PaddleRec/word2vec/README.md
+++ b/fluid/PaddleRec/word2vec/README.md
@@ -29,9 +29,16 @@ This model implement a skip-gram model of word2vector.
 Preprocess the training data to generate a word dict.
 ```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
 ```
-if you would like to use our supported third party vocab, please set --other_dict_path as the directory of where you
+If you would like to use your own vocab, follow the format below:
+```bash
+<UNK>
+a
+b
+c
+```
+Then, please set --other_dict_path to the directory where you
 save the vocab you will use and set --with_other_dict flag on to using it.
 ## Train

--- a/fluid/PaddleRec/word2vec/preprocess.py
+++ b/fluid/PaddleRec/word2vec/preprocess.py
@@ -27,12 +27,6 @@ def parse_args():
        type=int,
        default=5,
        help="If the word count is less then freq, it will be removed from dict")
-    parser.add_argument(
-        '--is_local',
-        action='store_true',
-        required=False,
-        default=False,
-        help='Local train or not, (default: False)')
    parser.add_argument(
        '--with_other_dict',
@@ -203,28 +197,27 @@ def preprocess(args):
            for line in f:
                word_count[native_to_unicode(line.strip())] = 1
-    if args.is_local:
+    for i in range(1, 100):
-        for i in range(1, 100):
+        with io.open(
-            with io.open(
+                args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
-                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                encoding='utf-8') as f:
-                    encoding='utf-8') as f:
+            for line in f:
-                for line in f:
+                if args.with_other_dict:
-                    if args.with_other_dict:
+                    line = strip_lines(line)
-                        line = strip_lines(line)
+                    words = line.split()
-                        words = line.split()
+                    for item in words:
-                        for item in words:
+                        if item in word_count:
-                            if item in word_count:
+                            word_count[item] = word_count[item] + 1
-                                word_count[item] = word_count[item] + 1
+                        else:
-                            else:
+                            word_count[native_to_unicode('<UNK>')] += 1
-                                word_count[native_to_unicode('<UNK>')] += 1
+                else:
-                    else:
+                    line = text_strip(line)
-                        line = text_strip(line)
+                    words = line.split()
-                        words = line.split()
+                    for item in words:
-                        for item in words:
+                        if item in word_count:
-                            if item in word_count:
+                            word_count[item] = word_count[item] + 1
-                                word_count[item] = word_count[item] + 1
+                        else:
-                            else:
+                            word_count[item] = 1
-                                word_count[item] = 1
    item_to_remove = []
    for item in word_count:
        if word_count[item] <= args.freq: