Simplify code and comments.

c2e6378a · yangyaming · 1325cd9b · c2e6378a · c2e6378a
隐藏空白更改
内联并排

Showing 2 changed files with 17 additions and 18 deletions

tools/_init_paths.py tools/_init_paths.py +3 -0

tools/build_vocab.py tools/build_vocab.py +14 -18

未找到文件。
--- a/tools/_init_paths.py
+++ b/tools/_init_paths.py
 """Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function

 import os.path
 import sys

--- a/tools/build_vocab.py
+++ b/tools/build_vocab.py
-"""Build vocabulary dictionary from manifest files.
+"""Build vocabulary from manifest files.

 Each item in vocabulary file is a character.
 """
@@ -11,13 +11,14 @@ import codecs
 import json
 from collections import Counter
 import os.path
+import _init_paths
+from data_utils import utils

-parser = argparse.ArgumentParser(
-    description='Build vocabulary dictionary from transcription texts.')
+parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--manifest_paths",
    type=str,
-    help="Manifest paths for building vocabulary dictionary."
+    help="Manifest paths for building vocabulary. "
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
@@ -25,25 +26,20 @@ parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
-    help="Characters whose count below the threshold will be truncated. "
-    "(default: %(default)s)")
+    help="Characters whose counts are below the threshold will be truncated. "
+    "(default: %(default)i)")
 parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
-    help="Filepath to write vocabularies. (default: %(default)s)")
+    help="File path to write the vocabulary. (default: %(default)s)")
 args = parser.parse_args()


 def count_manifest(counter, manifest_path):
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise Exception('Error parsing manifest: %s, %s' % \
-                    (manifest_path, e))
-        text = json_data['text']
-        for char in text:
+    manifest_jsons = utils.read_manifest(manifest_path)
+    for line_json in manifest_jsons:
+        for char in line_json['text']:
            counter.update(char)


@@ -54,9 +50,9 @@ def main():

    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        for item_pair in count_sorted:
-            if item_pair[1] < args.count_threshold: break
-            fout.write(item_pair[0] + '\n')
+        for char, count in count_sorted:
+            if count < args.count_threshold: break
+            fout.write(char + '\n')


 if __name__ == '__main__':