Simplify code and comments.

9aaab22a · yangyaming · 11ca0ec6 · 9aaab22a · 9aaab22a
隐藏空白更改
内联并排

Showing 2 changed files with 17 additions and 18 deletions

deep_speech_2/tools/_init_paths.py deep_speech_2/tools/_init_paths.py +3 -0

deep_speech_2/tools/build_vocab.py deep_speech_2/tools/build_vocab.py +14 -18

未找到文件。
--- a/deep_speech_2/tools/_init_paths.py
+++ b/deep_speech_2/tools/_init_paths.py
 """Set up paths for DS2"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 import os.path
 import sys

--- a/deep_speech_2/tools/build_vocab.py
+++ b/deep_speech_2/tools/build_vocab.py
-"""Build vocabulary dictionary from manifest files.
+"""Build vocabulary from manifest files.
 Each item in vocabulary file is a character.
 """
@@ -11,13 +11,14 @@ import codecs
 import json
 from collections import Counter
 import os.path
+import _init_paths
+from data_utils import utils
-parser = argparse.ArgumentParser(
+parser = argparse.ArgumentParser(description=__doc__)
-    description='Build vocabulary dictionary from transcription texts.')
 parser.add_argument(
    "--manifest_paths",
    type=str,
-    help="Manifest paths for building vocabulary dictionary."
+    help="Manifest paths for building vocabulary."
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
@@ -25,25 +26,20 @@ parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
-    help="Characters whose count below the threshold will be truncated. "
+    help="Characters whose counts are below the threshold will be truncated. "
-    "(default: %(default)s)")
+    "(default: %(default)i)")
 parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
-    help="Filepath to write vocabularies. (default: %(default)s)")
+    help="File path to write the vocabulary. (default: %(default)s)")
 args = parser.parse_args()
 def count_manifest(counter, manifest_path):
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
+    manifest_jsons = utils.read_manifest(manifest_path)
-        try:
+    for line_json in manifest_jsons:
-            json_data = json.loads(json_line)
+        for char in line_json['text']:
-        except Exception as e:
-            raise Exception('Error parsing manifest: %s, %s' % \
-                    (manifest_path, e))
-        text = json_data['text']
-        for char in text:
            counter.update(char)
@@ -54,9 +50,9 @@ def main():
    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        for item_pair in count_sorted:
+        for char, count in count_sorted:
-            if item_pair[1] < args.count_threshold: break
+            if count < args.count_threshold: break
-            fout.write(item_pair[0] + '\n')
+            fout.write(char + '\n')
 if __name__ == '__main__':