提交 11ca0ec6 编写于 作者: Y yangyaming

Create 'tools' to hold tool scripts and add vocabulary dictionary building script.

上级 c7f57e50
......@@ -40,13 +40,13 @@ python datasets/librispeech/librispeech.py --help
### Preparing for Training
```
python compute_mean_std.py
python tools/compute_mean_std.py
```
It will compute mean and standard deviation for audio features, and save them to a file with a default name `./mean_std.npz`. This file will be used in both training and inference. The default feature of audio data is power spectrum, and the mfcc feature is also supported. To train and infer based on mfcc feature, please generate this file by
```
python compute_mean_std.py --specgram_type mfcc
python tools/compute_mean_std.py --specgram_type mfcc
```
and specify ```--specgram_type mfcc``` when running train.py, infer.py, evaluator.py or tune.py.
......@@ -54,7 +54,7 @@ and specify ```--specgram_type mfcc``` when running train.py, infer.py, evaluato
More help for arguments:
```
python compute_mean_std.py --help
python tools/compute_mean_std.py --help
```
### Training
......
"""Set up paths for DS2"""
import os.path
import sys
def add_path(path):
    """Put ``path`` at the front of ``sys.path`` unless it is already listed.

    :param path: Directory to make importable.
    :type path: str
    """
    if path in sys.path:
        # Already present: leave the existing entry (and its position) alone.
        return
    sys.path.insert(0, path)
# Directory that contains this file, derived from __file__.
this_dir = os.path.dirname(__file__)
# Prepend the parent of that directory (the project path) to sys.path via
# add_path, which is a no-op if the entry is already there.
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
"""Build vocabulary dictionary from manifest files.
Each item in vocabulary file is a character.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import codecs
import json
from collections import Counter
import os.path
# Command-line interface of the vocabulary builder: one or more manifest
# files in, one vocabulary file out, with an optional count threshold.
parser = argparse.ArgumentParser(
    description='Build vocabulary dictionary from transcription texts.')
parser.add_argument(
    "--manifest_paths",
    type=str,
    # Adjacent literals are concatenated verbatim, so the first one must end
    # with a space or --help prints "dictionary.You can ...".
    help="Manifest paths for building vocabulary dictionary. "
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
    help="Characters whose count below the threshold will be truncated. "
    "(default: %(default)s)")
parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
    help="Filepath to write vocabularies. (default: %(default)s)")
# Parsed when the module is executed; main() reads this module-level namespace
# (manifest_paths, count_threshold, vocab_path).
args = parser.parse_args()
def count_manifest(counter, manifest_path):
    """Add the character counts of one manifest file to ``counter``.

    Every line of the manifest is parsed as a JSON object, and each character
    of its 'text' field is counted.

    :param counter: Counter updated in place with per-character counts.
    :type counter: collections.Counter
    :param manifest_path: Path of a UTF-8 manifest file holding one JSON
                          object per line.
    :type manifest_path: str
    :raises Exception: If a line cannot be parsed as JSON.
    """
    # The context manager closes the file even when a line fails to parse.
    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        for json_line in manifest_file:
            try:
                json_data = json.loads(json_line)
            except ValueError as e:
                # json reports malformed input as ValueError; re-raise with
                # the manifest path so the offending file is identifiable.
                raise Exception('Error parsing manifest: %s, %s' %
                                (manifest_path, e))
            # Counter.update on a string counts each of its characters.
            counter.update(json_data['text'])
def main():
    """Count characters over all manifests and write the vocabulary file.

    Characters are written one per line, most frequent first. Writing stops
    at the first character whose count is below ``args.count_threshold``.
    """
    char_counts = Counter()
    for path in args.manifest_paths:
        count_manifest(char_counts, path)

    with codecs.open(args.vocab_path, 'w', 'utf-8') as vocab_file:
        # most_common() yields (char, count) pairs sorted by descending count.
        for char, count in char_counts.most_common():
            if count < args.count_threshold:
                # Counts only decrease from here on, so nothing else qualifies.
                break
            vocab_file.write(char + '\n')


if __name__ == '__main__':
    main()
......@@ -4,6 +4,7 @@ from __future__ import division
from __future__ import print_function
import argparse
import _init_paths
from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册