From 75c8018eabe1509bf7ca20030eb90534949fc7a1 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 4 Feb 2021 03:08:14 +0000 Subject: [PATCH] refactor tiny egs --- examples/librispeech/local/librispeech.py | 146 ++++++++++++++++++ examples/tiny/README.md | 4 + examples/tiny/{ => local}/run_data.sh | 26 ++-- examples/tiny/{ => local}/run_infer.sh | 16 +- examples/tiny/{ => local}/run_infer_golden.sh | 18 +-- examples/tiny/{ => local}/run_test.sh | 16 +- examples/tiny/{ => local}/run_test_golden.sh | 18 +-- examples/tiny/{ => local}/run_train.sh | 16 +- examples/tiny/{ => local}/run_tune.sh | 14 +- examples/tiny/path.sh | 8 + examples/tiny/run.sh | 24 +++ requirements.txt | 1 + 12 files changed, 238 insertions(+), 69 deletions(-) create mode 100644 examples/librispeech/local/librispeech.py create mode 100644 examples/tiny/README.md rename examples/tiny/{ => local}/run_data.sh (51%) rename examples/tiny/{ => local}/run_infer.sh (64%) rename examples/tiny/{ => local}/run_infer_golden.sh (62%) rename examples/tiny/{ => local}/run_test.sh (66%) rename examples/tiny/{ => local}/run_test_golden.sh (63%) rename examples/tiny/{ => local}/run_train.sh (69%) rename examples/tiny/{ => local}/run_tune.sh (67%) create mode 100644 examples/tiny/path.sh create mode 100644 examples/tiny/run.sh diff --git a/examples/librispeech/local/librispeech.py b/examples/librispeech/local/librispeech.py new file mode 100644 index 00000000..8a136453 --- /dev/null +++ b/examples/librispeech/local/librispeech.py @@ -0,0 +1,146 @@ +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" + +import distutils.util +import os +import sys +import argparse +import soundfile +import json +import codecs +import io +from data_utils.utility import download, unpack + +URL_ROOT = "http://www.openslr.org/resources/12" +URL_ROOT = "https://openslr.magicdatatech.com/resources/12" +URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" +URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" +URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz" +URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz" +URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" +URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" + +MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" +MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" +MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" +MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" +MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default='~/.cache/paddle/dataset/speech/libri', + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(subfolder, text_filelist[0]) + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(subfolder, segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=URL_TEST_CLEAN, + md5sum=MD5_TEST_CLEAN, + target_dir=os.path.join(args.target_dir, "test-clean"), + manifest_path=args.manifest_prefix + ".test-clean") + prepare_dataset( + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") + if args.full_download: + prepare_dataset( + url=URL_TRAIN_CLEAN_100, + md5sum=MD5_TRAIN_CLEAN_100, + target_dir=os.path.join(args.target_dir, "train-clean-100"), + manifest_path=args.manifest_prefix + ".train-clean-100") + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other") + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other") + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360") + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500") + + +if __name__ == '__main__': + main() diff --git a/examples/tiny/README.md b/examples/tiny/README.md new file mode 100644 index 00000000..ffa6621f --- /dev/null +++ b/examples/tiny/README.md @@ -0,0 +1,4 @@ +# Tiny Example + +1. `source path.sh` +2. `bash run.sh` diff --git a/examples/tiny/run_data.sh b/examples/tiny/local/run_data.sh similarity index 51% rename from examples/tiny/run_data.sh rename to examples/tiny/local/run_data.sh index dd75ddad..2acbe97b 100644 --- a/examples/tiny/run_data.sh +++ b/examples/tiny/local/run_data.sh @@ -1,16 +1,14 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # prepare folder -if [ ! -e data/tiny ]; then - mkdir data/tiny +if [ ! -e data ]; then + mkdir data fi # download data, generate manifests -PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \ ---manifest_prefix='data/tiny/manifest' \ ---target_dir='./dataset/librispeech' \ +PYTHONPATH=.:$PYTHONPATH python3 ../librispeech/local/librispeech.py \ +--manifest_prefix='data/manifest' \ +--target_dir="${MAIN_ROOT}/dataset/librispeech" \ --full_download='False' if [ $? -ne 0 ]; then @@ -18,13 +16,13 @@ if [ $? -ne 0 ]; then exit 1 fi -head -n 64 data/tiny/manifest.dev-clean > data/tiny/manifest.tiny +head -n 64 data/manifest.dev-clean > data/manifest.tiny # build vocabulary -python3 tools/build_vocab.py \ +python3 ${MAIN_ROOT}/tools/build_vocab.py \ --count_threshold=0 \ ---vocab_path='data/tiny/vocab.txt' \ ---manifest_paths='data/tiny/manifest.tiny' +--vocab_path='data/vocab.txt' \ +--manifest_paths='data/manifest.tiny' if [ $? -ne 0 ]; then echo "Build vocabulary failed. Terminated." @@ -33,11 +31,11 @@ fi # compute mean and stddev for normalizer -python3 tools/compute_mean_std.py \ ---manifest_path='data/tiny/manifest.tiny' \ +python3 ${MAIN_ROOT}/tools/compute_mean_std.py \ +--manifest_path='data/manifest.tiny' \ --num_samples=64 \ --specgram_type='linear' \ ---output_path='data/tiny/mean_std.npz' +--output_path='data/mean_std.npz' if [ $? -ne 0 ]; then echo "Compute mean and stddev failed. Terminated." diff --git a/examples/tiny/run_infer.sh b/examples/tiny/local/run_infer.sh similarity index 64% rename from examples/tiny/run_infer.sh rename to examples/tiny/local/run_infer.sh index d88f4526..09e86079 100644 --- a/examples/tiny/run_infer.sh +++ b/examples/tiny/local/run_infer.sh @@ -1,9 +1,7 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # download language model -cd models/lm > /dev/null +cd $MAIN_ROOT/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 @@ -13,7 +11,7 @@ cd - > /dev/null # infer CUDA_VISIBLE_DEVICES=0 \ -python3 -u infer.py \ +python3 -u $MAIN_ROOT/infer.py \ --num_samples=10 \ --beam_size=500 \ --num_proc_bsearch=8 \ @@ -27,11 +25,11 @@ python3 -u infer.py \ --use_gru=False \ --use_gpu=True \ --share_rnn_weights=True \ ---infer_manifest='data/tiny/manifest.test-clean' \ ---mean_std_path='data/tiny/mean_std.npz' \ ---vocab_path='data/tiny/vocab.txt' \ ---model_path='./checkpoints/tiny/step_final' \ ---lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--infer_manifest='data/manifest.test-clean' \ +--mean_std_path='data/mean_std.npz' \ +--vocab_path='data/vocab.txt' \ +--model_path='checkpoints/step_final' \ +--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method='ctc_beam_search' \ --error_rate_type='wer' \ --specgram_type='linear' diff --git a/examples/tiny/run_infer_golden.sh b/examples/tiny/local/run_infer_golden.sh similarity index 62% rename from examples/tiny/run_infer_golden.sh rename to examples/tiny/local/run_infer_golden.sh index d18c21f5..4d5bbecc 100644 --- a/examples/tiny/run_infer_golden.sh +++ b/examples/tiny/local/run_infer_golden.sh @@ -1,9 +1,7 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # download language model -cd models/lm > /dev/null +cd ${MAIN_ROOT}/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 @@ -12,7 +10,7 @@ cd - > /dev/null # download well-trained model -cd models/librispeech > /dev/null +cd ${MAIN_ROOT}/models/librispeech > /dev/null bash download_model.sh if [ $? -ne 0 ]; then exit 1 @@ -22,7 +20,7 @@ cd - > /dev/null # infer CUDA_VISIBLE_DEVICES=0 \ -python3 -u infer.py \ +python3 -u ${MAIN_ROOT}/infer.py \ --num_samples=10 \ --beam_size=500 \ --num_proc_bsearch=8 \ @@ -36,11 +34,11 @@ python3 -u infer.py \ --use_gru=False \ --use_gpu=True \ --share_rnn_weights=True \ ---infer_manifest='data/tiny/manifest.test-clean' \ ---mean_std_path='models/librispeech/mean_std.npz' \ ---vocab_path='models/librispeech/vocab.txt' \ ---model_path='models/librispeech' \ ---lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--infer_manifest='data/manifest.test-clean' \ +--mean_std_path="${MAIN_ROOT}/models/librispeech/mean_std.npz" \ +--vocab_path="${MAIN_ROOT}/models/librispeech/vocab.txt" \ +--model_path="${MAIN_ROOT}/models/librispeech" \ +--lang_model_path="${MAIN_ROOT}/models/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method='ctc_beam_search' \ --error_rate_type='wer' \ --specgram_type='linear' diff --git a/examples/tiny/run_test.sh b/examples/tiny/local/run_test.sh similarity index 66% rename from examples/tiny/run_test.sh rename to examples/tiny/local/run_test.sh index 81eafe23..bdc95512 100644 --- a/examples/tiny/run_test.sh +++ b/examples/tiny/local/run_test.sh @@ -1,9 +1,7 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # download language model -cd models/lm > /dev/null +cd $MAIN_ROOT/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 @@ -13,7 +11,7 @@ cd - > /dev/null # evaluate model CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u test.py \ +python3 -u $MAIN_ROOT/test.py \ --batch_size=128 \ --beam_size=500 \ --num_proc_bsearch=8 \ @@ -27,11 +25,11 @@ python3 -u test.py \ --use_gru=False \ --use_gpu=True \ --share_rnn_weights=True \ ---test_manifest='data/tiny/manifest.test-clean' \ ---mean_std_path='data/tiny/mean_std.npz' \ ---vocab_path='data/tiny/vocab.txt' \ ---model_path='checkpoints/tiny/step_final' \ ---lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--test_manifest='data/manifest.test-clean' \ +--mean_std_path='data/mean_std.npz' \ +--vocab_path='data/vocab.txt' \ +--model_path='checkpoints/step_final' \ +--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method='ctc_beam_search' \ --error_rate_type='wer' \ --specgram_type='linear' diff --git a/examples/tiny/run_test_golden.sh b/examples/tiny/local/run_test_golden.sh similarity index 63% rename from examples/tiny/run_test_golden.sh rename to examples/tiny/local/run_test_golden.sh index d82865f4..e818a932 100644 --- a/examples/tiny/run_test_golden.sh +++ b/examples/tiny/local/run_test_golden.sh @@ -1,9 +1,7 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # download language model -cd models/lm > /dev/null +cd $MAIN_ROOT/models/lm > /dev/null bash download_lm_en.sh if [ $? -ne 0 ]; then exit 1 @@ -12,7 +10,7 @@ cd - > /dev/null # download well-trained model -cd models/librispeech > /dev/null +cd $MAIN_ROOT/models/librispeech > /dev/null bash download_model.sh if [ $? -ne 0 ]; then exit 1 @@ -22,7 +20,7 @@ cd - > /dev/null # evaluate model CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ -python3 -u test.py \ +python3 -u $MAIN_ROOT/test.py \ --batch_size=128 \ --beam_size=500 \ --num_proc_bsearch=8 \ @@ -36,11 +34,11 @@ python3 -u test.py \ --use_gru=False \ --use_gpu=True \ --share_rnn_weights=True \ ---test_manifest='data/tiny/manifest.test-clean' \ ---mean_std_path='models/librispeech/mean_std.npz' \ ---vocab_path='models/librispeech/vocab.txt' \ ---model_path='models/librispeech' \ ---lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--test_manifest='data/manifest.test-clean' \ +--mean_std_path="$MAIN_ROOT/models/librispeech/mean_std.npz" \ +--vocab_path="$MAIN_ROOT/models/librispeech/vocab.txt" \ +--model_path="$MAIN_ROOT/models/librispeech" \ +--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ --decoding_method='ctc_beam_search' \ --error_rate_type='wer' \ --specgram_type='linear' diff --git a/examples/tiny/run_train.sh b/examples/tiny/local/run_train.sh similarity index 69% rename from examples/tiny/run_train.sh rename to examples/tiny/local/run_train.sh index fe5b6203..88c6fec3 100644 --- a/examples/tiny/run_train.sh +++ b/examples/tiny/local/run_train.sh @@ -1,12 +1,10 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # train model # if you wish to resume from an exists model, uncomment --init_from_pretrained_model export FLAGS_sync_nccl_allreduce=0 CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u train.py \ +python3 -u ${MAIN_ROOT}/train.py \ --batch_size=4 \ --num_epoch=20 \ --num_conv_layers=2 \ @@ -24,12 +22,12 @@ python3 -u train.py \ --use_gpu=True \ --is_local=True \ --share_rnn_weights=True \ ---train_manifest='data/tiny/manifest.tiny' \ ---dev_manifest='data/tiny/manifest.tiny' \ ---mean_std_path='data/tiny/mean_std.npz' \ ---vocab_path='data/tiny/vocab.txt' \ ---output_model_dir='./checkpoints/tiny' \ ---augment_conf_path='conf/augmentation.config' \ +--train_manifest='data/manifest.tiny' \ +--dev_manifest='data/manifest.tiny' \ +--mean_std_path='data/mean_std.npz' \ +--vocab_path='data/vocab.txt' \ +--output_model_dir='./checkpoints/' \ +--augment_conf_path="${MAIN_ROOT}/conf/augmentation.config" \ --specgram_type='linear' \ --shuffle_method='batch_shuffle_clipped' \ diff --git a/examples/tiny/run_tune.sh b/examples/tiny/local/run_tune.sh similarity index 67% rename from examples/tiny/run_tune.sh rename to examples/tiny/local/run_tune.sh index bec71111..67501acf 100644 --- a/examples/tiny/run_tune.sh +++ b/examples/tiny/local/run_tune.sh @@ -1,10 +1,8 @@ #! /usr/bin/env bash -cd ../.. > /dev/null - # grid-search for hyper-parameters in language model CUDA_VISIBLE_DEVICES=0,1,2,3 \ -python3 -u tools/tune.py \ +python3 -u $MAIN_ROOT/tools/tune.py \ --num_batches=-1 \ --batch_size=128 \ --beam_size=500 \ @@ -23,11 +21,11 @@ python3 -u tools/tune.py \ --use_gru=False \ --use_gpu=True \ --share_rnn_weights=True \ ---tune_manifest='data/tiny/manifest.dev-clean' \ ---mean_std_path='data/tiny/mean_std.npz' \ ---vocab_path='data/tiny/vocab.txt' \ ---model_path='models/librispeech' \ ---lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--tune_manifest='data/manifest.dev-clean' \ +--mean_std_path='data/mean_std.npz' \ +--vocab_path='data/vocab.txt' \ +--model_path="$MAIN_ROOT/models/librispeech" \ +--lang_model_path="$MAIN_ROOT/models/lm/common_crawl_00.prune01111.trie.klm" \ --error_rate_type='wer' \ --specgram_type='linear' diff --git a/examples/tiny/path.sh b/examples/tiny/path.sh new file mode 100644 index 00000000..fd1cebba --- /dev/null +++ b/examples/tiny/path.sh @@ -0,0 +1,8 @@ +export MAIN_ROOT=${PWD}/../../ + +export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH} +export LC_ALL=C + +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} diff --git a/examples/tiny/run.sh b/examples/tiny/run.sh new file mode 100644 index 00000000..c8e58913 --- /dev/null +++ b/examples/tiny/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +source path.sh + +# prepare data +bash ./local/run_data.sh + +# test pretrain model +bash ./local/run_test_golden.sh + +# test pretain model +bash ./local/run_infer_golden.sh + +# train model +bash ./local/run_train.sh + +# test model +bash ./local/run_test.sh + +# infer model +bash ./local/run_infer.sh + +# tune model +bash ./local/run_tune.sh diff --git a/requirements.txt b/requirements.txt index 8c57208a..6976c8f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ scipy==1.2.1 resampy==0.1.5 SoundFile==0.9.0.post1 python_speech_features +paddlepaddle-gpu==1.8.0.post107 -- GitLab