From 44743622d4b894ba5fe7440d35007c87b3258db1 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 5 Nov 2021 09:50:19 +0000 Subject: [PATCH] filter example; cmvn stride and window int; libri/s1 conf --- examples/aishell/s0/local/data.sh | 4 +- examples/aishell/s1/local/data.sh | 4 +- examples/callcenter/s1/local/data.sh | 4 +- examples/dataset/librispeech/librispeech.py | 10 +- examples/librispeech/s0/local/data.sh | 4 +- .../librispeech/s1/conf/chunk_conformer.yaml | 4 +- .../s1/conf/chunk_transformer.yaml | 4 +- examples/librispeech/s1/conf/conformer.yaml | 4 +- examples/librispeech/s1/conf/preprocess.yaml | 29 +++++ examples/librispeech/s1/conf/transformer.yaml | 4 +- examples/librispeech/s1/local/data.sh | 45 +++++--- examples/ted_en_zh/t0/local/data.sh | 4 +- examples/timit/s1/local/data.sh | 4 +- examples/tiny/s0/local/data.sh | 4 +- examples/tiny/s1/local/data.sh | 4 +- utils/compute_mean_std.py | 8 +- utils/format_data.py | 2 +- utils/remove_longshortdata.py | 102 ++++++++++++++++++ 18 files changed, 195 insertions(+), 49 deletions(-) create mode 100644 examples/librispeech/s1/conf/preprocess.yaml create mode 100755 utils/remove_longshortdata.py diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index d0a63dca..23f04f2a 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --spectrum_type="linear" \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --sample_rate=16000 \ --use_dB_normalization=True \ --num_samples=2000 \ diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8124d1bb..76e28075 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=16000 \ --use_dB_normalization=False \ --num_samples=-1 \ diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index 65e6e5fc..c40c752a 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=8000 \ --use_dB_normalization=False \ --num_samples=-1 \ diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index 0d535e13..69f0db59 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path): print("Creating manifest %s ..." % manifest_path) json_lines = [] total_sec = 0.0 - total_text = 0.0 + total_char = 0.0 total_num = 0 for subfolder, _, filelist in sorted(os.walk(data_dir)): @@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() - n_token = len(segments[1:]) + nchars = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( @@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path): })) total_sec += duration - total_text += n_token + total_char += nchars total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: @@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path): print(f"{subset}:", file=f) print(f"{total_num} utts", file=f) print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index 78a4ffc4..0f276cec 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 4d0e6ceb..2bfb0fb6 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index c7b53f95..fe533777 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 3bc942dc..c844baaa 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/librispeech/s1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 3cc17004..5a158f3e 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index b15ddce5..35f4e635 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -8,6 +8,11 @@ nbpe=5000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" +stride_ms=10 +window_ms=25 +sample_rate=16000 +feat_dim=80 + source ${MAIN_ROOT}/utils/parse_options.sh @@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - mv data/manifest.${set} data/manifest.${set}.raw + for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${sub} data/manifest.${sub}.raw done rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw - for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set}.raw >> data/manifest.train.raw + for sub in train-clean-100 train-clean-360 train-other-500; do + cat data/manifest.${sub}.raw >> data/manifest.train.raw done - for set in dev-clean dev-other; do - cat data/manifest.${set}.raw >> data/manifest.dev.raw + for sub in dev-clean dev-other; do + cat data/manifest.${sub}.raw >> data/manifest.dev.raw done - for set in test-clean test-other; do - cat data/manifest.${set}.raw >> data/manifest.test.raw + for sub in test-clean test-other; do + cat data/manifest.${sub}.raw >> data/manifest.test.raw done fi @@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ --spectrum_type="fbank" \ - --feat_dim=80 \ + --feat_dim=${feat_dim} \ --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --sample_rate=${sample_rate} \ + --stride_ms=${stride_ms} \ + --window_ms=${window_ms} \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -85,15 +90,15 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size - for set in train dev test dev-clean dev-other test-clean test-other; do + for sub in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" + --manifest_path="data/manifest.${sub}.raw" \ + --output_path="data/manifest.${sub}" if [ $? -ne 0 ]; then echo "Formt mnaifest failed. Terminated." @@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then }& done wait + + for sub in train dev; do + mv data/manifest.${sub} data/manifest.${sub}.fmt + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + for sub in train dev; do + remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub} + done fi echo "LibriSpeech Data preparation done." diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 23e5a9c7..ce58f539 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 66be39e2..e588e48d 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index bcf9e6d1..f1fb8cb1 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 3d7f19ab..87539d5e 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 296d272a..e47554dc 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -33,8 +33,8 @@ add_arg('spectrum_type', str, choices=['linear', 'mfcc', 'fbank']) add_arg('feat_dim', int, 13, "Audio feature dim.") add_arg('delta_delta', bool, False, "Audio feature with delta delta.") -add_arg('stride_ms', float, 10.0, "stride length in ms.") -add_arg('window_ms', float, 20.0, "stride length in ms.") +add_arg('stride_ms', int, 10, "stride length in ms.") +add_arg('window_ms', int, 20, "stride length in ms.") add_arg('sample_rate', int, 16000, "target sample rate.") add_arg('use_dB_normalization', bool, True, "do dB normalization.") add_arg('target_dB', int, -20, "target dB.") @@ -61,8 +61,8 @@ def main(): spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, - stride_ms=args.stride_ms, - window_ms=args.window_ms, + stride_ms=float(args.stride_ms), + window_ms=float(args.window_ms), n_fft=None, max_freq=None, target_sample_rate=args.sample_rate, diff --git a/utils/format_data.py b/utils/format_data.py index 49dcbee8..f9b5e6aa 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -122,7 +122,7 @@ def main(): fout.write(json.dumps(output_json) + '\n') count += 1 - print(f"Examples number: {count}") + print(f"{args.manifest_paths} Examples number: {count}") fout.close() diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py new file mode 100755 index 00000000..dcc05b23 --- /dev/null +++ b/utils/remove_longshortdata.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""remove longshort data from manifest""" +import logging +import argparse +import jsonlines + +from paddlespeech.s2t.utils.cli_utils import get_commandline_args + +# manifest after format +# josnline like this +# { +# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}], +# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}], +# "utt2spk": "111-2222", +# "utt": "111-2222-333" +# } + + +def get_parser(): + parser = argparse.ArgumentParser( + description="remove longshort data from format manifest", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") + parser.add_argument( + "--iaxis", default=0, type=int, help="multi inputs index, 0 is the first") + parser.add_argument( + "--oaxis", default=0, type=int, help="multi outputs index, 0 is the first") + parser.add_argument( + "--maxframes", default=2000, type=int, help="maxframes") + parser.add_argument( + "--minframes", default=10, type=int, help="minframes") + parser.add_argument( + "--maxchars", default=200, type=int, help="max tokens") + parser.add_argument( + "--minchars", default=0, type=int, help="min tokens") + parser.add_argument( + "--stride_ms", default=10, type=int, help="stride in ms unit.") + parser.add_argument( + "rspecifier", + type=str, + help="jsonl format manifest. e.g. manifest.jsonl") + parser.add_argument( + "wspecifier_or_wxfilename", + type=str, + help="Write specifier. e.g. manifest.jsonl") + return parser + + +def filter_input(args, line): + tmp = line['input'][args.iaxis] + if args.sound: + # second to frame + nframe = tmp['shape'][0] * 1000 / args.stride_ms + else: + nframe = tmp['shape'][0] + + if nframe < args.minframes or nframe > args.maxframes: + return True + else: + return False + + +def filter_output(args, line): + nchars = len(line['output'][args.iaxis]['text']) + if nchars < args.minchars or nchars > args.maxchars: + return True + else: + return False + + +def main(): + args = get_parser().parse_args() + + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + with jsonlines.open(args.rspecifier, 'r') as reader: + lines = list(reader) + logging.info(f"Example: {len(lines)}") + feat = lines[0]['input'][args.iaxis]['feat'] + args.soud = False + if feat.split('.')[-1] not in 'ark, scp': + args.sound = True + + count = 0 + filter = 0 + with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer: + for line in lines: + if filter_input(args, line) or filter_output(args, line): + filter += 1 + continue + writer.write(line) + count += 1 + logging.info(f"Example after filter: {count}\{filter}") + +if __name__ == '__main__': + main() \ No newline at end of file -- GitLab