diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 0976ec1ac6a27f5eaafced10e2c3e02dce62afec..7806aaa491bcde1c26969ead6e4c8032e6aec665 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -444,7 +444,7 @@ class U2Tester(U2Trainer):
         start_time = time.time()
         text_feature = self.test_loader.collate_fn.text_feature
         target_transcripts = self.ordid2token(texts, texts_len)
-        result_transcripts = self.model.decode(
+        result_transcripts, result_tokenids = self.model.decode(
             audio,
             audio_len,
             text_feature=text_feature,
@@ -462,14 +462,19 @@ class U2Tester(U2Trainer):
             simulate_streaming=cfg.simulate_streaming)
         decode_time = time.time() - start_time

-        for utt, target, result in zip(utts, target_transcripts,
-                                       result_transcripts):
+        for utt, target, result, rec_tids in zip(
+                utts, target_transcripts, result_transcripts, result_tokenids):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
             num_ins += 1
             if fout:
-                fout.write({"utt": utt, "ref": target, "hyp": result})
+                fout.write({
+                    "utt": utt,
+                    "refs": [target],
+                    "hyps": [result],
+                    "hyps_tokenid": [rec_tids],
+                })
             logger.info(f"Utt: {utt}")
             logger.info(f"Ref: {target}")
             logger.info(f"Hyp: {result}")
diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py
index c182c59865c5e1a01ebf4b261e9c468af1924234..18e29b28f95d228f2c7f856741c857a8fbe51f1b 100644
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@@ -390,6 +390,10 @@ class U2Tester(U2Trainer):

     def __init__(self, config, args):
         super().__init__(config, args)
+        self.text_feature = TextFeaturizer(
+            unit_type=self.config.collator.unit_type,
+            vocab_filepath=self.config.collator.vocab_filepath,
+            spm_model_prefix=self.config.collator.spm_model_prefix)

     def id2token(self, texts, texts_len, text_feature):
         """ ord() id to chr() chr """
@@ -413,15 +417,11 @@ class U2Tester(U2Trainer):
         error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer

         start_time = time.time()
-        text_feature = TextFeaturizer(
-            unit_type=self.config.collator.unit_type,
-            vocab_filepath=self.config.collator.vocab_filepath,
-            spm_model_prefix=self.config.collator.spm_model_prefix)
-        target_transcripts = self.id2token(texts, texts_len, text_feature)
-        result_transcripts = self.model.decode(
+        target_transcripts = self.id2token(texts, texts_len, self.text_feature)
+        result_transcripts, result_tokenids = self.model.decode(
             audio,
             audio_len,
-            text_feature=text_feature,
+            text_feature=self.text_feature,
             decoding_method=cfg.decoding_method,
             lang_model_path=cfg.lang_model_path,
             beam_alpha=cfg.alpha,
@@ -436,14 +436,19 @@ class U2Tester(U2Trainer):
             simulate_streaming=cfg.simulate_streaming)
         decode_time = time.time() - start_time

-        for utt, target, result in zip(utts, target_transcripts,
-                                       result_transcripts):
+        for i, (utt, target, result, rec_tids) in enumerate(zip(
+                utts, target_transcripts, result_transcripts, result_tokenids)):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
             num_ins += 1
             if fout:
-                fout.write({"utt": utt, "ref": target, "hyp": result})
+                fout.write({
+                    "utt": utt,
+                    "refs": [target],
+                    "hyps": [result],
+                    "hyps_tokenid": [rec_tids],
+                })
             logger.info(f"Utt: {utt}")
             logger.info(f"Ref: {target}")
             logger.info(f"Hyp: {result}")
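With this change `model.decode` returns the detokenized transcripts together with the raw token ids, and the test loop dumps one jsonlines record per utterance with list-valued fields ("refs"/"hyps"/"hyps_tokenid"), so downstream scoring can recover both text and ids. A minimal sketch of reading such a record back; the file name and field values below are illustrative, not taken from the repo:

import jsonlines

with jsonlines.open("decode_result.jsonl") as reader:  # illustrative path
    for record in reader:
        # e.g. {"utt": "...", "refs": ["..."], "hyps": ["..."], "hyps_tokenid": [[12, 7, 305]]}
        ref, hyp = record["refs"][0], record["hyps"][0]
        token_ids = record["hyps_tokenid"][0]  # the ids that produced hyp
        print(record["utt"], ref, hyp, token_ids)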
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 553ffcb5334ab146b2e3c4d7681c09095095faae..ae1feb78a617cbe7818f9be92b39114a8c804224 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -32,7 +32,7 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
 logger = Log(__name__).getlog()


-def tokenids(text, keep_transcription_text):
+def _tokenids(text, keep_transcription_text):
     # for training text is token ids
     tokens = text  # token ids
@@ -93,6 +93,8 @@ class SpeechCollatorBase():
             a user-defined shape) within one batch.
         """
         self.keep_transcription_text = keep_transcription_text
+        self.train_mode = not keep_transcription_text
+
         self.stride_ms = stride_ms
         self.window_ms = window_ms
         self.feat_dim = feat_dim
@@ -192,6 +194,7 @@ class SpeechCollatorBase():
         texts = []
         text_lens = []
         utts = []
+        tids = []  # tokenids

         for idx, item in enumerate(batch):
             utts.append(item['utt'])
@@ -203,7 +206,7 @@ class SpeechCollatorBase():
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])

-            tokens = tokenids(text, self.keep_transcription_text)
+            tokens = _tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
index 310f5f581826460c947580a440e1aec5cc2a146a..d8ef9ba6be7e57d9f7b3185cffcb7a2634ce955d 100644
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@@ -142,6 +142,15 @@ class BatchDataLoader():
             collate_fn=batch_collate,
             num_workers=self.n_iter_processes, )

+    def __len__(self):
+        return len(self.dataloader)
+
+    def __iter__(self):
+        return self.dataloader.__iter__()
+
+    def __call__(self):
+        return self.__iter__()
+
     def __repr__(self):
         echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
         echo += f"train_mode: {self.train_mode}, "
@@ -159,12 +168,3 @@ class BatchDataLoader():
         echo += f"num_workers: {self.n_iter_processes}, "
         echo += f"file: {self.json_file}"
         return echo
-
-    def __len__(self):
-        return len(self.dataloader)
-
-    def __iter__(self):
-        return self.dataloader.__iter__()
-
-    def __call__(self):
-        return self.__iter__()
diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py
index e6cd7b5c8fce9ac75330e8ab0e5ad8cb64c69fb9..fd63fa9c5300fe14ea92996dc11ca8f247e2dfd2 100644
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@@ -809,7 +809,8 @@ class U2BaseModel(nn.Layer):
             raise ValueError(f"Not support decoding method: {decoding_method}")

         res = [text_feature.defeaturize(hyp) for hyp in hyps]
-        return res
+        res_tokenids = [hyp for hyp in hyps]
+        return res, res_tokenids


 class U2Model(U2BaseModel):
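BatchDataLoader keeps the container protocol and additionally makes the object callable: __len__ and __iter__ delegate to the wrapped DataLoader, and __call__ aliases __iter__, so code expecting a plain iterable and code expecting a zero-argument loader factory both work. A self-contained sketch of that delegation pattern, with a plain list standing in for the real DataLoader:

class LoaderWrapper:
    def __init__(self, dataloader):
        self.dataloader = dataloader

    def __len__(self):
        # batches per epoch, delegated to the inner loader
        return len(self.dataloader)

    def __iter__(self):
        # iteration is delegated as well
        return self.dataloader.__iter__()

    def __call__(self):
        # calling the loader behaves like iterating it
        return self.__iter__()

loader = LoaderWrapper([("audio0", "text0"), ("audio1", "text1")])
assert len(loader) == 2
assert list(loader) == list(loader())  # __iter__ and __call__ agree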
diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/s1/conf/augmentation.json
index 40a5b7900b74a307ffc525b65ca1809d28493d91..31c481c8d103b9c18be6cdde0a3e228b07761a6c 100644
--- a/examples/librispeech/s1/conf/augmentation.json
+++ b/examples/librispeech/s1/conf/augmentation.json
@@ -1,12 +1,4 @@
 [
-    {
-        "type": "shift",
-        "params": {
-            "min_shift_ms": -5,
-            "max_shift_ms": 5
-        },
-        "prob": 1.0
-    },
     {
         "type": "speed",
         "params": {
@@ -16,6 +8,14 @@
         },
         "prob": 0.0
     },
+    {
+        "type": "shift",
+        "params": {
+            "min_shift_ms": -5,
+            "max_shift_ms": 5
+        },
+        "prob": 1.0
+    },
     {
         "type": "specaug",
         "params": {
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index e4022f014a40888afdbe4d2f84c88cc4b9c198dc..34c65c115adb9055cfbabcdf31b2668271e2ee1d 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -1,41 +1,9 @@
 # LibriSpeech

-## Data
-| Data Subset | Duration in Seconds |
-| data/manifest.train | 0.83s ~ 29.735s |
-| data/manifest.dev | 1.065 ~ 35.155s |
-| data/manifest.test-clean | 1.285s ~ 34.955s |
-
-## Conformer
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | - | - |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | | |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | | |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | | |
-
-### Test w/o length filter
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | | |
-
-
-## Chunk Conformer
-
-| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | | |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | | |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | | - |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | | - |
-
-
 ## Transformer
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | | |
-
-### Test w/o length filter
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| Model | Params | Config | Augmentation | Test Set | Decode Method | Loss | WER % |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.395054340362549 | 4.2 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.395054340362549 | 5.0 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.395054340362549 | |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.395054340362549 | |
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml
index b86224ff46d554f34a7b8c4d5c1b9792e3212cb2..c9eed4f956a91b33c30f7c3b415dc508e1cbaf57 100644
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
@@ -5,9 +5,9 @@ data:
   test_manifest: data/manifest.test-clean

 collator:
-  vocab_filepath: data/train_960_unigram5000_units.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/train_960_unigram5000'
+  vocab_filepath: data/bpe_unigram_5000_units.txt
+  unit_type: spm
+  spm_model_prefix: data/bpe_unigram_5000
   feat_dim: 83
   stride_ms: 10.0
   window_ms: 25.0
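Each augmentor entry in augmentation.json is applied independently per utterance with probability "prob", so "speed" (prob 0.0) is effectively disabled while "shift" (prob 1.0) effectively always fires; the reordering above only changes the order in which firing transforms compose. A rough sketch of such a prob-gated pipeline; apply_augmentor is a hypothetical stand-in for the real augmentor classes:

import json
import random

def apply_augmentor(samples, kind, **params):
    # hypothetical stand-in: the real augmentors transform the waveform/features
    return samples

def augment(samples, config_path="augmentation.json", rng=random.Random(0)):
    with open(config_path) as f:
        pipeline = json.load(f)
    for entry in pipeline:
        # prob 0.0 never fires; prob 1.0 (effectively) always does
        if rng.uniform(0.0, 1.0) < entry["prob"]:
            samples = apply_augmentor(samples, entry["type"], **entry["params"])
    return samples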
diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh
index 5eeb2d6126e801801b4b8be13fa66d0781859d9a..6717415267f0831d0cd7257c3ed7df30817d0a9f 100755
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
@@ -46,15 +46,17 @@ pids=() # initialize pids

 for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
 (
+    echo "${dmethd} decoding"
     for rtask in ${recog_set}; do
     (
-        decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
+        echo "${rtask} dataset"
+        decode_dir=decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
         feat_recog_dir=${datadir}
         mkdir -p ${expdir}/${decode_dir}
         mkdir -p ${feat_recog_dir}

         # split data
-        split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
+        split_json.sh manifest.${rtask} ${nj}

         #### use CPU for decoding
         ngpu=0
@@ -74,17 +76,16 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
             --opts decoding.batch_size ${batch_size} \
             --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}

-        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
+        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${expdir}/${decode_dir} ${dict}
     ) &
     pids+=($!) # store background pids
+    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
+    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
     done
-) &
-pids+=($!) # store background pids
+)
 done
-i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
-[ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
 echo "Finished"

 exit 0
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh
index 46b6ac1b406bdb344ae266365524d4ffaa95af41..8a21938165a7833e87eb00e7764a9efe508cd6dd 100755
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@@ -32,7 +32,7 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
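The reworked test.sh no longer backgrounds the whole per-method block: decode methods now run one after another, and the script waits on the per-dataset background jobs (counting failures) inside the loop instead of once at the very end. The same wait-and-count-failures pattern, sketched in Python with subprocess and a placeholder command standing in for the real decode job:

import subprocess

recog_set = ["test-clean"]
for dmethd in ["attention", "ctc_greedy_search", "ctc_prefix_beam_search", "attention_rescoring"]:
    procs = [subprocess.Popen(["echo", f"{dmethd} on {rtask}"])  # placeholder decode job
             for rtask in recog_set]
    failures = sum(proc.wait() != 0 for proc in procs)  # mirrors: wait ${pid} || ((++i))
    if failures > 0:
        raise SystemExit(f"{failures} background jobs are failed.")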
diff --git a/tools/Makefile b/tools/Makefile
index b8b002930f9bbdedef0256078563facbb454ead9..5690ea91e74f02caa25e41b04b619fd7bf564786 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -6,7 +6,7 @@ CC ?= gcc  # used for sph2pipe
 # CXX = clang++  # Uncomment these lines...
 # CC = clang     # ...to build with Clang.

-WGET ?= wget
+WGET ?= wget --no-check-certificate

 .PHONY: all clean
diff --git a/utils/json2trn.py b/utils/json2trn.py
new file mode 100755
index 0000000000000000000000000000000000000000..873fde4f7c1fed6e100aaaf2d79030c4e0a441ad
--- /dev/null
+++ b/utils/json2trn.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#           2018 Xuankai Chang (Shanghai Jiao Tong University)
+# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+import argparse
+import json
+import logging
+import sys
+
+import jsonlines
+from utility import get_commandline_args
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="convert a json to a transcription file with a token dictionary",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument("json", type=str, help="jsonlines files")
+    parser.add_argument("dict", type=str, help="dict, not used.")
+    parser.add_argument(
+        "--num-spkrs", type=int, default=1, help="number of speakers")
+    parser.add_argument(
+        "--refs", type=str, nargs="+", help="ref for all speakers")
+    parser.add_argument(
+        "--hyps", type=str, nargs="+", help="hyp for all outputs")
+    return parser
+
+
+def main(args):
+    args = get_parser().parse_args(args)
+    convert(args.json, args.dict, args.refs, args.hyps, args.num_spkrs)
+
+
+def convert(jsonf, dic, refs, hyps, num_spkrs=1):
+    n_ref = len(refs)
+    n_hyp = len(hyps)
+    assert n_ref == n_hyp
+    assert n_ref == num_spkrs
+
+    # logging info
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    logging.basicConfig(level=logging.INFO, format=logfmt)
+    logging.info(get_commandline_args())
+
+    logging.info("reading %s", jsonf)
+    with jsonlines.open(jsonf, "r") as f:
+        j = [item for item in f]
+
+    logging.info("reading %s", dic)
+    with open(dic, "r") as f:
+        dictionary = f.readlines()
+    char_list = [entry.split(" ")[0] for entry in dictionary]
+    char_list.insert(0, "<blank>")
+    char_list.append("<eos>")
+
+    for ns in range(num_spkrs):
+        hyp_file = open(hyps[ns], "w")
+        ref_file = open(refs[ns], "w")
+
+        for x in j:
+            # recognition hypothesis
+            if num_spkrs == 1:
+                #seq = [char_list[int(i)] for i in x['hyps_tokenid'][0]]
+                seq = x['hyps'][0]
+            else:
+                seq = [char_list[int(i)] for i in x['hyps_tokenid'][ns]]
+            # In the recognition hypothesis,
+            # the <eos> symbol is usually attached in the last part of the sentence
+            # and it is removed below.
+            #hyp_file.write(" ".join(seq).replace("<eos>", ""))
+            hyp_file.write(seq.replace("<eos>", ""))
+            # spk-uttid
+            hyp_file.write(" (" + x["utt"] + ")\n")
+
+            # reference
+            if num_spkrs == 1:
+                seq = x["refs"][0]
+            else:
+                seq = x['refs'][ns]
+            # Unlike the recognition hypothesis,
+            # the reference is directly generated from a token without dictionary
+            # to avoid to include <eos> symbols in the reference to make scoring normal.
+            # The detailed discussion can be found at
+            # https://github.com/espnet/espnet/issues/993
+            # ref_file.write(
+            #     seq + " (" + j["utts"][x]["utt2spk"].replace("-", "_") + "-" + x + ")\n"
+            # )
+            ref_file.write(seq + " (" + x['utt'] + ")\n")
+
+        hyp_file.close()
+        ref_file.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
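json2trn.py turns the jsonlines decode results into sclite-style .trn files, one "text (uttid)" line per utterance; with --num-spkrs 1 the dictionary file is read but not used for the hypothesis. A small end-to-end sketch with made-up data, assuming utils/ is on PYTHONPATH so the script and its utility module are importable:

import jsonlines
from json2trn import convert  # assumes utils/ is on PYTHONPATH

with jsonlines.open("result.json", "w") as writer:
    writer.write({"utt": "spk1-utt1", "refs": ["HELLO WORLD"],
                  "hyps": ["HELLO WORD"], "hyps_tokenid": [[5, 9]]})
with open("dict.txt", "w") as f:
    f.write("HELLO 1\nWORLD 2\nWORD 3\n")

convert("result.json", "dict.txt", refs=["ref.trn"], hyps=["hyp.trn"], num_spkrs=1)
# ref.trn now holds "HELLO WORLD (spk1-utt1)" and hyp.trn "HELLO WORD (spk1-utt1)"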
diff --git a/utils/score_sclite.sh b/utils/score_sclite.sh
index 7ded76eba53089f171270471554e8ccd51ee0a83..99214b7d3565ff462b03e366e7406b150036117e 100755
--- a/utils/score_sclite.sh
+++ b/utils/score_sclite.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
+set -e
+
 # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
 # Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

diff --git a/utils/utility.py b/utils/utility.py
index a6b81d735d62c359233cea123b72ea9cfed7e9cc..b4db518a414de78fcc1c95dc0b9ab63d0a3c1733 100755
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -14,6 +14,7 @@
 import hashlib
 import json
 import os
+import sys
 import tarfile
 import zipfile
 from typing import Text
@@ -21,7 +22,7 @@ from typing import Text
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
     "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
-    "read_manifest"
+    "read_manifest", "get_commandline_args"
 ]

@@ -46,6 +47,40 @@ def read_manifest(manifest_path):
     return manifest


+def get_commandline_args():
+    extra_chars = [
+        " ",
+        ";",
+        "&",
+        "(",
+        ")",
+        "|",
+        "^",
+        "<",
+        ">",
+        "?",
+        "*",
+        "[",
+        "]",
+        "$",
+        "`",
+        '"',
+        "\\",
+        "!",
+        "{",
+        "}",
+    ]
+
+    # Escape the extra characters for shell
+    argv = [
+        arg.replace("'", "'\\''") if all(char not in arg
+                                         for char in extra_chars) else
+        "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
+    ]
+
+    return sys.executable + " " + " ".join(argv)
+
+
 def print_arguments(args, info=None):
     """Print argparse's arguments.