diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index f4fccbe6e034064935f1d41abd9aebdc0cc7d7ac..d0a63dca6b05b1dda4ea7ccae457f13453efd2dc 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 2b9f69ae46c35fbaba51ba7e9629147e053ffdc5..8124d1bb49804a4b3f5d6cdc3b1f370540e650f8 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index 634bb8d0eb24c660fb17ba80fa78082192b33f03..65e6e5fcef72e46df5ab542dcd8599ddc8f4b53b 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index e85bbb3aa44d0bd775698ac0c79ce5b690f83a00..0d535e131649dcbb7e62d143586824c7a4f2d29c 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, 
text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() + n_token = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, - 'feat_shape': (duration, ), #second - 'text': - text + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, })) total_sec += duration - total_text += len(text) + total_text += n_token total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index fd2b0c0138dceb8352344b3b7beb9b19a2f94c40..78a4ffc4c6e8bc365f8a82035f70113bc8d616ee 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 56fec8463c9c27eda2853ac83d5b8d030942d8cf..b15ddce58be8fdac42fd8cb1752a38dcbab12240 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -88,7 +88,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path 
"data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 0bf35e1f582c424c4634d395d42867b7775673ad..85574260b023f8559c16869666a34b7f42599679 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index f0bde77fe4ec4d0abb25634a28bb4bdfae91766f..8e378ff053ba78c970a50ca5d938975f3464f50f 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 6f9bc5566cbdb1d482fc00988e87f1a15c2b3647..7387472d53cd0cb6ed6a73eb9eb8e5d3ba6685aa 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 
b080a5b497e703c6b2f1c2d385315f771b33a1a0..23e5a9c70b83d783f693341f5ac18ee4bce58e0e 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -89,7 +89,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index ad4ddde3fc9fa5696f0818497126f01da9e36431..66be39e23f50d7ee37aedea27f308946fcc0f75f 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 711ebee406b0b101b07bd38453257128995f25bf..bcf9e6d10ae0310b51e3ad82569795e2657e435b 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -63,7 +63,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index c518666977faef8c0862be3e7c7f4d5b5244a5fc..6bed27f5c9caba478f127064d2fcce102eccf1f7 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 
4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 29c30b262048b46bf08d132aebbb24bd7186bf71..7aed1b1933ca1edcf34e6c45a49dbc68eed91527 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 8487da771930e6f615ac9fe0e718bab310f66970..2c09b3ae6954cff537a7f1c934b4193e56f3243f 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/preprocess.yaml b/examples/tiny/s1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9de0d8c71a7699e555eec6d94608324b9abf99a1 --- /dev/null +++ b/examples/tiny/s1/conf/preprocess.yaml @@ -0,0 +1,27 @@ +process: + # extract kaldi fbank from PCM + - type: "fbank_kaldi" + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + # these three processes are a.k.a. 
SpecAugment + - type: "time_warp" + max_time_warp: 5 + inplace: true + mode: "PIL" + - type: "freq_mask" + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: "time_mask" + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index cc9b5c5158adf2ca74ccf715e6edaf61cb320953..87f9c24360899c90a3ba504fbce7181459b03c6f 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index b25f993f6107d222ec7feea63967e5bdea1b291a..3d7f19abaaec81f76cd74e030e5180108816723f 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -69,7 +69,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 67441fae55068fd2a2d5288de82d9e2aad40b47c..6b23a98526da5fc7f7874ad277d167777c1d6377 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -27,7 +27,9 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from
paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -247,92 +249,103 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - - collate_fn_train = SpeechCollator.from_config(config) - - config.collator.augmentation_config = "" - collate_fn_dev = SpeechCollator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.data.train_manifest, + train_mode=True, + sortagrad=False, batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.data.dev_manifest, + train_mode=False, + sortagrad=False, batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. 
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') - - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") def setup_model(self): config = self.config model_conf = config.model with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) @@ -341,6 +354,11 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return train_config = config.training optim_type = train_config.optim @@ -381,10 +399,9 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler - logger.info("Setup model/optimizer/lr_scheduler!") + logger.info("Setup optimizer/lr_scheduler!") class U2Tester(U2Trainer): @@ -419,14 +436,19 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) + self.text_feature = 
TextFeaturizer( + unit_type=self.config.collator.unit_type, + vocab_filepath=self.config.collator.vocab_filepath, + spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def compute_metrics(self, @@ -442,12 +464,11 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - target_transcripts = self.ordid2token(texts, texts_len) + target_transcripts = self.id2token(texts, texts_len, self.text_feature) result_transcripts, result_tokenids = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -497,7 +518,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -556,8 +577,8 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align( self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + self.config.collator.stride_ms, + self.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec. 
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index e37784aa1b1b8fb797cf1a50d1fa2464d065284b..357a39b91d4808b184055f045e5ad856d90061bc 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -392,6 +392,7 @@ class U2Tester(U2Trainer): unit_type=self.config.collator.unit_type, vocab_filepath=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -529,8 +530,8 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align( self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, self.args.result_file) + self.config.collator.stride_ms, + self.vocab_list, self.args.result_file) def load_inferspec(self): """infer model and input spec. diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 7c96cb432890ad483adf0911409e7cb6b00d881d..7007518da8a5cb2750744edb971cb121a17d54c4 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -207,34 +207,16 @@ class AudioDataset(Dataset): if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: - assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', - '.scp') - data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) + path_suffix = '.' + data[0]['feat'].split(':')[0].rpartition('.')[-1] + assert path_suffix not in ('.ark', '.scp') + # m second to n frame: scale feat_shape[0] in place, keep the dicts + for x in data: + x['feat_shape'][0] = ( + float(x['feat_shape'][0]) * 1000 / stride_ms) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] - # with open(data_file, 'r') as f: - # for line in f: - # arr = line.strip().split('\t') - # if len(arr) != 7: - # continue - # key = arr[0].split(':')[1] - # tokenid =
arr[5].split(':')[1] - # output_dim = int(arr[6].split(':')[1].split(',')[1]) - # if raw_wav: - # wav_path = ':'.join(arr[1].split(':')[1:]) - # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) - # data.append((key, wav_path, duration, tokenid)) - # else: - # feat_ark = ':'.join(arr[1].split(':')[1:]) - # feat_info = arr[2].split(':')[1].split(',') - # feat_dim = int(feat_info[1].strip()) - # num_frames = int(feat_info[0].strip()) - # data.append((key, feat_ark, num_frames, tokenid)) - # self.input_dim = feat_dim - # self.output_dim = output_dim - valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] @@ -242,17 +224,17 @@ class AudioDataset(Dataset): # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: - # logging.warn('ignore utterance {} feature {}'.format( - # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") self.minibatch = [] - num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) @@ -277,7 +259,9 @@ class AudioDataset(Dataset): cur = end def __len__(self): + """number of example(batch)""" return len(self.minibatch) def __getitem__(self, idx): + """batch example of idx""" return self.minibatch[idx] diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index e810662df377af2db05e128c1cbda9d2042afa89..38ff1396389f55c5dfcd8f42656483e5981a3e54 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -18,8 +18,10 @@ import kaldiio import numpy as np import soundfile -from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation +from .utility import feat_type +from 
paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation __all__ = ["LoadInputsAndTargets"] @@ -322,20 +324,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[-1].lower() - if suffix == 'ark': - return 'mat' - elif suffix == 'scp': - return 'scp' - elif suffix == 'npy': - return 'npy' - elif suffix == 'npz': - return 'npz' - elif suffix in ['wav', 'flac']: - # PCM16 - return 'sound' - else: - raise ValueError(f"Not support filetype: {suffix}") + return feat_type(filepath) class SoundHDF5File(): diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index 392031ba81c5b6e641538891b81b36767805d2ff..1a90e3d0461ba9bb278b1c4be358449c2b2c6191 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -17,7 +17,7 @@ import numpy as np from paddlespeech.s2t.utils.log import Log -__all__ = ["pad_list", "pad_sequence"] +__all__ = ["pad_list", "pad_sequence", "feat_type"] logger = Log(__name__).getlog() @@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray], out_tensor[:length, i, ...] 
= tensor return out_tensor + + +def feat_type(filepath): + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index df3130dadcd68e2416cc860cd41bb0e6c3d2c651..6956b908ab859f5638a8d355bc305f0db029cca1 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +from python_speech_features import logfbank def stft(x, @@ -304,3 +305,85 @@ win_length=self.win_length, window=self.window, center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_fft=512, # fft point + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + window="povey", + fmin=20, + fmax=None, + eps=1e-10, + dither=False): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + if n_shift > win_length: + raise ValueError("Stride size must not be greater than " + "window size.") + self.n_shift = n_shift / fs # unit: ms + self.win_length = win_length / fs # unit: ms + + self.window = window + self.fmin = fmin + if fmax is None or fmax <= int(self.fs / 2): + fmax_ = fmax if fmax else self.fs / 2 + elif fmax > int(self.fs / 2): + raise ValueError("fmax must not be greater than half of " + "sample rate.") + self.fmax = fmax_ + + self.eps = eps + self.remove_dc_offset = True + self.preemph = 0.97 + self.dither = dither + + def __repr__(self): + return ("{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}))".format(
name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, )) + + def __call__(self, x): + """ + + Args: + x (np.ndarray): shape (Ti,) + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + if x.dtype == np.int16: + x = x / 2**(16 - 1) + return logfbank( + signal=x, + samplerate=self.fs, + winlen=self.win_length, # unit ms + winstep=self.n_shift, # unit ms + nfilt=self.n_mels, + nfft=self.n_fft, + lowfreq=self.fmin, + highfreq=self.fmax, + dither=self.dither, + remove_dc_offset=self.remove_dc_offset, + preemph=self.preemph, + wintype=self.window) diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 1aee4b36f79479d5aa60d4d2148bb3431697bcde..492d35df49b37469431018f2384f646bcfb388f0 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -45,6 +45,7 @@ import_alias = dict( stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", + fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", ) diff --git a/utils/format_data.py b/utils/format_data.py index 6fe36997a6513121e2878a00306e9d09018af47c..49dcbee8c72107491269ad3093f5bed8a7d74cf9 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from 
paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -62,24 +62,64 @@ def main(): vocab_size = text_feature.vocab_size print(f"Vocab size: {vocab_size}") + # jsonline like this + # { + # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}], + # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}], + # "utt2spk": "111-2222", + # "utt": "111-2222-333" + # } count = 0 for manifest_path in args.manifest_paths: manifest_jsons = read_manifest(manifest_path) for line_json in manifest_jsons: + output_json = { + "input": [], + "output": [], + 'utt': line_json['utt'], + 'utt2spk': line_json.get('utt2spk', 'global'), + } + + # output line = line_json['text'] - tokens = text_feature.tokenize(line) - tokenids = text_feature.featurize(line) - line_json['token'] = tokens - line_json['token_id'] = tokenids - line_json['token_shape'] = (len(tokenids), vocab_size) - feat_shape = line_json['feat_shape'] - assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': - feat_shape.append(feat_dim) - line_json['filetype'] = 'sound' - else: # kaldi - raise NotImplementedError('no support kaldi feat now!') - fout.write(json.dumps(line_json) + '\n') + if isinstance(line, str): + # only one target + tokens = text_feature.tokenize(line) + tokenids = text_feature.featurize(line) + output_json['output'].append({ + 'name': 'target1', + 'shape': (len(tokenids), vocab_size), + 'text': line, + 'token': ' '.join(tokens), + 'tokenid': ' '.join(map(str, tokenids)), + }) + else: + # isinstance(line, list), multi target + raise NotImplementedError("not support multi output now!") + + # input + line = line_json['feat'] +
if isinstance(line, str): + # only one input + feat_shape = line_json['feat_shape'] + assert isinstance(feat_shape, (list, tuple)), type(feat_shape) + filetype = feat_type(line) + if filetype == 'sound': + feat_shape.append(feat_dim) + else: # kaldi + raise NotImplementedError('no support kaldi feat now!') + + output_json['input'].append({ + "name": "input1", + "shape": feat_shape, + "feat": line, + "filetype": filetype, + }) + else: + # isinstance(line, list), multi input + raise NotImplementedError("not support multi input now!") + + fout.write(json.dumps(output_json) + '\n') count += 1 print(f"Examples number: {count}") diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index 79b3d2cb2dd8b4cb2edea6f055b109d35dc0cae7..e0b5ece37353dc9cc592d440ce11ba486569bfaf 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. 
raw(wav, flac), kaldi") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -79,9 +79,11 @@ def main(): line_json['token1'] = tokens line_json['token_id1'] = tokenids line_json['token_shape1'] = (len(tokenids), vocab_size) + feat_shape = line_json['feat_shape'] assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': + filetype = feat_type(line_json['feat']) + if filetype == 'sound': feat_shape.append(feat_dim) else: # kaldi raise NotImplementedError('no support kaldi feat now!')