diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py
index 95ed040860a452a0fcdbcf4321e6c4c94c110a4c..7431fc08369546f372c93dc923f50300f1da10a3 100644
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@@ -82,7 +82,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             # if no transcription for audio then skipped
             if audio_id not in transcript_dict:
                 continue
-
+            utt2spk = Path(audio_path).parent.name
             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py
index a8cbb83793710d9971ff320d6968b743a13d5df1..9a3ba3b31c2f7a9b9e050ceebaa8da9ace0ccb89 100644
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -73,7 +73,6 @@ def create_manifest(data_dir, manifest_path_prefix):
             audio_data, samplerate = soundfile.read(audio_path)
             duration = float(len(audio_data) / samplerate)
-
             translation_str = " ".join(translation.split())
             trancription_str = " ".join(trancription.split())
             json_lines.append(
@@ -82,7 +81,7 @@
                         'utt': utt,
                         'feat': audio_path,
                         'feat_shape': (duration, ),  # second
-                        'text': [translation_str, trancription_str],
+                        'text': [translation_str, trancription_str],
                     },
                     ensure_ascii=False))
diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py
index 2ec4ddab29b1cca3a586269eabee4d78d4d9220e..cdfc0a75c0aacfdf89492d2f83642cb7f5decea8 100644
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@@ -124,7 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             json.dumps(
                 {
                     'utt': audio_id,
-                    'utt2spk', spk,
+                    'utt2spk': spk,
                     'feat': audio_path,
                     'feat_shape': (duration, ),  # second
                     'text': word_text,  # charactor
diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py
index 26aa76c72d65d29e11501ef6ca1003191e3a41d0..473fc856f4f78e6ed1f2d145a599d41226e212f9 100644
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@@ -22,9 +22,9 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path
 
 import soundfile
-from pathlib import Path
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
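Note on the four dataset-script hunks above: they all feed the same JSON-lines manifest format, and the thchs30 change is more than style, since `'utt2spk', spk` inside a dict literal is a syntax error; the colon is what makes the script runnable at all. A minimal sketch of the shared manifest-entry pattern, assuming `soundfile` is installed and taking the speaker id from the audio file's parent directory as the aishell hunk does (the function name and arguments are illustrative, not these scripts' actual API):

    import json
    from pathlib import Path

    import soundfile


    def manifest_entry(audio_path, text):
        # one manifest line: utterance id, speaker, feature path/shape, transcript
        audio_data, samplerate = soundfile.read(audio_path)
        duration = float(len(audio_data) / samplerate)
        return json.dumps(
            {
                'utt': Path(audio_path).stem,
                'utt2spk': Path(audio_path).parent.name,  # speaker = parent dir
                'feat': audio_path,
                'feat_shape': (duration, ),  # second
                'text': text,
            },
            ensure_ascii=False)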
diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md
index 20255db8e9be1fff1361eda8670947d853c2382b..73f0863ed38746d11eb50a8d86ae46c651926a4b 100644
--- a/examples/librispeech/asr1/README.md
+++ b/examples/librispeech/asr1/README.md
@@ -24,4 +24,4 @@
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 |
\ No newline at end of file
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 |
diff --git a/examples/timit/README.md b/examples/timit/README.md
index 778398748d6acb5dfbcb9a63ebd32c99a50a8b02..51fcfd57c930850c5fbb239436ac0eab5afa47eb 100644
--- a/examples/timit/README.md
+++ b/examples/timit/README.md
@@ -4,4 +4,4 @@ asr model with phone unit
 
 * asr0 - deepspeech2 Streaming/Non-Streaming
 * asr1 - transformer/conformer Streaming/Non-Streaming
-* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
\ No newline at end of file
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md
index 0cb0f354c742912d317db40bfbe47c65f27a4fed..cbd01eb8c443494e0fcb07cc7b5834cd8d1648b2 100644
--- a/examples/wenetspeech/README.md
+++ b/examples/wenetspeech/README.md
@@ -55,4 +55,4 @@ As shown in the following table, we provide 3 training subsets, namely `S`, `M`
 |-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
 | DEV             | 20    | Internet     | Specially designed for some speech tools which require cross-validation set in training  |
 | TEST\_NET       | 23    | Internet     | Match test                                                                               |
-| TEST\_MEETING   | 15    | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset     |
\ No newline at end of file
+| TEST\_MEETING   | 15    | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset     |
diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md
index 5aff041f80931ae13e0f275147f55d91f1759d6e..5c2b8143ca31b2c6f12a8db53db4abc38a0af748 100644
--- a/examples/wenetspeech/asr1/RESULTS.md
+++ b/examples/wenetspeech/asr1/RESULTS.md
@@ -21,4 +21,4 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen
 | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
 | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
 | conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
-| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
\ No newline at end of file
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
index 4de0b7d45631af71ea428f4817b0db0488722daf..0e1b2727838052740e5e89593dcdab04ffe387c9 100644
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -1,6 +1,18 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
 #                Mobvoi Inc(Author: Di Wu, Binbin Zhang)
-
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,11 +24,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import sys
-import os
 import argparse
 import json
+import os
+import sys
 
 
 def get_args():
@@ -85,13 +96,13 @@
                     else:
                         utt2text.write(f'{sid}\t{text}\n')
                         segments.write(
-                            f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
-                        )
+                            f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
                         utt2dur.write(f'{sid}\t{dur}\n')
                         segment_sub_names = " ".join(segment_subsets)
                         utt2subsets.write(
                             f'{sid}\t{segment_sub_names}\n')
+
 
 
 def main():
     args = get_args()
@@ -99,4 +110,4 @@
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
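For orientation on extract_meta.py above: aside from the license header and import ordering, the reflowed `segments.write(...)` call is behavior-preserving, and the visible writes emit Kaldi-style, tab-separated index files. A sketch of the record layouts those writes produce, with made-up ids and times purely for illustration (not real WenetSpeech data):

    # utt2text:    <segment-id>\t<text>
    # segments:    <segment-id>\t<recording-id>\t<start-time>\t<end-time>
    # utt2dur:     <segment-id>\t<duration>
    # utt2subsets: <segment-id>\t<subset> [<subset> ...]
    sid, aid = 'A0001_S00001', 'A0001'  # illustrative ids
    start_time, end_time = 0.0, 6.06
    with open('segments', 'a', encoding='utf8') as segments:
        segments.write(f'{sid}\t{aid}\t{start_time}\t{end_time}\n')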
diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py
index 603e0082cc80dbfd4f56cb3ad3cbeb24012a03fa..f1b9287edbdcda1270fcd5192a3fc1328d492bbe 100644
--- a/examples/wenetspeech/asr1/local/process_opus.py
+++ b/examples/wenetspeech/asr1/local/process_opus.py
@@ -1,5 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
-
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,14 +23,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 # process_opus.py: segmentation and downsampling of opus audio
-
 # usage: python3 process_opus.py wav.scp segments output_wav.scp
+import os
+import sys
 
 from pydub import AudioSegment
-import sys
-import os
 
 
 def read_file(wav_scp, segments):
@@ -86,4 +96,4 @@
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
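The header comments in process_opus.py above state its job: cut each long opus recording into segments and downsample them. A minimal sketch of that operation using pydub, which the script imports (this is not the script's actual code; pydub needs ffmpeg available to decode opus, and the 16 kHz target rate is an assumption):

    from pydub import AudioSegment


    def cut_and_downsample(opus_path, wav_path, start_s, end_s, rate=16000):
        # pydub decodes through ffmpeg; slicing an AudioSegment is in milliseconds
        audio = AudioSegment.from_file(opus_path, codec='opus')
        segment = audio[int(start_s * 1000):int(end_s * 1000)]
        segment.set_frame_rate(rate).export(wav_path, format='wav')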
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 177d710b066d4b05bb5e60ee0b040aa51f823613..e827414d3c67a5790a381f4877bf6a7618ff7d46 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
     @paddle.no_grad()
     def test(self):
         logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
-        if self.args.enable_auto_log == True:
+        if self.args.enable_auto_log is True:
             from paddlespeech.s2t.utils.log import Autolog
             self.autolog = Autolog(
                 batch_size=self.config.decoding.batch_size,
@@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
             msg += "Final error rate [%s] (%d/%d) = %f" % (
                 error_rate_type, num_ins, num_ins, errors_sum / len_refs)
             logger.info(msg)
-        if self.args.enable_auto_log == True:
+        if self.args.enable_auto_log is True:
             self.autolog.report()
 
     def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
@@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
         x_len_list = np.split(x_len_batch, batch_size, axis=0)
 
         for x, x_len in zip(x_list, x_len_list):
-            if self.args.enable_auto_log == True:
+            if self.args.enable_auto_log is True:
                 self.autolog.times.start()
             x_len = x_len[0]
             assert (chunk_size <= x_len)
@@ -547,7 +547,7 @@
 
         probs_chunk_list = []
         probs_chunk_lens_list = []
-        if self.args.enable_auto_log == True:
+        if self.args.enable_auto_log is True:
            # record the model preprocessing time
            self.autolog.times.stamp()
@@ -606,7 +606,7 @@
                     [output_probs, output_probs_padding], axis=1)
             output_probs_list.append(output_probs)
             output_lens_list.append(output_lens)
-            if self.args.enable_auto_log == True:
+            if self.args.enable_auto_log is True:
                 # record the model inference time
                 self.autolog.times.stamp()
                 # record the post processing time
@@ -641,12 +641,12 @@
         audio_len_handle.reshape(x_len.shape)
         audio_len_handle.copy_from_cpu(x_len)
 
-        if self.args.enable_auto_log == True:
+        if self.args.enable_auto_log is True:
            self.autolog.times.start()
            # record the prefix processing time
            self.autolog.times.stamp()
        self.predictor.run()
-        if self.args.enable_auto_log == True:
+        if self.args.enable_auto_log is True:
            # record the model inference time
            self.autolog.times.stamp()
            # record the post processing time
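A note on the recurring change above: replacing `== True` with `is True` silences the flake8 E712 warning about equality comparisons against True. When `enable_auto_log` is a genuine bool, as an argparse flag normally is, the plainer PEP 8 idiom is the bare truth test; a one-line equivalent under that assumption:

    if self.args.enable_auto_log:  # same behavior when the flag is a real bool
        self.autolog.times.start()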
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9f5448ccf702f5cbf10db6db9b45a9a19ccdf3a9..27bc47d2baa537496bef0e3f1d1b18a23cf1d1f2 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -24,15 +24,10 @@ import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
-from paddle.io import DataLoader
 from yacs.config import CfgNode
 
 from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.dataloader import BatchDataLoader
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
 from paddlespeech.s2t.models.u2 import U2Model
 from paddlespeech.s2t.training.optimizer import OptimizerFactory
 from paddlespeech.s2t.training.reporter import ObsScope
@@ -215,7 +210,7 @@ class U2Trainer(Trainer):
                 msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
                 msg += f" {k.split(',')[1]}" if len(
-                    k.split(',')) == 2 else f""
+                    k.split(',')) == 2 else ""
                 msg += ","
             msg = msg[:-1]  # remove the last ","
             if (batch_index + 1