diff --git a/docs/source/tts/models_introduction.md b/docs/source/tts/models_introduction.md index edf6f312c260a946cae09f754ccef3541d618266..202fe491ad500c9804faf22ada1d109e16bab2cd 100644 --- a/docs/source/tts/models_introduction.md +++ b/docs/source/tts/models_introduction.md @@ -251,8 +251,10 @@ Vocoders based on neural networks usually is speech synthesis, which learns the - GAN - WaveGAN - **Parallel WaveGAN** - - MelGAN - - HiFi-GAN + - **MelGAN** + - **Style MelGAN** + - **Multi Band MelGAN** + - **HiFi GAN** - VAE - Wave-VAE - Diffusion diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index cb7d49f900050f0cde1bf0ecea1bac29d52722cf..488ede2ec132f43afbaf4d2ccb2c01b4c11db76c 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -203,7 +203,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip). +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) +- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 56afb939c2a36f998a70993f44f4d402f6538a3d..264d66f72b44f091737a003278518826e6987159 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -7,3 +7,6 @@ ## ASR `paddlespeech asr --input ./test_audio.wav` + + ## Multi-label Classification + `paddlespeech cls --input ./test_audio.wav` diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index b4b2e22d4bba7943cf2a7c8196546173130f0667..246d0f381279dd7263415c9e1a6c7ad76b8dbb53 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -14,4 +14,5 @@ from .asr import ASRExecutor from .base_commands import BaseCommand from .base_commands import HelpCommand +from .cls import CLSExecutor from .st import STExecutor diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index b40516e955a3a5f1b1ab36ca0be605bd6df8c586..1d235201d06080c2268033d78792ef4ccebd5152 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -39,7 +39,11 @@ from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] pretrained_models = { - "wenetspeech_zh_16k": { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". + # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "conformer_wenetspeech-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', 'md5': @@ -49,7 +53,7 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, - "transformer_zh_16k": { + "transformer_aishell-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz', 'md5': @@ -83,7 +87,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - default='wenetspeech', + default='conformer_wenetspeech', help='Choose model type of asr task.') self.parser.add_argument( '--lang', @@ -137,9 +141,13 @@ class ASRExecutor(BaseExecutor): """ Init model and other resources from a specific path. """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '_' + lang + '_' + sample_rate_str + tag = model_type + '-' + lang + '-' + sample_rate_str res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path self.cfg_path = os.path.join(res_path, @@ -161,7 +169,7 @@ class ASRExecutor(BaseExecutor): self.config.decoding.decoding_method = "attention_rescoring" with UpdateConfig(self.config): - if model_type == "ds2_online" or model_type == "ds2_offline": + if "ds2_online" in model_type or "ds2_offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator self.config.collator.vocab_filepath = os.path.join( res_path, self.config.collator.vocab_filepath) @@ -174,7 +182,7 @@ class ASRExecutor(BaseExecutor): spm_model_prefix=self.config.collator.spm_model_prefix) self.config.model.input_dim = self.collate_fn_test.feature_size self.config.model.output_dim = text_feature.vocab_size - elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech": + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: self.config.collator.vocab_filepath = os.path.join( res_path, self.config.collator.vocab_filepath) self.config.collator.augmentation_config = os.path.join( @@ -192,7 +200,9 @@ class ASRExecutor(BaseExecutor): raise Exception("wrong type") # Enter the path of model root - model_class = dynamic_import(model_type, model_alias) + model_name = ''.join( + model_type.split('_')[:-1]) # model_type: {model_name}_{dataset} + model_class = dynamic_import(model_name, model_alias) model_conf = self.config.model logger.info(model_conf) model = model_class.from_config(model_conf) @@ -213,7 +223,7 @@ class ASRExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) # Get the object for feature extraction - if model_type == "ds2_online" or model_type == "ds2_offline": + if "ds2_online" in model_type or "ds2_offline" in model_type: audio, _ = self.collate_fn_test.process_utterance( audio_file=audio_file, transcript=" ") audio_len = audio.shape[0] @@ -225,7 +235,7 @@ class ASRExecutor(BaseExecutor): self._inputs["audio_len"] = audio_len logger.info(f"audio feat shape: {audio.shape}") - elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech": + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: logger.info("get the preprocess conf") preprocess_conf_file = self.config.collator.augmentation_config # redirect the cmvn path @@ -289,7 +299,7 @@ class ASRExecutor(BaseExecutor): cfg = self.config.decoding audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] - if model_type == "ds2_online" or model_type == "ds2_offline": + if "ds2_online" in model_type or "ds2_offline" in model_type: result_transcripts = self.model.decode( audio, audio_len, @@ -304,7 +314,7 @@ class ASRExecutor(BaseExecutor): num_processes=cfg.num_proc_bsearch) self._outputs["result"] = result_transcripts[0] - elif model_type == "conformer" or model_type == "transformer" or model_type == "wenetspeech": + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: result_transcripts = self.model.decode( audio, audio_len, @@ -361,7 +371,7 @@ class ASRExecutor(BaseExecutor): audio, audio_sample_rate = soundfile.read( audio_file, dtype="int16", always_2d=True) except Exception as e: - logger.error(str(e)) + logger.exception(e) logger.error( "can not open the audio file, please check the audio file format is 'wav'. \n \ you can try to use sox to change the file format.\n \ @@ -421,7 +431,7 @@ class ASRExecutor(BaseExecutor): logger.info('ASR Result: {}'.format(res)) return True except Exception as e: - print(e) + logger.exception(e) return False def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file, diff --git a/paddlespeech/cli/cls/__init.__py b/paddlespeech/cli/cls/__init.__py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/paddlespeech/cli/cls/__init__.py b/paddlespeech/cli/cls/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..13e316f8f687b34743a92bd0723b944741b74516 --- /dev/null +++ b/paddlespeech/cli/cls/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import CLSExecutor diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..0b4982d157b8768160d47d4c4eb0e39533a9884d --- /dev/null +++ b/paddlespeech/cli/cls/infer.py @@ -0,0 +1,260 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from typing import List +from typing import Optional +from typing import Union + +import numpy as np +import paddle +import yaml + +from ..executor import BaseExecutor +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import logger +from ..utils import MODEL_HOME +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram +from paddlespeech.s2t.utils.dynamic_import import dynamic_import + +__all__ = ['CLSExecutor'] + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". + # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "panns_cnn6-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz', + 'md5': '4cf09194a95df024fd12f84712cf0f9c', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn6.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz', + 'md5': 'cb8427b22176cc2116367d14847f5413', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn10.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz', + 'md5': 'e3b9b5614a1595001161d0ab95edee97', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn14.pdparams', + 'label_file': 'audioset_labels.txt', + }, +} + +model_alias = { + "panns_cnn6": "paddlespeech.cls.models.panns:CNN6", + "panns_cnn10": "paddlespeech.cls.models.panns:CNN10", + "panns_cnn14": "paddlespeech.cls.models.panns:CNN14", +} + + +@cli_register( + name='paddlespeech.cls', description='Audio classification infer command.') +class CLSExecutor(BaseExecutor): + def __init__(self): + super(CLSExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.cls', add_help=True) + self.parser.add_argument( + '--input', type=str, required=True, help='Audio file to classify.') + self.parser.add_argument( + '--model', + type=str, + default='panns_cnn14', + help='Choose model type of cls task.') + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of cls task. Use deault config when it is None.') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') + self.parser.add_argument( + '--label_file', + type=str, + default=None, + help='Label file of cls task.') + self.parser.add_argument( + '--topk', + type=int, + default=1, + help='Return topk scores of classification result.') + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( + tag) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + model_type: str='panns_cnn14', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + + if label_file is None or ckpt_path is None: + tag = model_type + '-' + '32k' # panns_cnn14-32k + self.res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(self.res_path, + pretrained_models[tag]['cfg_path']) + self.label_file = os.path.join(self.res_path, + pretrained_models[tag]['label_file']) + self.ckpt_path = os.path.join(self.res_path, + pretrained_models[tag]['ckpt_path']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.label_file = os.path.abspath(label_file) + self.ckpt_path = os.path.abspath(ckpt_path) + + # config + with open(self.cfg_path, 'r') as f: + self._conf = yaml.safe_load(f) + + # labels + self._label_list = [] + with open(self.label_file, 'r') as f: + for line in f: + self._label_list.append(line.strip()) + + # model + model_class = dynamic_import(model_type, model_alias) + model_dict = paddle.load(self.ckpt_path) + self.model = model_class(extract_embedding=False) + self.model.set_state_dict(model_dict) + self.model.eval() + + def preprocess(self, audio_file: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). + """ + feat_conf = self._conf['feature'] + logger.info(feat_conf) + waveform, _ = load( + file=audio_file, + sr=feat_conf['sample_rate'], + mono=True, + dtype='float32') + logger.info("Preprocessing audio_file:" + audio_file) + + # Feature extraction + feature_extractor = LogMelSpectrogram( + sr=feat_conf['sample_rate'], + n_fft=feat_conf['n_fft'], + hop_length=feat_conf['hop_length'], + window=feat_conf['window'], + win_length=feat_conf['window_length'], + f_min=feat_conf['f_min'], + f_max=feat_conf['f_max'], + n_mels=feat_conf['n_mels'], ) + feats = feature_extractor( + paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0))) + self._inputs['feats'] = paddle.transpose(feats, [0, 2, 1]).unsqueeze( + 1) # [B, N, T] -> [B, 1, T, N] + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + self._outputs['logits'] = self.model(self._inputs['feats']) + + def _generate_topk_label(self, result: np.ndarray, topk: int) -> str: + assert topk <= len( + self._label_list), 'Value of topk is larger than number of labels.' + + topk_idx = (-result).argsort()[:topk] + ret = '' + for idx in topk_idx: + label, score = self._label_list[idx], result[idx] + ret += f'{label}: {score}\n' + return ret + + def postprocess(self, topk: int) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + return self._generate_topk_label( + result=self._outputs['logits'].squeeze(0).numpy(), topk=topk) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + + model_type = parser_args.model + label_file = parser_args.label_file + cfg_path = parser_args.config + ckpt_path = parser_args.ckpt_path + audio_file = parser_args.input + topk = parser_args.topk + device = parser_args.device + + try: + res = self(model_type, cfg_path, label_file, ckpt_path, audio_file, + topk, device) + logger.info('CLS Result:\n{}'.format(res)) + return True + except Exception as e: + logger.exception(e) + return False + + def __call__(self, model, config, ckpt_path, label_file, audio_file, topk, + device): + """ + Python API to call an executor. + """ + audio_file = os.path.abspath(audio_file) + # self._check(audio_file, sample_rate) + paddle.set_device(device) + self._init_from_path(model, config, ckpt_path, label_file) + self.preprocess(audio_file) + self.infer() + res = self.postprocess(topk) # Retrieve result of cls. + + return res diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 9f97d8731e913e0513a24a0de5c10838d151a521..534b9e3b9458f75fd89710ee20dd4b06feb38c4d 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -21,6 +21,7 @@ from typing import Union import kaldi_io import numpy as np import paddle +import soundfile from kaldiio import WriteHelper from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -36,19 +37,19 @@ from ..utils import MODEL_HOME __all__ = ["STExecutor"] pretrained_models = { - "fat_st_ted_en_zh": { + "fat_st_ted_en-zh": { "url": - "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_mtl.model.tar.gz", + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz", "md5": - "210b8eacc390d9965334fa8e96c49a13", + "fa0a7425b91b4f8d259c70b2aca5ae67", "cfg_path": "conf/transformer_mtl_noam.yaml", "ckpt_path": - "exp/transformer_mtl_noam/checkpoints/fat_st_ted_en_zh", + "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", } } -model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"} +model_alias = {"fat_st_ted": "paddlespeech.s2t.models.u2_st:U2STModel"} kaldi_bins = { "url": @@ -69,17 +70,28 @@ class STExecutor(BaseExecutor): self.parser.add_argument( "--input", type=str, required=True, help="Audio file to translate.") self.parser.add_argument( - "--model", + "--model_type", type=str, - default="fat_st", + default="fat_st_ted", help="Choose model type of st task.") self.parser.add_argument( - "--lang", + "--src_lang", type=str, - default="ted_en_zh", - help="Choose model language.") + default="en", + help="Choose model source language.") self.parser.add_argument( - "--config", + "--tgt_lang", + type=str, + default="zh", + help="Choose model target language.") + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000], + help='Choose the audio sample rate of the model. 8000 or 16000') + self.parser.add_argument( + "--cfg_path", type=str, default=None, help="Config of st task. Use deault config when it is None.") @@ -117,20 +129,28 @@ class STExecutor(BaseExecutor): decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME) decompressed_path = os.path.abspath(decompressed_path) logger.info("Kaldi_bins stored in: {}".format(decompressed_path)) - os.environ['LD_LIBRARY_PATH'] += f':{decompressed_path}' + if "LD_LIBRARY_PATH" in os.environ: + os.environ["LD_LIBRARY_PATH"] += f":{decompressed_path}" + else: + os.environ["LD_LIBRARY_PATH"] = f"{decompressed_path}" os.environ["PATH"] += f":{decompressed_path}" return decompressed_path def _init_from_path(self, - model_type: str="fat_st", - lang: str="zh", + model_type: str="fat_st_ted", + src_lang: str="en", + tgt_lang: str="zh", cfg_path: Optional[os.PathLike]=None, ckpt_path: Optional[os.PathLike]=None): """ Init model and other resources from a specific path. """ + if hasattr(self, 'model'): + logger.info('Model had been initialized.') + return + if cfg_path is None or ckpt_path is None: - tag = model_type + "_" + lang + tag = model_type + "_" + src_lang + "-" + tgt_lang res_path = self._get_pretrained_path(tag) self.cfg_path = os.path.join(res_path, pretrained_models[tag]["cfg_path"]) @@ -171,13 +191,20 @@ class STExecutor(BaseExecutor): self.model.eval() # load model - params_path = self.ckpt_path + ".pdparams" + params_path = self.ckpt_path model_dict = paddle.load(params_path) self.model.set_state_dict(model_dict) # set kaldi bins self._set_kaldi_bins() + def _check(self, audio_file: str, sample_rate: int): + _, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + if audio_sample_rate != sample_rate: + raise Exception("invalid sample rate") + sys.exit(-1) + def preprocess(self, wav_file: Union[str, os.PathLike], model_type: str): """ Input preprocess and return paddle.Tensor stored in self.input. @@ -186,7 +213,7 @@ class STExecutor(BaseExecutor): audio_file = os.path.abspath(wav_file) logger.info("Preprocess audio_file:" + audio_file) - if model_type == "fat_st": + if model_type == "fat_st_ted": cmvn = self.config.collator.cmvn_path utt_name = "_tmp" @@ -198,7 +225,8 @@ class STExecutor(BaseExecutor): fbank_extract_process = subprocess.Popen( fbank_extract_command, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) fbank_extract_process.stdin.write( f"{utt_name} {wav_file}".encode("utf8")) fbank_extract_process.stdin.close() @@ -207,14 +235,18 @@ class STExecutor(BaseExecutor): extract_command = ["compute-kaldi-pitch-feats", "scp:-", "ark:-"] pitch_extract_process = subprocess.Popen( - extract_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + extract_command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) pitch_extract_process.stdin.write( f"{utt_name} {wav_file}".encode("utf8")) process_command = ["process-kaldi-pitch-feats", "ark:", "ark:-"] pitch_process = subprocess.Popen( process_command, stdin=pitch_extract_process.stdout, - stdout=subprocess.PIPE) + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) pitch_extract_process.stdin.close() pitch_feat = dict( kaldi_io.read_mat_ark(pitch_process.stdout))[utt_name] @@ -228,19 +260,19 @@ class STExecutor(BaseExecutor): "ark:-" ] cmvn_process = subprocess.Popen( - cmvn_command, stdout=subprocess.PIPE) + cmvn_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process_command = [ "copy-feats", "--compress=true", "ark:-", "ark:-" ] process = subprocess.Popen( process_command, stdin=cmvn_process.stdout, - stdout=subprocess.PIPE) + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) norm_feat = dict(kaldi_io.read_mat_ark(process.stdout))[utt_name] - self.audio = paddle.to_tensor(norm_feat).unsqueeze(0) - self.audio_len = paddle.to_tensor( - self.audio.shape[1], dtype="int64") - logger.info(f"audio feat shape: {self.audio.shape}") + self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0) + self._inputs["audio_len"] = paddle.to_tensor( + self._inputs["audio"].shape[1], dtype="int64") else: raise ValueError("Wrong model type.") @@ -250,9 +282,9 @@ class STExecutor(BaseExecutor): Model inference and result stored in self.output. """ cfg = self.config.decoding - audio = self.audio - audio_len = self.audio_len - if model_type == "fat_st": + audio = self._inputs["audio"] + audio_len = self._inputs["audio_len"] + if model_type == "fat_st_ted": hyps = self.model.decode( audio, audio_len, @@ -270,7 +302,7 @@ class STExecutor(BaseExecutor): decoding_chunk_size=cfg.decoding_chunk_size, num_decoding_left_chunks=cfg.num_decoding_left_chunks, simulate_streaming=cfg.simulate_streaming) - self.result_transcripts = hyps + self._outputs["result"] = hyps else: raise ValueError("Wrong model type.") @@ -278,8 +310,8 @@ class STExecutor(BaseExecutor): """ Output postprocess and return human-readable results such as texts and audio files. """ - if model_type == "fat_st": - return self.result_transcripts + if model_type == "fat_st_ted": + return self._outputs["result"] else: raise ValueError("Wrong model type.") @@ -289,30 +321,36 @@ class STExecutor(BaseExecutor): """ parser_args = self.parser.parse_args(argv) - model = parser_args.model - lang = parser_args.lang - config = parser_args.config + model_type = parser_args.model_type + src_lang = parser_args.src_lang + tgt_lang = parser_args.tgt_lang + sample_rate = parser_args.sample_rate + cfg_path = parser_args.cfg_path ckpt_path = parser_args.ckpt_path audio_file = parser_args.input device = parser_args.device try: - res = self(model, lang, config, ckpt_path, audio_file, device) - logger.info('ST Result: {}'.format(res)) + res = self(model_type, src_lang, tgt_lang, sample_rate, cfg_path, + ckpt_path, audio_file, device) + logger.info("ST Result: {}".format(res)) return True except Exception as e: print(e) return False - def __call__(self, model, lang, config, ckpt_path, audio_file, device): + def __call__(self, model_type, src_lang, tgt_lang, sample_rate, cfg_path, + ckpt_path, audio_file, device): """ Python API to call an executor. """ audio_file = os.path.abspath(audio_file) + self._check(audio_file, sample_rate) paddle.set_device(device) - self._init_from_path(model, lang, config, ckpt_path) - self.preprocess(audio_file, model) - self.infer(model) - res = self.postprocess(model) + self._init_from_path(model_type, src_lang, tgt_lang, cfg_path, + ckpt_path) + self.preprocess(audio_file, model_type) + self.infer(model_type) + res = self.postprocess(model_type) return res diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index edf579f71634b07281eded2479ecbd7d20b7c308..eb023c11ba88b001fc0ce0171508ea16bed8ffea 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import functools +import hashlib import logging import os +import tarfile +import zipfile from typing import Any from typing import Dict +from typing import List from paddle.framework import load from paddle.utils import download @@ -55,12 +59,69 @@ def get_command(name: str) -> Any: return com['_entry'] -def decompress(file: str) -> os.PathLike: - """ - Extracts all files from a compressed file. - """ - assert os.path.isfile(file), "File: {} not exists.".format(file) - return download._decompress(file) +def _md5check(filepath: os.PathLike, md5sum: str) -> bool: + logger.info("File {} md5 checking...".format(filepath)) + md5 = hashlib.md5() + with open(filepath, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(filepath, calc_md5sum, md5sum)) + return False + else: + logger.info("File {} md5 check passed.".format(filepath)) + return True + + +def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike: + file_dir = os.path.dirname(filepath) + + if tarfile.is_tarfile(filepath): + files = tarfile.open(filepath, "r:*") + file_list = files.getnames() + elif zipfile.is_zipfile(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + else: + return file_dir + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + files.close() + return uncompressed_path + + +def _is_a_single_file(file_list: List[os.PathLike]) -> bool: + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list: List[os.PathLike]) -> bool: + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: @@ -72,7 +133,17 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys())) - return download.get_path_from_url(archive['url'], path, archive['md5']) + + filepath = os.path.join(path, os.path.basename(archive['url'])) + if os.path.isfile(filepath) and _md5check(filepath, archive['md5']): + uncompress_path = _get_uncompress_path(filepath) + if not os.path.isdir(uncompress_path): + download._decompress(filepath) + else: + uncompress_path = download.get_path_from_url(archive['url'], path, + archive['md5']) + + return uncompress_path def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike: @@ -128,11 +199,16 @@ class Logger(object): 'EVAL': 22, 'WARNING': 30, 'ERROR': 40, - 'CRITICAL': 50 + 'CRITICAL': 50, + 'EXCEPTION': 100, } for key, level in log_config.items(): logging.addLevelName(level, key) - self.__dict__[key.lower()] = functools.partial(self.__call__, level) + if key == 'EXCEPTION': + self.__dict__[key.lower()] = self.logger.exception + else: + self.__dict__[key.lower()] = functools.partial(self.__call__, + level) self.format = logging.Formatter( fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' diff --git a/requirements.txt b/requirements.txt index 658e64c05d654780de8347e3862f92fd385255db..3708bb0620b39f0282438a98ecc8a5e8066a027f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ loguru matplotlib nara_wpe nltk +paddleaudio paddlespeech_ctcdecoders paddlespeech_feat pandas diff --git a/setup.py b/setup.py index fbb3a24a455acef6c04cdee060b0d77e230abe30..7720ba3fd9d4e64ef4f85f1b37919f896bb546d2 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ requirements = { "nara_wpe", "nltk", "pandas", + "paddleaudio", "paddlespeech_ctcdecoders", "paddlespeech_feat", "praatio~=4.1", @@ -197,7 +198,7 @@ setup_info = dict( "pwgan", "gan", ], - python_requires='>=3.6', + python_requires='>=3.7', install_requires=requirements["install"], extras_require={ 'develop': diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index 9fd13fbcb1691f2a6a6ab9ac42fce97912efd062..fcd0c2359690373e73acf077e3b3edeeeb6e470f 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -20,7 +20,7 @@ mkdir -p conf/benchmark cp conf/conformer.yaml conf/benchmark/conformer.yaml sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) -bs_item=(16 30) +bs_item=(16) config_path=conf/benchmark/conformer.yaml seed=0 output=exp/conformer diff --git a/tests/benchmark/pwgan/run_all.sh b/tests/benchmark/pwgan/run_all.sh index 51deaf9ff149f8bee28608bf9fae43ece3172054..874e9aa252e74a2fec1daec7645af29acdd998e7 100755 --- a/tests/benchmark/pwgan/run_all.sh +++ b/tests/benchmark/pwgan/run_all.sh @@ -38,7 +38,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then model_mode_list=(pwgan) fp_item_list=(fp32) # 满 bs 是 26 - bs_item_list=(6 26) + bs_item_list=(6) for model_mode in ${model_mode_list[@]}; do for fp_item in ${fp_item_list[@]}; do for bs_item in ${bs_item_list[@]}; do @@ -55,4 +55,4 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then done done done -fi \ No newline at end of file +fi