diff --git a/examples/esc50/README.md b/examples/esc50/README.md index e148efd00fe98f5585873ba8470fbcd26db51455..6ac10b3acd2bc2b3037861180476587ec32538ee 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -31,7 +31,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 `local/train.py` 脚本中可支持配置的参数: - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 -- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -69,7 +69,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 - `wav`: 指定预测的音频文件。 -- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 diff --git a/examples/esc50/cls0/local/predict.py b/examples/esc50/cls0/local/predict.py index 58187677df6ff472894d3a4e9c9ab9f57ebeca14..a6e38a35f0548ad8e0e27758c703b0de23a13118 100644 --- a/examples/esc50/cls0/local/predict.py +++ b/examples/esc50/cls0/local/predict.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import argparse -import ast import numpy as np import paddle @@ -29,24 +28,25 @@ from paddlespeech.cls.models.panns import cnn14 parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") -parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable -def extract_features(file: str, gpu_feat: bool=False, +def extract_features(file: str, feat_backend: str='numpy', **kwargs) -> paddle.Tensor: waveform, sr = load_audio(file, sr=None) - if gpu_feat: - feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs) - feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) - feat = paddle.transpose(feat, [0, 2, 1]) - else: + + if feat_backend == 'numpy': feat = melspectrogram(waveform, sr, **kwargs).transpose() feat = np.expand_dims(feat, 0) feat = paddle.to_tensor(feat) + else: + feature_extractor = LogMelSpectrogram(sr=sr, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) return feat @@ -59,7 +59,7 @@ if __name__ == '__main__': model.set_state_dict(paddle.load(args.checkpoint)) model.eval() - feat = extract_features(args.wav, args.gpu_feat) + feat = extract_features(args.wav, args.feat_backend) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() diff --git a/examples/esc50/cls0/local/train.py b/examples/esc50/cls0/local/train.py index 
67215535c98decb754bc45d52981efddeac7ac81..7a0301878ada4ba9ecd376c19d673ce5d544daad 100644 --- a/examples/esc50/cls0/local/train.py +++ b/examples/esc50/cls0/local/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import ast import os import paddle @@ -28,7 +27,7 @@ from paddlespeech.cls.utils import Timer parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") -parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") @@ -52,13 +51,13 @@ if __name__ == "__main__": learning_rate=args.learning_rate, parameters=model.parameters()) criterion = paddle.nn.loss.CrossEntropyLoss() - if args.gpu_feat: - train_ds = ESC50(mode='train') - dev_ds = ESC50(mode='dev') - feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320) - else: + if args.feat_backend == 'numpy': train_ds = ESC50(mode='train', feat_type='melspectrogram') dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + else: + train_ds = ESC50(mode='train') + dev_ds = ESC50(mode='dev') + feature_extractor = LogMelSpectrogram(sr=16000) train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) @@ -80,15 +79,15 @@ if __name__ 
== "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - if args.gpu_feat: + if args.feat_backend == 'numpy': + feats, labels = batch + else: waveforms, labels = batch feats = feature_extractor( waveforms ) # Need a padding when lengths of waveforms differ in a batch. feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] - else: - feats, labels = batch logits = model(feats) @@ -144,12 +143,12 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - if args.gpu_feat: + if args.feat_backend == 'numpy': + feats, labels = batch + else: waveforms, labels = batch feats = feature_extractor(waveforms) feats = paddle.transpose(feats, [0, 2, 1]) - else: - feats, labels = batch logits = model(feats) diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 17f2fd99543d35a0ab50105447b7e05f9a3e4bbc..6d3a09c6d78c07b65c296f76db3d58f2d6c85c99 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -16,13 +16,13 @@ num_epochs=50 batch_size=16 ckpt_dir=./checkpoint save_freq=10 -gpu_feat=True +feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${ngpu} -gt 1 ]; then python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python local/train.py \ --device ${device} \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -43,7 +43,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python local/predict.py \ --device ${device} \ --wav ${audio_file} \ - --gpu_feat ${gpu_feat} \ + 
--feat_backend ${feat_backend} \ --top_k 10 \ --checkpoint ${ckpt} fi diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py index d70e60fb0550b7b98373d70536134744bd7ba8b0..154b6484ccc7157b8296e0567a4230fa8ef19335 100644 --- a/paddlespeech/cls/features/spectrum.py +++ b/paddlespeech/cls/features/spectrum.py @@ -201,7 +201,7 @@ def compute_fbank_matrix(sr: int, def power_to_db(magnitude: paddle.Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> paddle.Tensor: + top_db: Optional[float]=None) -> paddle.Tensor: """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling ``10 * log10(x / ref)`` in a numerically stable way. @@ -304,7 +304,7 @@ class MelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', @@ -384,13 +384,13 @@ class LogMelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0, + top_db: Optional[float]=None, dtype: str=paddle.float32): """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, typically an audio waveform. diff --git a/paddlespeech/cls/utils/env.py b/paddlespeech/cls/utils/env.py index 340c1e4bf752bd63f4afc2cbe67df4442ea3f3dc..c455af0008b90a8ed95ce72ac9b6cbc90cf37900 100644 --- a/paddlespeech/cls/utils/env.py +++ b/paddlespeech/cls/utils/env.py @@ -13,8 +13,8 @@ # limitations under the License. ''' This module is used to store environmental variables in PaddleSpeech. -PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. 
Users can change the -├ default value through the PACKAGE_HOME environment variable. +PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the +├ default value through the PPSPEECH_HOME environment variable. ├─ MODEL_HOME --> Store model files. └─ DATA_HOME --> Store automatically downloaded datasets. ''' @@ -26,14 +26,14 @@ def _get_user_home(): def _get_package_home(): - if 'PACKAGE_HOME' in os.environ: - home_path = os.environ['PACKAGE_HOME'] + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] if os.path.exists(home_path): if os.path.isdir(home_path): return home_path else: raise RuntimeError( - 'The environment variable PACKAGE_HOME {} is not a directory.'. + 'The environment variable PPSPEECH_HOME {} is not a directory.'. format(home_path)) else: return home_path @@ -48,6 +48,6 @@ def _get_sub_home(directory): USER_HOME = _get_user_home() -PACKAGE_HOME = _get_package_home() +PPSPEECH_HOME = _get_package_home() MODEL_HOME = _get_sub_home('pretrained_models') DATA_HOME = _get_sub_home('datasets') diff --git a/paddlespeech/cls/utils/log.py b/paddlespeech/cls/utils/log.py index 89d1e5b1874719da06d80e7085488c174f9e7c58..f4146c4f594ad6981b5913cdcb508189ad7684f5 100644 --- a/paddlespeech/cls/utils/log.py +++ b/paddlespeech/cls/utils/log.py @@ -55,13 +55,13 @@ log_config = { class Logger(object): ''' - Deafult logger in PaddleSpeechCls + Default logger in PaddleSpeech Args: - name(str) : Logger name, default is 'PaddleSpeechCls' + name(str) : Logger name, default is 'PaddleSpeech' ''' def __init__(self, name: str=None): - name = 'PaddleSpeechCls' if not name else name + name = 'PaddleSpeech' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items():