提交 bdb3ce23 编写于 作者: K KP

Add paddlespeech.cls and esc50 example.

上级 eb68b3d8
......@@ -31,7 +31,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
`local/train.py` 脚本中可支持配置的参数:
- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False
- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`
- `epochs`: 训练轮次,默认为50。
- `learning_rate`: Fine-tune的学习率;默认为5e-5。
- `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。
......@@ -69,7 +69,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
- `wav`: 指定预测的音频文件。
- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False
- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`
- `top_k`: 预测显示的top k标签的得分,默认为1。
- `checkpoint`: 模型参数checkpoint文件。
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import numpy as np
import paddle
......@@ -29,24 +28,25 @@ from paddlespeech.cls.models.panns import cnn14
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
def extract_features(file: str, gpu_feat: bool=False,
def extract_features(file: str, feat_backend: str='numpy',
**kwargs) -> paddle.Tensor:
waveform, sr = load_audio(file, sr=None)
if gpu_feat:
feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs)
feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
feat = paddle.transpose(feat, [0, 2, 1])
else:
if args.feat_backend == 'numpy':
feat = melspectrogram(waveform, sr, **kwargs).transpose()
feat = np.expand_dims(feat, 0)
feat = paddle.to_tensor(feat)
else:
feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
feat = paddle.transpose(feat, [0, 2, 1])
return feat
......@@ -59,7 +59,7 @@ if __name__ == '__main__':
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
feat = extract_features(args.wav, args.gpu_feat)
feat = extract_features(args.wav, args.feat_backend)
logits = model(feat)
probs = F.softmax(logits, axis=1).numpy()
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import paddle
......@@ -28,7 +27,7 @@ from paddlespeech.cls.utils import Timer
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
......@@ -52,13 +51,13 @@ if __name__ == "__main__":
learning_rate=args.learning_rate, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
if args.gpu_feat:
train_ds = ESC50(mode='train')
dev_ds = ESC50(mode='dev')
feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320)
else:
if args.feat_backend == 'numpy':
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
else:
train_ds = ESC50(mode='train')
dev_ds = ESC50(mode='dev')
feature_extractor = LogMelSpectrogram(sr=16000)
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
......@@ -80,15 +79,15 @@ if __name__ == "__main__":
num_corrects = 0
num_samples = 0
for batch_idx, batch in enumerate(train_loader):
if args.gpu_feat:
if args.feat_backend == 'numpy':
feats, labels = batch
else:
waveforms, labels = batch
feats = feature_extractor(
waveforms
) # Need a padding when lengths of waveforms differ in a batch.
feats = paddle.transpose(feats,
[0, 2, 1]) # To [N, length, n_mels]
else:
feats, labels = batch
logits = model(feats)
......@@ -144,12 +143,12 @@ if __name__ == "__main__":
num_samples = 0
with logger.processing('Evaluation on validation dataset'):
for batch_idx, batch in enumerate(dev_loader):
if args.gpu_feat:
if args.feat_backend == 'numpy':
feats, labels = batch
else:
waveforms, labels = batch
feats = feature_extractor(waveforms)
feats = paddle.transpose(feats, [0, 2, 1])
else:
feats, labels = batch
logits = model(feats)
......
......@@ -16,13 +16,13 @@ num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
gpu_feat=True
feat_backend=numpy
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [ ${ngpu} -gt 1 ]; then
python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \
--epochs ${num_epochs} \
--gpu_feat ${gpu_feat} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
......@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python local/train.py \
--device ${device} \
--epochs ${num_epochs} \
--gpu_feat ${gpu_feat} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
......@@ -43,7 +43,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python local/predict.py \
--device ${device} \
--wav ${audio_file} \
--gpu_feat ${gpu_feat} \
--feat_backend ${feat_backend} \
--top_k 10 \
--checkpoint ${ckpt}
fi
......
......@@ -201,7 +201,7 @@ def compute_fbank_matrix(sr: int,
def power_to_db(magnitude: paddle.Tensor,
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=80.0) -> paddle.Tensor:
top_db: Optional[float]=None) -> paddle.Tensor:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units.
The function computes the scaling ``10 * log10(x / ref)`` in a numerically
stable way.
......@@ -304,7 +304,7 @@ class MelSpectrogram(nn.Layer):
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=0.0,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
......@@ -384,13 +384,13 @@ class LogMelSpectrogram(nn.Layer):
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=0.0,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=80.0,
top_db: Optional[float]=None,
dtype: str=paddle.float32):
"""Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
typically an audio waveform.
......
......@@ -13,8 +13,8 @@
# limitations under the License.
'''
This module is used to store environmental variables in PaddleSpeech.
PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
├ default value through the PACKAGE_HOME environment variable.
PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
├ default value through the PPSPEECH_HOME environment variable.
├─ MODEL_HOME --> Store model files.
└─ DATA_HOME --> Store automatically downloaded datasets.
'''
......@@ -26,14 +26,14 @@ def _get_user_home():
def _get_package_home():
if 'PACKAGE_HOME' in os.environ:
home_path = os.environ['PACKAGE_HOME']
if 'PPSPEECH_HOME' in os.environ:
home_path = os.environ['PPSPEECH_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PACKAGE_HOME {} is not a directory.'.
'The environment variable PPSPEECH_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
......@@ -48,6 +48,6 @@ def _get_sub_home(directory):
USER_HOME = _get_user_home()
PACKAGE_HOME = _get_package_home()
PPSPEECH_HOME = _get_package_home()
MODEL_HOME = _get_sub_home('pretrained_models')
DATA_HOME = _get_sub_home('datasets')
......@@ -55,13 +55,13 @@ log_config = {
class Logger(object):
'''
Default logger in PaddleSpeechCls
Default logger in PaddleSpeech
Args:
name(str) : Logger name, default is 'PaddleSpeechCls'
name(str) : Logger name, default is 'PaddleSpeech'
'''
def __init__(self, name: str=None):
name = 'PaddleSpeechCls' if not name else name
name = 'PaddleSpeech' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册