From bdb3ce23ee2a0a80418d51c072c80afc6ca85992 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 13:32:36 +0800
Subject: [PATCH] Add paddlespeech.cls and esc50 example.

---
 examples/esc50/README.md              |  4 ++--
 examples/esc50/cls0/local/predict.py  | 18 +++++++++---------
 examples/esc50/cls0/local/train.py    | 25 ++++++++++++-------------
 examples/esc50/cls0/run.sh            |  8 ++++----
 paddlespeech/cls/features/spectrum.py |  8 ++++----
 paddlespeech/cls/utils/env.py         | 12 ++++++------
 paddlespeech/cls/utils/log.py         |  6 +++---
 7 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index e148efd0..6ac10b3a 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -31,7 +31,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 `local/train.py` 脚本中可支持配置的参数:
 
 - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
-- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。
+- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。
 - `epochs`: 训练轮次,默认为50。
 - `learning_rate`: Fine-tune的学习率;默认为5e-5。
 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。
@@ -69,7 +69,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 
 - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
 - `wav`: 指定预测的音频文件。
-- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。
+- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。
 - `top_k`: 预测显示的top k标签的得分,默认为1。
 - `checkpoint`: 模型参数checkpoint文件。
 
diff --git a/examples/esc50/cls0/local/predict.py b/examples/esc50/cls0/local/predict.py
index 58187677..a6e38a35 100644
--- a/examples/esc50/cls0/local/predict.py
+++ b/examples/esc50/cls0/local/predict.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import ast
 
 import numpy as np
 import paddle
@@ -29,24 +28,25 @@ from paddlespeech.cls.models.panns import cnn14
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
 parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
-parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
 parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
 args = parser.parse_args()
 # yapf: enable
 
 
-def extract_features(file: str, gpu_feat: bool=False,
+def extract_features(file: str, feat_backend: str='numpy',
                      **kwargs) -> paddle.Tensor:
     waveform, sr = load_audio(file, sr=None)
-    if gpu_feat:
-        feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs)
-        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
-        feat = paddle.transpose(feat, [0, 2, 1])
-    else:
+
+    if args.feat_backend == 'numpy':
         feat = melspectrogram(waveform, sr, **kwargs).transpose()
         feat = np.expand_dims(feat, 0)
         feat = paddle.to_tensor(feat)
+    else:
+        feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
+        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
+        feat = paddle.transpose(feat, [0, 2, 1])
 
     return feat
 
@@ -59,7 +59,7 @@ if __name__ == '__main__':
     model.set_state_dict(paddle.load(args.checkpoint))
     model.eval()
 
-    feat = extract_features(args.wav, args.gpu_feat)
+    feat = extract_features(args.wav, args.feat_backend)
     logits = model(feat)
     probs = F.softmax(logits, axis=1).numpy()
 
diff --git a/examples/esc50/cls0/local/train.py b/examples/esc50/cls0/local/train.py
index 67215535..7a030187 100644
--- a/examples/esc50/cls0/local/train.py
+++ b/examples/esc50/cls0/local/train.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import ast
 import os
 
 import paddle
@@ -28,7 +27,7 @@ from paddlespeech.cls.utils import Timer
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
-parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
 parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
@@ -52,13 +51,13 @@ if __name__ == "__main__":
         learning_rate=args.learning_rate, parameters=model.parameters())
     criterion = paddle.nn.loss.CrossEntropyLoss()
 
-    if args.gpu_feat:
-        train_ds = ESC50(mode='train')
-        dev_ds = ESC50(mode='dev')
-        feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320)
-    else:
+    if args.feat_backend == 'numpy':
         train_ds = ESC50(mode='train', feat_type='melspectrogram')
         dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    else:
+        train_ds = ESC50(mode='train')
+        dev_ds = ESC50(mode='dev')
+        feature_extractor = LogMelSpectrogram(sr=16000)
 
     train_sampler = paddle.io.DistributedBatchSampler(
         train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
@@ -80,15 +79,15 @@ if __name__ == "__main__":
         num_corrects = 0
         num_samples = 0
         for batch_idx, batch in enumerate(train_loader):
-            if args.gpu_feat:
+            if args.feat_backend == 'numpy':
+                feats, labels = batch
+            else:
                 waveforms, labels = batch
                 feats = feature_extractor(
                     waveforms
                 )  # Need a padding when lengths of waveforms differ in a batch.
                 feats = paddle.transpose(feats,
                                          [0, 2, 1])  # To [N, length, n_mels]
-            else:
-                feats, labels = batch
 
             logits = model(feats)
 
@@ -144,12 +143,12 @@ if __name__ == "__main__":
             num_samples = 0
             with logger.processing('Evaluation on validation dataset'):
                 for batch_idx, batch in enumerate(dev_loader):
-                    if args.gpu_feat:
+                    if args.feat_backend == 'numpy':
+                        feats, labels = batch
+                    else:
                         waveforms, labels = batch
                         feats = feature_extractor(waveforms)
                         feats = paddle.transpose(feats, [0, 2, 1])
-                    else:
-                        feats, labels = batch
 
                     logits = model(feats)
 
diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh
index 17f2fd99..6d3a09c6 100755
--- a/examples/esc50/cls0/run.sh
+++ b/examples/esc50/cls0/run.sh
@@ -16,13 +16,13 @@ num_epochs=50
 batch_size=16
 ckpt_dir=./checkpoint
 save_freq=10
-gpu_feat=True
+feat_backend=numpy
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     if [ ${ngpu} -gt 1 ]; then
         python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \
             --epochs ${num_epochs} \
-            --gpu_feat ${gpu_feat} \
+            --feat_backend ${feat_backend} \
             --batch_size ${batch_size} \
             --checkpoint_dir ${ckpt_dir} \
             --save_freq ${save_freq}
@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         python local/train.py \
             --device ${device} \
             --epochs ${num_epochs} \
-            --gpu_feat ${gpu_feat} \
+            --feat_backend ${feat_backend} \
             --batch_size ${batch_size} \
             --checkpoint_dir ${ckpt_dir} \
             --save_freq ${save_freq}
@@ -43,7 +43,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python local/predict.py \
         --device ${device} \
         --wav ${audio_file} \
-        --gpu_feat ${gpu_feat} \
+        --feat_backend ${feat_backend} \
         --top_k 10 \
         --checkpoint ${ckpt}
 fi
diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py
index d70e60fb..154b6484 100644
--- a/paddlespeech/cls/features/spectrum.py
+++ b/paddlespeech/cls/features/spectrum.py
@@ -201,7 +201,7 @@ def compute_fbank_matrix(sr: int,
 def power_to_db(magnitude: paddle.Tensor,
                 ref_value: float=1.0,
                 amin: float=1e-10,
-                top_db: Optional[float]=80.0) -> paddle.Tensor:
+                top_db: Optional[float]=None) -> paddle.Tensor:
     """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
     The function computes the scaling ``10 * log10(x / ref)`` in a numerically
     stable way.
@@ -304,7 +304,7 @@ class MelSpectrogram(nn.Layer):
                  center: bool=True,
                  pad_mode: str='reflect',
                  n_mels: int=64,
-                 f_min: float=0.0,
+                 f_min: float=50.0,
                  f_max: Optional[float]=None,
                  htk: bool=False,
                  norm: Union[str, float]='slaney',
@@ -384,13 +384,13 @@
                  center: bool=True,
                  pad_mode: str='reflect',
                  n_mels: int=64,
-                 f_min: float=0.0,
+                 f_min: float=50.0,
                  f_max: Optional[float]=None,
                  htk: bool=False,
                  norm: Union[str, float]='slaney',
                  ref_value: float=1.0,
                  amin: float=1e-10,
-                 top_db: Optional[float]=80.0,
+                 top_db: Optional[float]=None,
                  dtype: str=paddle.float32):
         """Compute log-mel-spectrogram(also known as LogFBank) feature of a given
         signal, typically an audio waveform.
diff --git a/paddlespeech/cls/utils/env.py b/paddlespeech/cls/utils/env.py
index 340c1e4b..c455af00 100644
--- a/paddlespeech/cls/utils/env.py
+++ b/paddlespeech/cls/utils/env.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 '''
 This module is used to store environmental variables in PaddleSpeech.
-PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
-├ default value through the PACKAGE_HOME environment variable.
+PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
+├ default value through the PPSPEECH_HOME environment variable.
 ├─ MODEL_HOME --> Store model files.
 └─ DATA_HOME --> Store automatically downloaded datasets.
 '''
@@ -26,14 +26,14 @@ def _get_user_home():
 
 
 def _get_package_home():
-    if 'PACKAGE_HOME' in os.environ:
-        home_path = os.environ['PACKAGE_HOME']
+    if 'PPSPEECH_HOME' in os.environ:
+        home_path = os.environ['PPSPEECH_HOME']
         if os.path.exists(home_path):
             if os.path.isdir(home_path):
                 return home_path
             else:
                 raise RuntimeError(
-                    'The environment variable PACKAGE_HOME {} is not a directory.'.
+                    'The environment variable PPSPEECH_HOME {} is not a directory.'.
                     format(home_path))
         else:
             return home_path
@@ -48,6 +48,6 @@ def _get_sub_home(directory):
 
 
 USER_HOME = _get_user_home()
-PACKAGE_HOME = _get_package_home()
+PPSPEECH_HOME = _get_package_home()
 MODEL_HOME = _get_sub_home('pretrained_models')
 DATA_HOME = _get_sub_home('datasets')
diff --git a/paddlespeech/cls/utils/log.py b/paddlespeech/cls/utils/log.py
index 89d1e5b1..f4146c4f 100644
--- a/paddlespeech/cls/utils/log.py
+++ b/paddlespeech/cls/utils/log.py
@@ -55,13 +55,13 @@ log_config = {
 
 class Logger(object):
     '''
-    Deafult logger in PaddleSpeechCls
+    Deafult logger in PaddleSpeech
 
     Args:
-        name(str) : Logger name, default is 'PaddleSpeechCls'
+        name(str) : Logger name, default is 'PaddleSpeech'
     '''
     def __init__(self, name: str=None):
-        name = 'PaddleSpeechCls' if not name else name
+        name = 'PaddleSpeech' if not name else name
        self.logger = logging.getLogger(name)
 
         for key, conf in log_config.items():
--
GitLab
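
Below is a minimal sketch (not part of the patch) of how the 'paddle' feature backend introduced above can be exercised on its own. The import path mirrors the patched file paddlespeech/cls/features/spectrum.py; the 32 kHz sample rate and the random waveform are illustrative placeholders rather than values taken from the ESC-50 example scripts.

import numpy as np
import paddle

from paddlespeech.cls.features.spectrum import LogMelSpectrogram

# Placeholder input: a 5-second random signal standing in for a loaded audio clip.
sr = 32000
waveform = paddle.to_tensor(np.random.randn(sr * 5).astype('float32'))

# feat_backend == 'paddle': run the LogMelSpectrogram layer on a batched signal of shape [N, T].
extractor = LogMelSpectrogram(sr=sr)
feat = extractor(waveform.unsqueeze(0))     # [N, n_mels, time]
feat = paddle.transpose(feat, [0, 2, 1])    # [N, time, n_mels], the layout the cnn14 model consumes

# The 'numpy' backend takes the other branch instead: melspectrogram(waveform, sr) on the host,
# followed by np.expand_dims and paddle.to_tensor, as shown in predict.py above.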