diff --git a/README.md b/README.md
index 66feb0982025fce8caf819fddca27b0d81598d7a..2f9d992895309f28ccfabc5d0bf83dfa94aaa443 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,7 @@ If you want to try more functions like training and tuning, please see [Speech-t
## Model List
-PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_models.md) with available pretrained models.
+PaddleSpeech supports a series of the most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models.
Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details:
@@ -344,4 +344,4 @@ year={2021}
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
-PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
\ No newline at end of file
+PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py
index db15b7ef3a6c9baa8bdbf92ba3803c170a134932..5b8ce35139aea0edb084cd3b1d33b702b27d2628 100644
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@@ -13,7 +13,6 @@
# limitations under the License.
import argparse
from pathlib import Path
-from typing import Union
import numpy as np
import paddle
@@ -23,129 +22,12 @@ from yacs.config import CfgNode
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
-from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
+from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
-class StyleFastSpeech2Inference(FastSpeech2Inference):
- def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
- super().__init__(normalizer, model)
- pitch_mean, pitch_std = np.load(pitch_stats_path)
- self.pitch_mean = paddle.to_tensor(pitch_mean)
- self.pitch_std = paddle.to_tensor(pitch_std)
- energy_mean, energy_std = np.load(energy_stats_path)
- self.energy_mean = paddle.to_tensor(energy_mean)
- self.energy_std = paddle.to_tensor(energy_std)
-
- def denorm(self, data, mean, std):
- return data * std + mean
-
- def norm(self, data, mean, std):
- return (data - mean) / std
-
- def forward(self,
- text: paddle.Tensor,
- durations: Union[paddle.Tensor, np.ndarray]=None,
- durations_scale: Union[int, float]=None,
- durations_bias: Union[int, float]=None,
- pitch: Union[paddle.Tensor, np.ndarray]=None,
- pitch_scale: Union[int, float]=None,
- pitch_bias: Union[int, float]=None,
- energy: Union[paddle.Tensor, np.ndarray]=None,
- energy_scale: Union[int, float]=None,
- energy_bias: Union[int, float]=None,
- robot: bool=False):
- """
- Parameters
- ----------
- text : Tensor(int64)
- Input sequence of characters (T,).
- speech : Tensor, optional
- Feature sequence to extract style (N, idim).
- durations : paddle.Tensor/np.ndarray, optional (int64)
- Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
- durations_scale: int/float, optional
- durations_bias: int/float, optional
- pitch : paddle.Tensor/np.ndarray, optional
- Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
- pitch_scale: int/float, optional
- In denormed HZ domain.
- pitch_bias: int/float, optional
- In denormed HZ domain.
- energy : paddle.Tensor/np.ndarray, optional
- Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
- energy_scale: int/float, optional
- In denormed domain.
- energy_bias: int/float, optional
- In denormed domain.
- robot : bool, optional
- Weather output robot style
- Returns
- ----------
- Tensor
- Output sequence of features (L, odim).
- """
- normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
- text, durations=None, pitch=None, energy=None)
- # priority: groundtruth > scale/bias > previous output
- # set durations
- if isinstance(durations, np.ndarray):
- durations = paddle.to_tensor(durations)
- elif isinstance(durations, paddle.Tensor):
- durations = durations
- elif durations_scale or durations_bias:
- durations_scale = durations_scale if durations_scale is not None else 1
- durations_bias = durations_bias if durations_bias is not None else 0
- durations = durations_scale * d_outs + durations_bias
- else:
- durations = d_outs
-
- if robot:
- # set normed pitch to zeros have the same effect with set denormd ones to mean
- pitch = paddle.zeros(p_outs.shape)
-
- # set pitch, can overwrite robot set
- if isinstance(pitch, np.ndarray):
- pitch = paddle.to_tensor(pitch)
- elif isinstance(pitch, paddle.Tensor):
- pitch = pitch
- elif pitch_scale or pitch_bias:
- pitch_scale = pitch_scale if pitch_scale is not None else 1
- pitch_bias = pitch_bias if pitch_bias is not None else 0
- p_Hz = paddle.exp(
- self.denorm(p_outs, self.pitch_mean, self.pitch_std))
- p_HZ = pitch_scale * p_Hz + pitch_bias
- pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
- else:
- pitch = p_outs
-
- # set energy
- if isinstance(energy, np.ndarray):
- energy = paddle.to_tensor(energy)
- elif isinstance(energy, paddle.Tensor):
- energy = energy
- elif energy_scale or energy_bias:
- energy_scale = energy_scale if energy_scale is not None else 1
- energy_bias = energy_bias if energy_bias is not None else 0
- e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
- e_dnorm = energy_scale * e_dnorm + energy_bias
- energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
- else:
- energy = e_outs
-
- normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
- text,
- durations=durations,
- pitch=pitch,
- energy=energy,
- use_teacher_forcing=True)
-
- logmel = self.normalizer.inverse(normalized_mel)
- return logmel
-
-
def evaluate(args, fastspeech2_config, pwg_config):
# construct dataset for evaluation
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 53e5d15df5baaf307a3d0c24fce608af0d34a5e2..ea2599abe49e06c2b652488d073bda45a3a3b80e 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -23,7 +23,7 @@ Contents
.. toctree::
:maxdepth: 1
- :caption: Speech-To-Text
+ :caption: Speech-to-Text
asr/models_introduction
asr/data_preparation
@@ -33,7 +33,7 @@ Contents
.. toctree::
:maxdepth: 1
- :caption: Text-To-Speech
+ :caption: Text-to-Speech
tts/basic_usage
tts/advanced_usage
diff --git a/docs/source/install.md b/docs/source/install.md
index 0700a1667831bee9b303d5590388a5c9a49c0446..d68b990d2a6e3c4b9096808d7712c7792144960e 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -16,6 +16,22 @@ cd DeepSpeech
pip install -e .
```
+For users who only need the basic functions of paddlespeech, installing with conda is recommended.
+You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or use the scripts below to install the latest miniconda version.
+
+```bash
+pushd tools
+bash extras/install_miniconda.sh
+popd
+bash
+```
+
+After installing conda, run setup.sh to complete the installation.
+```bash
+bash setup.sh
+```
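+
+If the setup finishes without errors, a quick sanity check (a minimal sketch that only verifies the packages are importable) is:
+
+```python
+# minimal smoke test: confirm that paddle and paddlespeech can be imported
+import paddle
+import paddlespeech  # noqa: F401
+
+print(paddle.__version__)
+```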
+
+
## Setup (Other Platform)
- Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`.
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
index e7dd2892afe37c1391c04ca2bc9a410ea7754756..e3fc8b9ea9e1c2d9b6d80e8ea6edb1c6dbbf1385 100644
--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
@@ -1,11 +1,11 @@
# PaddleSpeech
## What is PaddleSpeech?
-PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
+PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
## What can PaddleSpeech do?
-### Speech-To-Text
+### Speech-to-Text
PaddleSpeech ASR mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including:
- attention decoding (used in Transformer and Conformer)
- attention rescoring (used in Transformer and Conformer)
-Speech-To-Text helps you training the ASR model very simply.
+Speech-to-Text helps you train ASR models with simple commands.
-### Text-To-Speech
+### Text-to-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
@@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
-Text-To-Speech helps you to train TTS models with simple commands.
+Text-to-Speech helps you to train TTS models with simple commands.
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index bb03689c7e3f1712af5f4d0d47c328206765d770..a7c6a036b455410cc7d88947ef6c99d7b867924c 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,7 +1,7 @@
# Released Models
-## Speech-To-Text Models
+## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
@@ -27,7 +27,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings
-## Text-To-Speech Models
+## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e02f3e220147e4ca78fffc1e564efa4c968c9089
--- /dev/null
+++ b/examples/csmsc/voc3/conf/finetune.yaml
@@ -0,0 +1,139 @@
+# This is the hyperparameter configuration file for Multi-Band MelGAN finetuning.
+# Please make sure this is adjusted for the CSMSC dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
+
+# This configuration is based on full-band MelGAN but the hop size and sampling
+# rate are different from the paper (16kHz vs 24kHz). The number of iterations
+# is not shown in the paper, so we currently train for 1M iterations (not sure whether
+# this is enough for convergence). The optimizer setting is based on @dathudeptrai's advice.
+# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 2048 # FFT size. (in samples)
+n_shift: 300 # Hop size. (in samples)
+win_length: 1200 # Window length. (in samples)
+                    # If set to null, it will be the same as n_fft.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+# GENERATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+generator_params:
+ in_channels: 80 # Number of input channels.
+ out_channels: 4 # Number of output channels.
+ kernel_size: 7 # Kernel size of initial and final conv layers.
+ channels: 384 # Initial number of channels for conv layers.
+ upsample_scales: [5, 5, 3] # List of Upsampling scales.
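+    # Note: 5 * 5 * 3 = 75x upsampling per sub-band; with out_channels: 4 sub-bands
+    # this gives 75 * 4 = 300 samples per frame, matching n_shift.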
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
+ stacks: 4 # Number of stacks in a single residual stack module.
+ use_weight_norm: True # Whether to use weight normalization.
+ use_causal_conv: False # Whether to use causal convolution.
+ use_final_nonlinear_activation: True
+
+
+###########################################################
+# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ scales: 3 # Number of multi-scales.
+ downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling.
+ downsample_pooling_params: # Parameters of the above pooling function.
+ kernel_size: 4
+ stride: 2
+ padding: 1
+ exclusive: True
+ kernel_sizes: [5, 3] # List of kernel size.
+ channels: 16 # Number of channels of the initial conv layer.
+ max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
+ negative_slope: 0.2
+ use_weight_norm: True # Whether to use weight norm.
+
+
+###########################################################
+# STFT LOSS SETTING #
+###########################################################
+use_stft_loss: true
+stft_loss_params:
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+ window: "hann" # Window function for STFT-based loss
+use_subband_stft_loss: true
+subband_stft_loss_params:
+ fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
+ hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
+ win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
+ window: "hann" # Window function for STFT-based loss
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+use_feat_match_loss: false # Whether to use feature matching loss.
+lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 64 # Batch size.
+batch_max_steps: 16200     # Length of each audio in the batch (in samples). Make sure it is divisible by n_shift (16200 = 54 * 300).
+num_workers: 2 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ epsilon: 1.0e-7 # Generator's epsilon.
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+
+generator_grad_norm: -1 # Generator's gradient norm.
+generator_scheduler_params:
+ learning_rate: 1.0e-3 # Generator's learning rate.
+ gamma: 0.5 # Generator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 100000
+ - 200000
+ - 300000
+ - 400000
+ - 500000
+ - 600000
+discriminator_optimizer_params:
+ epsilon: 1.0e-7 # Discriminator's epsilon.
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+
+discriminator_grad_norm: -1 # Discriminator's gradient norm.
+discriminator_scheduler_params:
+ learning_rate: 1.0e-3 # Discriminator's learning rate.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 100000
+ - 200000
+ - 300000
+ - 400000
+ - 500000
+ - 600000
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
+train_max_steps: 1200000 # Number of training steps.
+save_interval_steps: 1000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_snapshots: 10 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
\ No newline at end of file
diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh
new file mode 100755
index 0000000000000000000000000000000000000000..42e5a39796acdb46a8104876d8c4086b61866fdb
--- /dev/null
+++ b/examples/csmsc/voc3/finetune.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
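+# stage 0: use the pretrained FastSpeech2 to generate ground-truth-aligned (GTA) mels from durations.txt into dump_finetune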
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
+ --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
+ --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
+ --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
+ --dur-file=durations.txt \
+ --output-dir=dump_finetune \
+ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
+fi
+
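+# stage 1: link the original wave files from dump into dump_finetune and rebuild the metadata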
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 local/link_wav.py \
+ --old-dump-dir=dump \
+ --dump-dir=dump_finetune
+
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # get features' stats(mean and std)
+ echo "Get features' stats ..."
+ cp dump/train/feats_stats.npy dump_finetune/train/
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize, dev and test should use train's stats
+ echo "Normalize ..."
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/train/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/train/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/dev/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/dev/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/test/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/test/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+fi
+
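+# stage 4: finetune the vocoder on the GTA mels with conf/finetune.yaml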
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} \
+ FLAGS_cudnn_exhaustive_search=true \
+ FLAGS_conv_workspace_size_limit=4000 \
+ python ${BIN_DIR}/train.py \
+ --train-metadata=dump_finetune/train/norm/metadata.jsonl \
+ --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
+ --config=conf/finetune.yaml \
+ --output-dir=exp/finetune \
+ --ngpu=1
+fi
\ No newline at end of file
diff --git a/examples/csmsc/voc3/local/link_wav.py b/examples/csmsc/voc3/local/link_wav.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81e0d4b83320665b98720d09a940e9de6dc63cd
--- /dev/null
+++ b/examples/csmsc/voc3/local/link_wav.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from operator import itemgetter
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+
+
+def main():
+ # parse config and args
+ parser = argparse.ArgumentParser(
+        description="Link wave files from the original dump dir and regenerate metadata for finetuning.")
+
+ parser.add_argument(
+ "--old-dump-dir",
+ default=None,
+ type=str,
+        help="directory of the original dump containing the *_wave.npy files.")
+ parser.add_argument(
+ "--dump-dir",
+ type=str,
+ required=True,
+ help="directory to finetune dump feature files.")
+ args = parser.parse_args()
+
+ old_dump_dir = Path(args.old_dump_dir).expanduser()
+ old_dump_dir = old_dump_dir.resolve()
+ dump_dir = Path(args.dump_dir).expanduser()
+ # use absolute path
+ dump_dir = dump_dir.resolve()
+ dump_dir.mkdir(parents=True, exist_ok=True)
+
+ assert old_dump_dir.is_dir()
+ assert dump_dir.is_dir()
+
+ for sub in ["train", "dev", "test"]:
+        # symlink the *_wave.npy files from old_dump_dir to the corresponding locations in dump_dir
+ output_dir = dump_dir / sub
+ output_dir.mkdir(parents=True, exist_ok=True)
+ results = []
+ for name in os.listdir(output_dir / "raw"):
+ # 003918_feats.npy
+ utt_id = name.split("_")[0]
+ mel_path = output_dir / ("raw/" + name)
+ gen_mel = np.load(mel_path)
+ wave_name = utt_id + "_wave.npy"
+ wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
+ os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
+ output_dir / ("raw/" + wave_name))
+ num_sample = wav.shape[0]
+ num_frames = gen_mel.shape[0]
+ wav_path = output_dir / ("raw/" + wave_name)
+
+ record = {
+ "utt_id": utt_id,
+ "num_samples": num_sample,
+ "num_frames": num_frames,
+ "feats": str(mel_path),
+ "wave": str(wav_path),
+ }
+ results.append(record)
+
+ results.sort(key=itemgetter("utt_id"))
+
+ with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
+ for item in results:
+ writer.write(item)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml
index b2babca7bcde8b6e68480a5dddeb4950d49159ec..d77329f50843e270b52750ef5dcc2e9429bd8617 100644
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
@@ -1,36 +1,6 @@
# https://yaml.org/type/float.html
-data:
- train_manifest: data/manifest.train
- dev_manifest: data/manifest.dev
- test_manifest: data/manifest.test-clean
-
-collator:
- vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
- unit_type: spm
- spm_model_prefix: data/lang_char/train_960_unigram5000
- feat_dim: 83
- stride_ms: 10.0
- window_ms: 25.0
- sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
- batch_size: 30
- maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
- maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
- minibatches: 0 # for debug
- batch_count: auto
- batch_bins: 0
- batch_frames_in: 0
- batch_frames_out: 0
- batch_frames_inout: 0
- augmentation_config: conf/augmentation.json
- num_workers: 0
- subsampling_factor: 1
- num_encs: 1
-
-
# network architecture
model:
- cmvn_file:
- cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
@@ -63,6 +33,33 @@ model:
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
+data:
+ train_manifest: data/manifest.train
+ dev_manifest: data/manifest.dev
+ test_manifest: data/manifest.test-clean
+
+collator:
+ vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
+ unit_type: spm
+ spm_model_prefix: data/lang_char/train_960_unigram5000
+ feat_dim: 83
+ stride_ms: 10.0
+ window_ms: 25.0
+ sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+ batch_size: 30
+ maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+ minibatches: 0 # for debug
+ batch_count: auto
+ batch_bins: 0
+ batch_frames_in: 0
+ batch_frames_out: 0
+ batch_frames_inout: 0
+ augmentation_config: conf/augmentation.json
+ num_workers: 0
+ subsampling_factor: 1
+ num_encs: 1
+
training:
n_epoch: 120
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index 2de4fb124e1e50a7c5481366c8cec675922d8a98..2e4f740fb6f048dd91a1e799d598261a88a6419c 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -110,10 +110,10 @@ class Clip(object):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
- print(
- f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
- )
- x = x[:c.shape[1] * self.hop_size]
+ # print(
+ # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
+ # )
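+            # c is the mel spectrogram with shape (num_frames, n_mels), so truncate x to num_frames * hop_size samples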
+ x = x[:c.shape[0] * self.hop_size]
# check the legnth is valid
assert len(x) == c.shape[
diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a9ef370c0f2916149b62c50d2425e969b49a5cb
--- /dev/null
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# generate mels using durations.txt
+# for mb melgan finetune
+# what if the length is not consistent with the original mel?
+import argparse
+from pathlib import Path
+
+import numpy as np
+import paddle
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+def evaluate(args, fastspeech2_config):
+
+ # construct dataset for evaluation
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ phone_dict = {}
+ for phn, id in phn_id:
+ phone_dict[phn] = int(id)
+
+ odim = fastspeech2_config.n_mels
+ model = FastSpeech2(
+ idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+
+ model.set_state_dict(
+ paddle.load(args.fastspeech2_checkpoint)["main_params"])
+ model.eval()
+
+ stat = np.load(args.fastspeech2_stat)
+ mu, std = stat
+ mu = paddle.to_tensor(mu)
+ std = paddle.to_tensor(std)
+ fastspeech2_normalizer = ZScore(mu, std)
+
+ fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
+ model)
+ fastspeech2_inference.eval()
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ sentences, speaker_set = get_phn_dur(args.dur_file)
+ merge_silence(sentences)
+
+ for i, utt_id in enumerate(sentences):
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ speaker = sentences[utt_id][2]
+        # trim the leading and trailing sil
+ if args.cut_sil:
+ if phones[0] == "sil" and len(durations) > 1:
+ durations = durations[1:]
+ phones = phones[1:]
+ if phones[-1] == 'sil' and len(durations) > 1:
+ durations = durations[:-1]
+ phones = phones[:-1]
+ # sentences[utt_id][0] = phones
+ # sentences[utt_id][1] = durations
+
+ phone_ids = [phone_dict[phn] for phn in phones]
+ phone_ids = paddle.to_tensor(np.array(phone_ids))
+ durations = paddle.to_tensor(np.array(durations))
+        # the generated mel may differ from the ground truth by 1 or 2 frames, but batch_fn will fix it
+ # split data into 3 sections
+ if args.dataset == "baker":
+ num_train = 9800
+ num_dev = 100
+ if i in range(0, num_train):
+ sub_output_dir = output_dir / ("train/raw")
+ elif i in range(num_train, num_train + num_dev):
+ sub_output_dir = output_dir / ("dev/raw")
+ else:
+ sub_output_dir = output_dir / ("test/raw")
+ sub_output_dir.mkdir(parents=True, exist_ok=True)
+ with paddle.no_grad():
+ mel = fastspeech2_inference(phone_ids, durations=durations)
+ np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
+
+
+def main():
+ # parse args and config and redirect to train_sp
+ parser = argparse.ArgumentParser(
+        description="Generate ground-truth-aligned (GTA) mels with FastSpeech2 for vocoder finetuning.")
+ parser.add_argument(
+ "--dataset",
+ default="baker",
+ type=str,
+        help="name of the dataset; should be one of {baker, ljspeech, vctk} for now")
+ parser.add_argument(
+ "--fastspeech2-config", type=str, help="fastspeech2 config file.")
+ parser.add_argument(
+ "--fastspeech2-checkpoint",
+ type=str,
+ help="fastspeech2 checkpoint to load.")
+ parser.add_argument(
+ "--fastspeech2-stat",
+ type=str,
+ help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+ )
+
+ parser.add_argument(
+ "--phones-dict",
+ type=str,
+ default="phone_id_map.txt",
+ help="phone vocabulary file.")
+
+ parser.add_argument(
+ "--dur-file", default=None, type=str, help="path to durations.txt.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument("--verbose", type=int, default=1, help="verbose.")
+
+    def str2bool(value):
+        return value.lower() == 'true'
+
+ parser.add_argument(
+ "--cut-sil",
+ type=str2bool,
+ default=True,
+        help="whether to cut sil at the edges of the audio")
+
+ args = parser.parse_args()
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+        print("ngpu should be >= 0!")
+
+ with open(args.fastspeech2_config) as f:
+ fastspeech2_config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(fastspeech2_config)
+
+ evaluate(args, fastspeech2_config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 2202d156e85731919c3b44fe8c498230dded740c..2e52c10376e41a6cca508b7be2a6dea1ca4a2943 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -16,23 +16,25 @@
from typing import Dict
from typing import Sequence
from typing import Tuple
+from typing import Union
+import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
-from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor
-from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss
-from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator
-from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor
+from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
+from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
+from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder
class FastSpeech2(nn.Layer):
@@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer):
return logmel
+class StyleFastSpeech2Inference(FastSpeech2Inference):
+ def __init__(self,
+ normalizer,
+ model,
+ pitch_stats_path=None,
+ energy_stats_path=None):
+ super().__init__(normalizer, model)
+ if pitch_stats_path:
+ pitch_mean, pitch_std = np.load(pitch_stats_path)
+ self.pitch_mean = paddle.to_tensor(pitch_mean)
+ self.pitch_std = paddle.to_tensor(pitch_std)
+ if energy_stats_path:
+ energy_mean, energy_std = np.load(energy_stats_path)
+ self.energy_mean = paddle.to_tensor(energy_mean)
+ self.energy_std = paddle.to_tensor(energy_std)
+
+ def denorm(self, data, mean, std):
+ return data * std + mean
+
+ def norm(self, data, mean, std):
+ return (data - mean) / std
+
+ def forward(self,
+ text: paddle.Tensor,
+ durations: Union[paddle.Tensor, np.ndarray]=None,
+ durations_scale: Union[int, float]=None,
+ durations_bias: Union[int, float]=None,
+ pitch: Union[paddle.Tensor, np.ndarray]=None,
+ pitch_scale: Union[int, float]=None,
+ pitch_bias: Union[int, float]=None,
+ energy: Union[paddle.Tensor, np.ndarray]=None,
+ energy_scale: Union[int, float]=None,
+ energy_bias: Union[int, float]=None,
+ robot: bool=False):
+ """
+ Parameters
+ ----------
+ text : Tensor(int64)
+ Input sequence of characters (T,).
+        durations : paddle.Tensor/np.ndarray, optional (int64)
+            Ground truth of duration (T,); this will overwrite the settings of durations_scale and durations_bias.
+        durations_scale: int/float, optional
+        durations_bias: int/float, optional
+        pitch : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged pitch (T, 1); this will overwrite the settings of pitch_scale and pitch_bias.
+        pitch_scale: int/float, optional
+            In the denormalized Hz domain.
+        pitch_bias: int/float, optional
+            In the denormalized Hz domain.
+        energy : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged energy (T, 1); this will overwrite the settings of energy_scale and energy_bias.
+        energy_scale: int/float, optional
+            In the denormalized domain.
+        energy_bias: int/float, optional
+            In the denormalized domain.
+        robot : bool, optional
+            Whether to output robot-style (monotone pitch) speech.
+ Returns
+ ----------
+ Tensor
+ Output sequence of features (L, odim).
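+        Examples
+        ----------
+        A hedged usage sketch (``phone_ids`` is a phone-id Tensor and the stats
+        paths below are placeholder file names)::
+
+            inference = StyleFastSpeech2Inference(
+                normalizer, model,
+                pitch_stats_path="pitch_stats.npy",
+                energy_stats_path="energy_stats.npy")
+            # scale the predicted pitch by 1.2 in the denormalized Hz domain
+            mel = inference(phone_ids, pitch_scale=1.2)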
+ """
+ normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+ text, durations=None, pitch=None, energy=None)
+ # priority: groundtruth > scale/bias > previous output
+ # set durations
+ if isinstance(durations, np.ndarray):
+ durations = paddle.to_tensor(durations)
+ elif isinstance(durations, paddle.Tensor):
+ durations = durations
+ elif durations_scale or durations_bias:
+ durations_scale = durations_scale if durations_scale is not None else 1
+ durations_bias = durations_bias if durations_bias is not None else 0
+ durations = durations_scale * d_outs + durations_bias
+ else:
+ durations = d_outs
+
+ if robot:
+            # setting the normalized pitch to zeros has the same effect as setting the denormalized pitch to its mean
+ pitch = paddle.zeros(p_outs.shape)
+
+        # set pitch; this can overwrite the robot setting
+ if isinstance(pitch, np.ndarray):
+ pitch = paddle.to_tensor(pitch)
+ elif isinstance(pitch, paddle.Tensor):
+ pitch = pitch
+ elif pitch_scale or pitch_bias:
+ pitch_scale = pitch_scale if pitch_scale is not None else 1
+ pitch_bias = pitch_bias if pitch_bias is not None else 0
+ p_Hz = paddle.exp(
+ self.denorm(p_outs, self.pitch_mean, self.pitch_std))
+            p_Hz = pitch_scale * p_Hz + pitch_bias
+            pitch = self.norm(paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
+ else:
+ pitch = p_outs
+
+ # set energy
+ if isinstance(energy, np.ndarray):
+ energy = paddle.to_tensor(energy)
+ elif isinstance(energy, paddle.Tensor):
+ energy = energy
+ elif energy_scale or energy_bias:
+ energy_scale = energy_scale if energy_scale is not None else 1
+ energy_bias = energy_bias if energy_bias is not None else 0
+ e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
+ e_dnorm = energy_scale * e_dnorm + energy_bias
+ energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
+ else:
+ energy = e_outs
+
+ normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+ text,
+ durations=durations,
+ pitch=pitch,
+ energy=energy,
+ use_teacher_forcing=True)
+
+ logmel = self.normalizer.inverse(normalized_mel)
+ return logmel
+
+
class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2."""
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
index 97233c766eb0d71ad51a17d7d33c87a8bc2f4da3..03620fd4e0b50ad827508deb8efba4459ea4bf05 100644
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@@ -23,12 +23,6 @@ import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
-from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
-from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder
-from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.decoder import Decoder
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
+from paddlespeech.t2s.modules.transformer.encoder import Encoder
+from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
class TransformerTTS(nn.Layer):
diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py
index 664267895491c47ad0b3ecaaaae9412f3ce5110f..5b569f5d05100fa80587c9a06dc2c16f1d58a936 100644
--- a/paddlespeech/t2s/modules/__init__.py
+++ b/paddlespeech/t2s/modules/__init__.py
@@ -11,10 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from .attention import *
from .conv import *
from .geometry import *
from .losses import *
from .masking import *
from .positional_encoding import *
-from .transformer import *
diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py
deleted file mode 100644
index 154625cc3c1d9426ed2d21edc3064798b73ccd3a..0000000000000000000000000000000000000000
--- a/paddlespeech/t2s/modules/attention.py
+++ /dev/null
@@ -1,348 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import numpy as np
-import paddle
-from paddle import nn
-from paddle.nn import functional as F
-
-
-def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
- training=True):
- r"""Scaled dot product attention with masking.
-
- Assume that q, k, v all have the same leading dimensions (denoted as * in
- descriptions below). Dropout is applied to attention weights before
- weighted sum of values.
-
- Parameters
- -----------
- q : Tensor [shape=(\*, T_q, d)]
- the query tensor.
- k : Tensor [shape=(\*, T_k, d)]
- the key tensor.
- v : Tensor [shape=(\*, T_k, d_v)]
- the value tensor.
- mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
- the mask tensor, zeros correspond to paddings. Defaults to None.
-
- Returns
- ----------
- out : Tensor [shape=(\*, T_q, d_v)]
- the context vector.
- attn_weights : Tensor [shape=(\*, T_q, T_k)]
- the attention weights.
- """
- d = q.shape[-1] # we only support imperative execution
- qk = paddle.matmul(q, k, transpose_y=True)
- scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))
-
- if mask is not None:
- scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here
-
- attn_weights = F.softmax(scaled_logit, axis=-1)
- attn_weights = F.dropout(attn_weights, dropout, training=training)
- out = paddle.matmul(attn_weights, v)
- return out, attn_weights
-
-
-def drop_head(x, drop_n_heads, training=True):
- """Drop n context vectors from multiple ones.
-
- Parameters
- ----------
- x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
- The input, multiple context vectors.
- drop_n_heads : int [0<= drop_n_heads <= num_heads]
- Number of vectors to drop.
- training : bool
- A flag indicating whether it is in training. If `False`, no dropout is
- applied.
-
- Returns
- -------
- Tensor
- The output.
- """
- if not training or (drop_n_heads == 0):
- return x
-
- batch_size, num_heads, _, _ = x.shape
- # drop all heads
- if num_heads == drop_n_heads:
- return paddle.zeros_like(x)
-
- mask = np.ones([batch_size, num_heads])
- mask[:, :drop_n_heads] = 0
- for subarray in mask:
- np.random.shuffle(subarray)
- scale = float(num_heads) / (num_heads - drop_n_heads)
- mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
- out = x * paddle.to_tensor(mask)
- return out
-
-
-def _split_heads(x, num_heads):
- batch_size, time_steps, _ = x.shape
- x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])
- x = paddle.transpose(x, [0, 2, 1, 3])
- return x
-
-
-def _concat_heads(x):
- batch_size, _, time_steps, _ = x.shape
- x = paddle.transpose(x, [0, 2, 1, 3])
- x = paddle.reshape(x, [batch_size, time_steps, -1])
- return x
-
-
-# Standard implementations of Monohead Attention & Multihead Attention
-class MonoheadAttention(nn.Layer):
- """Monohead Attention module.
-
- Parameters
- ----------
- model_dim : int
- Feature size of the query.
- dropout : float, optional
- Dropout probability of scaled dot product attention and final context
- vector. Defaults to 0.0.
- k_dim : int, optional
- Feature size of the key of each scaled dot product attention. If not
- provided, it is set to `model_dim / num_heads`. Defaults to None.
- v_dim : int, optional
- Feature size of the key of each scaled dot product attention. If not
- provided, it is set to `model_dim / num_heads`. Defaults to None.
- """
-
- def __init__(self,
- model_dim: int,
- dropout: float=0.0,
- k_dim: int=None,
- v_dim: int=None):
- super(MonoheadAttention, self).__init__()
- k_dim = k_dim or model_dim
- v_dim = v_dim or model_dim
- self.affine_q = nn.Linear(model_dim, k_dim)
- self.affine_k = nn.Linear(model_dim, k_dim)
- self.affine_v = nn.Linear(model_dim, v_dim)
- self.affine_o = nn.Linear(v_dim, model_dim)
-
- self.model_dim = model_dim
- self.dropout = dropout
-
- def forward(self, q, k, v, mask):
- """Compute context vector and attention weights.
-
- Parameters
- -----------
- q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
- The queries.
- k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
- The keys.
- v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
- The values.
- mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
- The mask.
-
- Returns
- ----------
- out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
- The context vector.
- attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
- The attention weights.
- """
- q = self.affine_q(q) # (B, T, C)
- k = self.affine_k(k)
- v = self.affine_v(v)
-
- context_vectors, attention_weights = scaled_dot_product_attention(
- q, k, v, mask, self.dropout, self.training)
-
- out = self.affine_o(context_vectors)
- return out, attention_weights
-
-
-class MultiheadAttention(nn.Layer):
- """Multihead Attention module.
-
- Parameters
- -----------
- model_dim: int
- The feature size of query.
- num_heads : int
- The number of attention heads.
- dropout : float, optional
- Dropout probability of scaled dot product attention and final context
- vector. Defaults to 0.0.
- k_dim : int, optional
- Feature size of the key of each scaled dot product attention. If not
- provided, it is set to ``model_dim / num_heads``. Defaults to None.
- v_dim : int, optional
- Feature size of the key of each scaled dot product attention. If not
- provided, it is set to ``model_dim / num_heads``. Defaults to None.
-
- Raises
- ---------
- ValueError
- If ``model_dim`` is not divisible by ``num_heads``.
- """
-
- def __init__(self,
- model_dim: int,
- num_heads: int,
- dropout: float=0.0,
- k_dim: int=None,
- v_dim: int=None):
- super(MultiheadAttention, self).__init__()
- if model_dim % num_heads != 0:
- raise ValueError("model_dim must be divisible by num_heads")
- depth = model_dim // num_heads
- k_dim = k_dim or depth
- v_dim = v_dim or depth
- self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
- self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
- self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
- self.affine_o = nn.Linear(num_heads * v_dim, model_dim)
-
- self.num_heads = num_heads
- self.model_dim = model_dim
- self.dropout = dropout
-
- def forward(self, q, k, v, mask):
- """Compute context vector and attention weights.
-
- Parameters
- -----------
- q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
- The queries.
- k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
- The keys.
- v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
- The values.
- mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape
- The mask.
-
- Returns
- ----------
- out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
- The context vector.
- attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
- The attention weights.
- """
- q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)
- k = _split_heads(self.affine_k(k), self.num_heads)
- v = _split_heads(self.affine_v(v), self.num_heads)
- mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim
-
- context_vectors, attention_weights = scaled_dot_product_attention(
- q, k, v, mask, self.dropout, self.training)
- # NOTE: there is more sophisticated implementation: Scheduled DropHead
- context_vectors = _concat_heads(context_vectors) # (B, T, h*C)
- out = self.affine_o(context_vectors)
- return out, attention_weights
-
-
-class LocationSensitiveAttention(nn.Layer):
- """Location Sensitive Attention module.
-
- Reference: `Attention-Based Models for Speech Recognition `_
-
- Parameters
- -----------
- d_query: int
- The feature size of query.
- d_key : int
- The feature size of key.
- d_attention : int
- The feature size of dimension.
- location_filters : int
- Filter size of attention convolution.
- location_kernel_size : int
- Kernel size of attention convolution.
- """
-
- def __init__(self,
- d_query: int,
- d_key: int,
- d_attention: int,
- location_filters: int,
- location_kernel_size: int):
- super().__init__()
-
- self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
- self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
- self.value = nn.Linear(d_attention, 1, bias_attr=False)
-
- # Location Layer
- self.location_conv = nn.Conv1D(
- 2,
- location_filters,
- kernel_size=location_kernel_size,
- padding=int((location_kernel_size - 1) / 2),
- bias_attr=False,
- data_format='NLC')
- self.location_layer = nn.Linear(
- location_filters, d_attention, bias_attr=False)
-
- def forward(self,
- query,
- processed_key,
- value,
- attention_weights_cat,
- mask=None):
- """Compute context vector and attention weights.
-
- Parameters
- -----------
- query : Tensor [shape=(batch_size, d_query)]
- The queries.
- processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
- The keys after linear layer.
- value : Tensor [shape=(batch_size, time_steps_k, d_key)]
- The values.
- attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
- Attention weights concat.
- mask : Tensor, optional
- The mask. Shape should be (batch_size, times_steps_k, 1).
- Defaults to None.
-
- Returns
- ----------
- attention_context : Tensor [shape=(batch_size, d_attention)]
- The context vector.
- attention_weights : Tensor [shape=(batch_size, time_steps_k)]
- The attention weights.
- """
-
- processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
- processed_attention_weights = self.location_layer(
- self.location_conv(attention_weights_cat))
- # (B, T_enc, 1)
- alignment = self.value(
- paddle.tanh(processed_attention_weights + processed_key +
- processed_query))
-
- if mask is not None:
- alignment = alignment + (1.0 - mask) * -1e9
-
- attention_weights = F.softmax(alignment, axis=1)
- attention_context = paddle.matmul(
- attention_weights, value, transpose_x=True)
-
- attention_weights = paddle.squeeze(attention_weights, axis=-1)
- attention_context = paddle.squeeze(attention_context, axis=1)
-
- return attention_context, attention_weights
diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..25246736b92dfda364cf53a02ed37bb670e99c55
--- /dev/null
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+"""ConvolutionModule definition."""
+from paddle import nn
+
+
+class ConvolutionModule(nn.Layer):
+ """ConvolutionModule in Conformer model.
+ Parameters
+ ----------
+ channels : int
+ The number of channels of conv layers.
+ kernel_size : int
+        Kernel size of conv layers.
+ """
+
+ def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
+ """Construct an ConvolutionModule object."""
+ super().__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+ assert (kernel_size - 1) % 2 == 0
+
+ self.pointwise_conv1 = nn.Conv1D(
+ channels,
+ 2 * channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias_attr=bias, )
+ self.depthwise_conv = nn.Conv1D(
+ channels,
+ channels,
+ kernel_size,
+ stride=1,
+ padding=(kernel_size - 1) // 2,
+ groups=channels,
+ bias_attr=bias, )
+ self.norm = nn.BatchNorm1D(channels)
+ self.pointwise_conv2 = nn.Conv1D(
+ channels,
+ channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias_attr=bias, )
+ self.activation = activation
+
+ def forward(self, x):
+ """Compute convolution module.
+ Parameters
+ ----------
+ x : paddle.Tensor
+ Input tensor (#batch, time, channels).
+ Returns
+ ----------
+ paddle.Tensor
+ Output tensor (#batch, time, channels).
+ """
+ # exchange the temporal dimension and the feature dimension
+ x = x.transpose([0, 2, 1])
+
+ # GLU mechanism
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
+ x = nn.functional.glu(x, axis=1) # (batch, channel, dim)
+
+ # 1D Depthwise Conv
+ x = self.depthwise_conv(x)
+ x = self.activation(self.norm(x))
+
+ x = self.pointwise_conv2(x)
+
+ return x.transpose([0, 2, 1])
diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..568597ba56d8ac27691577c23a244068240958ae
--- /dev/null
+++ b/paddlespeech/t2s/modules/conformer/encoder.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+"""Encoder definition."""
+import logging
+
+import paddle
+
+from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
+from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+from paddlespeech.t2s.modules.nets_utils import get_activation
+from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
+from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from paddlespeech.t2s.modules.transformer.repeat import repeat
+from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
+
+
+class Encoder(paddle.nn.Layer):
+ """Conformer encoder module.
+ Parameters
+ ----------
+ idim : int
+ Input dimension.
+ attention_dim : int
+ Dimension of attention.
+ attention_heads : int
+ The number of heads of multi head attention.
+ linear_units : int
+ The number of units of position-wise feed forward.
+ num_blocks : int
+ The number of decoder blocks.
+ dropout_rate : float
+ Dropout rate.
+ positional_dropout_rate : float
+ Dropout rate after adding positional encoding.
+ attention_dropout_rate : float
+ Dropout rate in attention.
+ input_layer : Union[str, paddle.nn.Layer]
+ Input layer type.
+ normalize_before : bool
+ Whether to use layer_norm before the first block.
+ concat_after : bool
+ Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ positionwise_layer_type : str
+ "linear", "conv1d", or "conv1d-linear".
+ positionwise_conv_kernel_size : int
+ Kernel size of positionwise conv1d layer.
+ macaron_style : bool
+ Whether to use macaron style for positionwise layer.
+ pos_enc_layer_type : str
+ Encoder positional encoding layer type.
+ selfattention_layer_type : str
+ Encoder attention layer type.
+ activation_type : str
+ Encoder activation function type.
+ use_cnn_module : bool
+ Whether to use convolution module.
+ zero_triu : bool
+ Whether to zero the upper triangular part of attention matrix.
+ cnn_module_kernel : int
+        Kernel size of convolution module.
+ padding_idx : int
+ Padding idx for input_layer=embed.
+ stochastic_depth_rate : float
+ Maximum probability to skip the encoder layer.
+ intermediate_layers : Union[List[int], None]
+        Indices of intermediate CTC layers.
+        Indices start from 1.
+        If not None, intermediate outputs are also returned
+        (which changes the return type signature).
+ """
+
+ def __init__(
+ self,
+ idim,
+ attention_dim=256,
+ attention_heads=4,
+ linear_units=2048,
+ num_blocks=6,
+ dropout_rate=0.1,
+ positional_dropout_rate=0.1,
+ attention_dropout_rate=0.0,
+ input_layer="conv2d",
+ normalize_before=True,
+ concat_after=False,
+ positionwise_layer_type="linear",
+ positionwise_conv_kernel_size=1,
+ macaron_style=False,
+ pos_enc_layer_type="abs_pos",
+ selfattention_layer_type="selfattn",
+ activation_type="swish",
+ use_cnn_module=False,
+ zero_triu=False,
+ cnn_module_kernel=31,
+ padding_idx=-1,
+ stochastic_depth_rate=0.0,
+ intermediate_layers=None, ):
+ """Construct an Encoder object."""
+ super(Encoder, self).__init__()
+
+ activation = get_activation(activation_type)
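+        # the positional encoding type must match the self-attention type:
+        # "rel_pos" pairs with "rel_selfattn" and "legacy_rel_pos" with
+        # "legacy_rel_selfattn" (also checked by the asserts below).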
+ if pos_enc_layer_type == "abs_pos":
+ pos_enc_class = PositionalEncoding
+ elif pos_enc_layer_type == "scaled_abs_pos":
+ pos_enc_class = ScaledPositionalEncoding
+ elif pos_enc_layer_type == "rel_pos":
+ assert selfattention_layer_type == "rel_selfattn"
+ pos_enc_class = RelPositionalEncoding
+ elif pos_enc_layer_type == "legacy_rel_pos":
+ pos_enc_class = LegacyRelPositionalEncoding
+ assert selfattention_layer_type == "legacy_rel_selfattn"
+ else:
+ raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+
+ self.conv_subsampling_factor = 1
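+        # only the conv2d input layer subsamples along time (by a factor of 4);
+        # the factor is recorded for callers that need to rescale frame-level lengths.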
+ if input_layer == "linear":
+ self.embed = paddle.nn.Sequential(
+ paddle.nn.Linear(idim, attention_dim),
+ paddle.nn.LayerNorm(attention_dim),
+ paddle.nn.Dropout(dropout_rate),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer == "conv2d":
+ self.embed = Conv2dSubsampling(
+ idim,
+ attention_dim,
+ dropout_rate,
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ self.conv_subsampling_factor = 4
+
+ elif input_layer == "embed":
+ self.embed = paddle.nn.Sequential(
+ paddle.nn.Embedding(
+ idim, attention_dim, padding_idx=padding_idx),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif isinstance(input_layer, paddle.nn.Layer):
+ self.embed = paddle.nn.Sequential(
+ input_layer,
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer is None:
+ self.embed = paddle.nn.Sequential(
+ pos_enc_class(attention_dim, positional_dropout_rate))
+ else:
+ raise ValueError("unknown input_layer: " + input_layer)
+ self.normalize_before = normalize_before
+
+ # self-attention module definition
+ if selfattention_layer_type == "selfattn":
+ logging.info("encoder self-attention layer type = self-attention")
+ encoder_selfattn_layer = MultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, )
+ elif selfattention_layer_type == "legacy_rel_selfattn":
+ assert pos_enc_layer_type == "legacy_rel_pos"
+ encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, )
+ elif selfattention_layer_type == "rel_selfattn":
+ logging.info(
+ "encoder self-attention layer type = relative self-attention")
+ assert pos_enc_layer_type == "rel_pos"
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, zero_triu, )
+ else:
+ raise ValueError("unknown encoder_attn_layer: " +
+ selfattention_layer_type)
+
+ # feed-forward module definition
+ if positionwise_layer_type == "linear":
+ positionwise_layer = PositionwiseFeedForward
+ positionwise_layer_args = (attention_dim, linear_units,
+ dropout_rate, activation, )
+ elif positionwise_layer_type == "conv1d":
+ positionwise_layer = MultiLayeredConv1d
+ positionwise_layer_args = (attention_dim, linear_units,
+ positionwise_conv_kernel_size,
+ dropout_rate, )
+ elif positionwise_layer_type == "conv1d-linear":
+ positionwise_layer = Conv1dLinear
+ positionwise_layer_args = (attention_dim, linear_units,
+ positionwise_conv_kernel_size,
+ dropout_rate, )
+ else:
+            raise NotImplementedError(
+                "Support only linear, conv1d, or conv1d-linear.")
+
+ # convolution module definition
+ convolution_layer = ConvolutionModule
+ convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
+
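+        # with stochastic depth, the probability of skipping a layer grows
+        # linearly with its depth: layer i uses
+        # stochastic_depth_rate * (i + 1) / num_blocks.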
+ self.encoders = repeat(
+ num_blocks,
+ lambda lnum: EncoderLayer(
+ attention_dim,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+ convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
+ if self.normalize_before:
+ self.after_norm = LayerNorm(attention_dim)
+
+ self.intermediate_layers = intermediate_layers
+
+ def forward(self, xs, masks):
+ """Encode input sequence.
+ Parameters
+ ----------
+ xs : paddle.Tensor
+ Input tensor (#batch, time, idim).
+        masks : paddle.Tensor
+            Mask tensor (#batch, 1, time).
+ Returns
+ ----------
+ paddle.Tensor
+ Output tensor (#batch, time, attention_dim).
+ paddle.Tensor
+ Mask tensor (#batch, time).
+ """
+ if isinstance(self.embed, (Conv2dSubsampling)):
+ xs, masks = self.embed(xs, masks)
+ else:
+ xs = self.embed(xs)
+
+ if self.intermediate_layers is None:
+ xs, masks = self.encoders(xs, masks)
+ else:
+ intermediate_outputs = []
+ for layer_idx, encoder_layer in enumerate(self.encoders):
+ xs, masks = encoder_layer(xs, masks)
+
+ if (self.intermediate_layers is not None and
+ layer_idx + 1 in self.intermediate_layers):
+ # intermediate branches also require normalization.
+ encoder_output = xs
+ if isinstance(encoder_output, tuple):
+ encoder_output = encoder_output[0]
+ if self.normalize_before:
+ encoder_output = self.after_norm(encoder_output)
+ intermediate_outputs.append(encoder_output)
+
+ if isinstance(xs, tuple):
+ xs = xs[0]
+
+ if self.normalize_before:
+ xs = self.after_norm(xs)
+
+ if self.intermediate_layers is not None:
+ return xs, masks, intermediate_outputs
+ return xs, masks
diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7a4936786f9b47f945740d4b45eb7a2b98101ee
--- /dev/null
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+"""Encoder self-attention layer definition."""
+import paddle
+from paddle import nn
+
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+
+
+class EncoderLayer(nn.Layer):
+ """Encoder layer module.
+ Parameters
+ ----------
+ size : int
+ Input dimension.
+ self_attn : paddle.nn.Layer
+ Self-attention module instance.
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+ can be used as the argument.
+ feed_forward : paddle.nn.Layer
+ Feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+ can be used as the argument.
+ feed_forward_macaron : paddle.nn.Layer
+ Additional feed-forward module instance.
+ `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+ can be used as the argument.
+ conv_module : paddle.nn.Layer
+ Convolution module instance.
+        `ConvolutionModule` instance can be used as the argument.
+ dropout_rate : float
+ Dropout rate.
+ normalize_before : bool
+ Whether to use layer_norm before the first block.
+ concat_after : bool
+ Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ stochastic_depth_rate : float
+        Probability of skipping this layer.
+ During training, the layer may skip residual computation and return input
+ as-is with given probability.
+ """
+
+ def __init__(
+ self,
+ size,
+ self_attn,
+ feed_forward,
+ feed_forward_macaron,
+ conv_module,
+ dropout_rate,
+ normalize_before=True,
+ concat_after=False,
+ stochastic_depth_rate=0.0, ):
+ """Construct an EncoderLayer object."""
+ super(EncoderLayer, self).__init__()
+ self.self_attn = self_attn
+ self.feed_forward = feed_forward
+ self.feed_forward_macaron = feed_forward_macaron
+ self.conv_module = conv_module
+ self.norm_ff = LayerNorm(size) # for the FNN module
+ self.norm_mha = LayerNorm(size) # for the MHA module
+ if feed_forward_macaron is not None:
+ self.norm_ff_macaron = LayerNorm(size)
+ self.ff_scale = 0.5
+ else:
+ self.ff_scale = 1.0
+ if self.conv_module is not None:
+ self.norm_conv = LayerNorm(size) # for the CNN module
+ self.norm_final = LayerNorm(
+ size) # for the final output of the block
+ self.dropout = nn.Dropout(dropout_rate)
+ self.size = size
+ self.normalize_before = normalize_before
+ self.concat_after = concat_after
+ if self.concat_after:
+ self.concat_linear = nn.Linear(size + size, size)
+ self.stochastic_depth_rate = stochastic_depth_rate
+
+ def forward(self, x_input, mask, cache=None):
+ """Compute encoded features.
+ Parameters
+ ----------
+ x_input : Union[Tuple, paddle.Tensor]
+ Input tensor w/ or w/o pos emb.
+ - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+ - w/o pos emb: Tensor (#batch, time, size).
+ mask : paddle.Tensor
+ Mask tensor for the input (#batch, time).
+        cache : paddle.Tensor
+ Cache tensor of the input (#batch, time - 1, size).
+ Returns
+ ----------
+ paddle.Tensor
+ Output tensor (#batch, time, size).
+ paddle.Tensor
+ Mask tensor (#batch, time).
+ """
+ if isinstance(x_input, tuple):
+ x, pos_emb = x_input[0], x_input[1]
+ else:
+ x, pos_emb = x_input, None
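+        # relative positional encodings arrive as an (x, pos_emb) tuple,
+        # absolute positional encodings as a plain tensor.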
+
+ skip_layer = False
+ # with stochastic depth, residual connection `x + f(x)` becomes
+ # `x <- x + 1 / (1 - p) * f(x)` at training time.
+ stoch_layer_coeff = 1.0
+ if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
+ stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+ if skip_layer:
+ if cache is not None:
+ x = paddle.concat([cache, x], axis=1)
+ if pos_emb is not None:
+ return (x, pos_emb), mask
+ return x, mask
+
+ # whether to use macaron style
+ if self.feed_forward_macaron is not None:
+ residual = x
+ if self.normalize_before:
+ x = self.norm_ff_macaron(x)
+ x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+ self.feed_forward_macaron(x))
+ if not self.normalize_before:
+ x = self.norm_ff_macaron(x)
+
+ # multi-headed self-attention module
+ residual = x
+ if self.normalize_before:
+ x = self.norm_mha(x)
+
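+        # when a cache of previous frames is given, only the last frame is used
+        # as the query while the full sequence still provides keys and values.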
+ if cache is None:
+ x_q = x
+ else:
+ assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
+ x_q = x[:, -1:, :]
+ residual = residual[:, -1:, :]
+ mask = None if mask is None else mask[:, -1:, :]
+
+ if pos_emb is not None:
+ x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+ else:
+ x_att = self.self_attn(x_q, x, x, mask)
+
+ if self.concat_after:
+ x_concat = paddle.concat((x, x_att), axis=-1)
+ x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
+ else:
+ x = residual + stoch_layer_coeff * self.dropout(x_att)
+ if not self.normalize_before:
+ x = self.norm_mha(x)
+
+ # convolution module
+ if self.conv_module is not None:
+ residual = x
+ if self.normalize_before:
+ x = self.norm_conv(x)
+ x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
+ if not self.normalize_before:
+ x = self.norm_conv(x)
+
+ # feed forward module
+ residual = x
+ if self.normalize_before:
+ x = self.norm_ff(x)
+ x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+ self.feed_forward(x))
+ if not self.normalize_before:
+ x = self.norm_ff(x)
+
+ if self.conv_module is not None:
+ x = self.norm_final(x)
+
+ if cache is not None:
+ x = paddle.concat([cache, x], axis=1)
+
+ if pos_emb is not None:
+ return (x, pos_emb), mask
+
+ return x, mask
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 30d3db86c885a56204562b82f7e0d709a96717e2..fbb3a9a3d65f83fd43b19902c9e97137691f2a2d 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -17,6 +17,14 @@ from paddle import nn
from typeguard import check_argument_types
+class Swish(paddle.nn.Layer):
+    """Swish activation: x * sigmoid(x)."""
+
+    def forward(self, x):
+        """Return the Swish activation of the input."""
+        return x * paddle.nn.functional.sigmoid(x)
+
+
def pad_list(xs, pad_value):
"""Perform padding for the list of tensors.
@@ -150,3 +158,17 @@ def initialize(model: nn.Layer, init: str):
nn.initializer.Constant())
else:
raise ValueError("Unknown initialization: " + init)
+
+
+def get_activation(act):
+ """Return activation function."""
+
+ activation_funcs = {
+ "hardtanh": paddle.nn.Hardtanh,
+ "tanh": paddle.nn.Tanh,
+ "relu": paddle.nn.ReLU,
+ "selu": paddle.nn.SELU,
+ "swish": Swish,
+ }
+
+ return activation_funcs[act]()
diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py
rename to paddlespeech/t2s/modules/predictor/__init__.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py
rename to paddlespeech/t2s/modules/predictor/duration_predictor.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py
rename to paddlespeech/t2s/modules/predictor/length_regulator.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py
rename to paddlespeech/t2s/modules/predictor/variance_predictor.py
diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py
index 868a73a969edb6d3dc1affe6b0e401a88fb7d11b..8a23e85c61dd2f2cbbd06e281335317575dfc5ff 100644
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@@ -19,7 +19,7 @@ import paddle
from paddle import nn
from typeguard import check_argument_types
-from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
class StyleEncoder(nn.Layer):
diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py
deleted file mode 100644
index e50d58d44bc6663414a7390589d3a8d7ad6f2c5b..0000000000000000000000000000000000000000
--- a/paddlespeech/t2s/modules/transformer.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle import nn
-from paddle.nn import functional as F
-
-from paddlespeech.t2s.modules import attention as attn
-
-__all__ = [
- "PositionwiseFFN",
- "TransformerEncoderLayer",
- "TransformerDecoderLayer",
-]
-
-
-class PositionwiseFFN(nn.Layer):
- """A faithful implementation of Position-wise Feed-Forward Network
- in `Attention is All You Need `_.
- It is basically a 2-layer MLP, with relu actication and dropout in between.
-
- Parameters
- ----------
- input_size: int
- The feature size of the intput. It is also the feature size of the
- output.
- hidden_size: int
- The hidden size.
- dropout: float
- The probability of the Dropout applied to the output of the first
- layer, by default 0.
- """
-
- def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
- super(PositionwiseFFN, self).__init__()
- self.linear1 = nn.Linear(input_size, hidden_size)
- self.linear2 = nn.Linear(hidden_size, input_size)
- self.dropout = nn.Dropout(dropout)
-
- self.input_size = input_size
- self.hidden_szie = hidden_size
-
- def forward(self, x):
- r"""Forward pass of positionwise feed forward network.
-
- Parameters
- ----------
- x : Tensor [shape=(\*, input_size)]
- The input tensor, where ``\*`` means arbitary shape.
-
- Returns
- -------
- Tensor [shape=(\*, input_size)]
- The output tensor.
- """
- l1 = self.dropout(F.relu(self.linear1(x)))
- l2 = self.linear2(l1)
- return l2
-
-
-class TransformerEncoderLayer(nn.Layer):
- """A faithful implementation of Transformer encoder layer in
- `Attention is All You Need `_.
-
- Parameters
- ----------
- d_model :int
- The feature size of the input. It is also the feature size of the
- output.
- n_heads : int
- The number of heads of self attention (a ``MultiheadAttention``
- layer).
- d_ffn : int
- The hidden size of the positional feed forward network (a
- ``PositionwiseFFN`` layer).
- dropout : float, optional
- The probability of the dropout in MultiHeadAttention and
- PositionwiseFFN, by default 0.
-
- Notes
- ------
- It uses the PostLN (post layer norm) scheme.
- """
-
- def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
- super(TransformerEncoderLayer, self).__init__()
- self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
- self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
-
- self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
- self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
-
- self.dropout = dropout
-
- def forward(self, x, mask):
- """Forward pass of TransformerEncoderLayer.
-
- Parameters
- ----------
- x : Tensor [shape=(batch_size, time_steps, d_model)]
- The input.
- mask : Tensor
- The padding mask. The shape is (batch_size, time_steps,
- time_steps) or broadcastable shape.
-
- Returns
- -------
- x :Tensor [shape=(batch_size, time_steps, d_model)]
- The encoded output.
-
- attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
- The attention weights of the self attention.
- """
- context_vector, attn_weights = self.self_mha(x, x, x, mask)
- x = self.layer_norm1(
- F.dropout(x + context_vector, self.dropout, training=self.training))
-
- x = self.layer_norm2(
- F.dropout(x + self.ffn(x), self.dropout, training=self.training))
- return x, attn_weights
-
-
-class TransformerDecoderLayer(nn.Layer):
- """A faithful implementation of Transformer decoder layer in
- `Attention is All You Need `_.
-
- Parameters
- ----------
- d_model :int
- The feature size of the input. It is also the feature size of the
- output.
- n_heads : int
- The number of heads of attentions (``MultiheadAttention``
- layers).
- d_ffn : int
- The hidden size of the positional feed forward network (a
- ``PositionwiseFFN`` layer).
- dropout : float, optional
- The probability of the dropout in MultiHeadAttention and
- PositionwiseFFN, by default 0.
-
- Notes
- ------
- It uses the PostLN (post layer norm) scheme.
- """
-
- def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
- super(TransformerDecoderLayer, self).__init__()
- self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
- self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)
-
- self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
- self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)
-
- self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
- self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)
-
- self.dropout = dropout
-
- def forward(self, q, k, v, encoder_mask, decoder_mask):
- """Forward pass of TransformerEncoderLayer.
-
- Parameters
- ----------
- q : Tensor [shape=(batch_size, time_steps_q, d_model)]
- The decoder input.
- k : Tensor [shape=(batch_size, time_steps_k, d_model)]
- The keys.
- v : Tensor [shape=(batch_size, time_steps_k, d_model)]
- The values
- encoder_mask : Tensor
- Encoder padding mask, shape is ``(batch_size, time_steps_k,
- time_steps_k)`` or broadcastable shape.
- decoder_mask : Tensor
- Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
- or broadcastable shape.
-
- Returns
- --------
- q : Tensor [shape=(batch_size, time_steps_q, d_model)]
- The decoder output.
- self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
- Decoder self attention.
-
- cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
- Decoder-encoder cross attention.
- """
- context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
- q = self.layer_norm1(
- F.dropout(q + context_vector, self.dropout, training=self.training))
-
- context_vector, cross_attn_weights = self.cross_mha(q, k, v,
- encoder_mask)
- q = self.layer_norm2(
- F.dropout(q + context_vector, self.dropout, training=self.training))
-
- q = self.layer_norm3(
- F.dropout(q + self.ffn(q), self.dropout, training=self.training))
- return q, self_attn_weights, cross_attn_weights
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py
rename to paddlespeech/t2s/modules/transformer/__init__.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py
rename to paddlespeech/t2s/modules/transformer/attention.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py
similarity index 94%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py
rename to paddlespeech/t2s/modules/transformer/decoder.py
index 489fda12bc9d5708418ef2b8e3b96ea264f7101e..072fc813737f3963ccfb6536a1e90e033116e7d4 100644
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@@ -23,14 +23,14 @@ import paddle
import paddle.nn.functional as F
from paddle import nn
-from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
-from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution
-from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask
-from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
-from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
from paddlespeech.t2s.modules.layer_norm import LayerNorm
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution
+from paddlespeech.t2s.modules.transformer.mask import subsequent_mask
+from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from paddlespeech.t2s.modules.transformer.repeat import repeat
class Decoder(nn.Layer):
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py
rename to paddlespeech/t2s/modules/transformer/decoder_layer.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py
rename to paddlespeech/t2s/modules/transformer/embedding.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
similarity index 92%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
rename to paddlespeech/t2s/modules/transformer/encoder.py
index f91c76b727e8af153ec82bf70410c3c6cae0f227..f088ac7fad38a2b3fc77b6251cea9dc845ebd813 100644
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -14,13 +14,13 @@
# Modified from espnet(https://github.com/espnet/espnet)
from paddle import nn
-from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
-from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
-from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer
-from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
-from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
-from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
-from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
+from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from paddlespeech.t2s.modules.transformer.repeat import repeat
class Encoder(nn.Layer):
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py
rename to paddlespeech/t2s/modules/transformer/encoder_layer.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
rename to paddlespeech/t2s/modules/transformer/lightconv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py
rename to paddlespeech/t2s/modules/transformer/mask.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py
rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py
rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py
rename to paddlespeech/t2s/modules/transformer/repeat.py
diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..300b35beda72dda735629b525a0f00bb25129e94
--- /dev/null
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+# Conv2dSubsampling: tests passed
+"""Subsampling layer definition."""
+import paddle
+
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+
+
+class TooShortUttError(Exception):
+ """Raised when the utt is too short for subsampling.
+ Parameters
+ ----------
+    message : str
+        Error message.
+    actual_size : int
+        The actual input size, which is too short for the subsampling.
+    limit : int
+        The minimum size required by the subsampling.
+ """
+
+ def __init__(self, message, actual_size, limit):
+ """Construct a TooShortUttError for error handler."""
+ super().__init__(message)
+ self.actual_size = actual_size
+ self.limit = limit
+
+
+def check_short_utt(ins, size):
+ """Check if the utterance is too short for subsampling."""
+ if isinstance(ins, Conv2dSubsampling2) and size < 3:
+ return True, 3
+ if isinstance(ins, Conv2dSubsampling) and size < 7:
+ return True, 7
+ if isinstance(ins, Conv2dSubsampling6) and size < 11:
+ return True, 11
+ if isinstance(ins, Conv2dSubsampling8) and size < 15:
+ return True, 15
+ return False, -1
+
+
+class Conv2dSubsampling(paddle.nn.Layer):
+ """Convolutional 2D subsampling (to 1/4 length).
+ Parameters
+ ----------
+ idim : int
+ Input dimension.
+ odim : int
+ Output dimension.
+ dropout_rate : float
+ Dropout rate.
+ pos_enc : paddle.nn.Layer
+ Custom position encoding layer.
+ """
+
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+ """Construct an Conv2dSubsampling object."""
+ super(Conv2dSubsampling, self).__init__()
+ self.conv = paddle.nn.Sequential(
+ paddle.nn.Conv2D(1, odim, 3, 2),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(odim, odim, 3, 2),
+ paddle.nn.ReLU(), )
+ self.out = paddle.nn.Sequential(
+ paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
+ pos_enc if pos_enc is not None else
+ PositionalEncoding(odim, dropout_rate), )
+
+ def forward(self, x, x_mask):
+ """Subsample x.
+ Parameters
+ ----------
+ x : paddle.Tensor
+ Input tensor (#batch, time, idim).
+ x_mask : paddle.Tensor
+ Input mask (#batch, 1, time).
+ Returns
+ ----------
+ paddle.Tensor
+ Subsampled tensor (#batch, time', odim),
+ where time' = time // 4.
+ paddle.Tensor
+ Subsampled mask (#batch, 1, time'),
+ where time' = time // 4.
+ """
+ # (b, c, t, f)
+ x = x.unsqueeze(1)
+ x = self.conv(x)
+ b, c, t, f = x.shape
+ # x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+ x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+ if x_mask is None:
+ return x, None
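+        # each stride-2 conv shortens the sequence, so the mask is subsampled
+        # in the same way by the two [:, :, :-2:2] slices.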
+ return x, x_mask[:, :, :-2:2][:, :, :-2:2]
+
+ def __getitem__(self, key):
+ """Get item.
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
+ return the positioning encoding.
+ """
+ if key != -1:
+ raise NotImplementedError(
+ "Support only `-1` (for `reset_parameters`).")
+ return self.out[key]
+
+
+class Conv2dSubsampling2(paddle.nn.Layer):
+ """Convolutional 2D subsampling (to 1/2 length).
+ Parameters
+ ----------
+ idim : int
+ Input dimension.
+ odim : int
+ Output dimension.
+ dropout_rate : float
+ Dropout rate.
+ pos_enc : paddle.nn.Layer
+ Custom position encoding layer.
+ """
+
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+ """Construct an Conv2dSubsampling2 object."""
+ super(Conv2dSubsampling2, self).__init__()
+ self.conv = paddle.nn.Sequential(
+ paddle.nn.Conv2D(1, odim, 3, 2),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(odim, odim, 3, 1),
+ paddle.nn.ReLU(), )
+ self.out = paddle.nn.Sequential(
+ paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
+ pos_enc if pos_enc is not None else
+ PositionalEncoding(odim, dropout_rate), )
+
+ def forward(self, x, x_mask):
+ """Subsample x.
+ Parameters
+ ----------
+ x : paddle.Tensor
+ Input tensor (#batch, time, idim).
+ x_mask : paddle.Tensor
+ Input mask (#batch, 1, time).
+ Returns
+ ----------
+ paddle.Tensor
+            Subsampled tensor (#batch, time', odim),
+ where time' = time // 2.
+ paddle.Tensor
+ Subsampled mask (#batch, 1, time'),
+ where time' = time // 2.
+ """
+ # (b, c, t, f)
+ x = x.unsqueeze(1)
+ x = self.conv(x)
+ b, c, t, f = x.shape
+ x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+ if x_mask is None:
+ return x, None
+ return x, x_mask[:, :, :-2:2][:, :, :-2:1]
+
+ def __getitem__(self, key):
+ """Get item.
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
+ return the positioning encoding.
+ """
+ if key != -1:
+ raise NotImplementedError(
+ "Support only `-1` (for `reset_parameters`).")
+ return self.out[key]
+
+
+class Conv2dSubsampling6(paddle.nn.Layer):
+ """Convolutional 2D subsampling (to 1/6 length).
+ Parameters
+ ----------
+ idim : int
+ Input dimension.
+ odim : int
+ Output dimension.
+ dropout_rate : float
+ Dropout rate.
+ pos_enc : paddle.nn.Layer
+ Custom position encoding layer.
+ """
+
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+ """Construct an Conv2dSubsampling6 object."""
+ super(Conv2dSubsampling6, self).__init__()
+ self.conv = paddle.nn.Sequential(
+ paddle.nn.Conv2D(1, odim, 3, 2),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(odim, odim, 5, 3),
+ paddle.nn.ReLU(), )
+ self.out = paddle.nn.Sequential(
+ paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
+ pos_enc if pos_enc is not None else
+ PositionalEncoding(odim, dropout_rate), )
+
+ def forward(self, x, x_mask):
+ """Subsample x.
+ Parameters
+ ----------
+ x : paddle.Tensor
+ Input tensor (#batch, time, idim).
+        x_mask : paddle.Tensor
+ Input mask (#batch, 1, time).
+ Returns
+ ----------
+ paddle.Tensor
+ Subsampled tensor (#batch, time', odim),
+ where time' = time // 6.
+ paddle.Tensor
+ Subsampled mask (#batch, 1, time'),
+ where time' = time // 6.
+ """
+ # (b, c, t, f)
+ x = x.unsqueeze(1)
+ x = self.conv(x)
+ b, c, t, f = x.shape
+ x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+ if x_mask is None:
+ return x, None
+ return x, x_mask[:, :, :-2:2][:, :, :-4:3]
+
+
+class Conv2dSubsampling8(paddle.nn.Layer):
+ """Convolutional 2D subsampling (to 1/8 length).
+ Parameters
+ ----------
+ idim : int
+ Input dimension.
+ odim : int
+ Output dimension.
+ dropout_rate : float
+ Dropout rate.
+ pos_enc : paddle.nn.Layer
+ Custom position encoding layer.
+ """
+
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+ """Construct an Conv2dSubsampling8 object."""
+ super(Conv2dSubsampling8, self).__init__()
+ self.conv = paddle.nn.Sequential(
+ paddle.nn.Conv2D(1, odim, 3, 2),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(odim, odim, 3, 2),
+ paddle.nn.ReLU(),
+ paddle.nn.Conv2D(odim, odim, 3, 2),
+ paddle.nn.ReLU(), )
+ self.out = paddle.nn.Sequential(
+ paddle.nn.Linear(odim * (((
+ (idim - 1) // 2 - 1) // 2 - 1) // 2), odim),
+ pos_enc if pos_enc is not None else
+ PositionalEncoding(odim, dropout_rate), )
+
+ def forward(self, x, x_mask):
+ """Subsample x.
+ Parameters
+ ----------
+ x : paddle.Tensor
+ Input tensor (#batch, time, idim).
+ x_mask : paddle.Tensor
+ Input mask (#batch, 1, time).
+ Returns
+ ----------
+ paddle.Tensor
+ Subsampled tensor (#batch, time', odim),
+ where time' = time // 8.
+ paddle.Tensor
+ Subsampled mask (#batch, 1, time'),
+ where time' = time // 8.
+ """
+ # (b, c, t, f)
+ x = x.unsqueeze(1)
+ x = self.conv(x)
+ b, c, t, f = x.shape
+ x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+ if x_mask is None:
+ return x, None
+ return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
diff --git a/requirements.txt b/requirements.txt
index 2b34d36bdb467e0286c5d1e87d2f1383a9356f98..8e2552e7059e24ebbedb1ef2b67530e6780eb0cb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,7 +28,7 @@ python-dateutil
pyworld
resampy==0.2.2
sacrebleu
-scipy==1.2.1
+scipy
sentencepiece
snakeviz
soundfile~=0.10
@@ -44,3 +44,9 @@ visualdl==2.2.0
webrtcvad
yacs
yq
+pypi-kenlm
+GPUtil
+psutil
+pynvml
+distro
+
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0bfacb548bfa6eb61bcb506c1fbc0a5acc185577
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,20 @@
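+#!/bin/bash
+# NOTE: run this script from the repository root; all paths below are relative to it.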
+# Install conda dependencies
+conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes
+
+# Install the python lib
+pip install -r requirements.txt
+
+# Install the auto_log
+pushd tools/extras
+bash install_autolog.sh
+popd
+
+# Install the ctcdecoder
+pushd paddlespeech/s2t/decoders/ctcdecoder/swig
+bash -e setup.sh
+popd
+
+# Install the python_speech_features
+pushd third_party
+bash -e install.sh
+popd
diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md
index 71d5f91b8f283fe65afed2cfdf54eb7691e56ed8..22e0009d4445820a9ca6a226a1978ac065d698a9 100644
--- a/tests/benchmark/conformer/README.md
+++ b/tests/benchmark/conformer/README.md
@@ -43,16 +43,6 @@ bash prepare.sh
bash run.sh
```
-### Analyse the sp
-```
-bash run_analysis_sp.sh
-```
-
-### Analyse the mp
-```
-bash run_analysis_mp.sh
-```
-
### The log
```
{"log_file": "recoder_sp_bs16_fp32_ngpu1.txt",
diff --git a/tests/benchmark/conformer/analysis.py b/tests/benchmark/conformer/analysis.py
deleted file mode 100644
index 610791c8cf11640a4d1142441cd1d349cf8b3be1..0000000000000000000000000000000000000000
--- a/tests/benchmark/conformer/analysis.py
+++ /dev/null
@@ -1,345 +0,0 @@
-# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import argparse
-import json
-import re
-import traceback
-
-
-def parse_args():
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "--filename", type=str, help="The name of log which need to analysis.")
- parser.add_argument(
- "--log_with_profiler",
- type=str,
- help="The path of train log with profiler")
- parser.add_argument(
- "--profiler_path", type=str, help="The path of profiler timeline log.")
- parser.add_argument(
- "--keyword", type=str, help="Keyword to specify analysis data")
- parser.add_argument(
- "--separator",
- type=str,
- default=None,
- help="Separator of different field in log")
- parser.add_argument(
- '--position', type=int, default=None, help='The position of data field')
- parser.add_argument(
- '--range',
- type=str,
- default="",
- help='The range of data field to intercept')
- parser.add_argument(
- '--base_batch_size', type=int, help='base_batch size on gpu')
- parser.add_argument(
- '--skip_steps',
- type=int,
- default=0,
- help='The number of steps to be skipped')
- parser.add_argument(
- '--model_mode',
- type=int,
- default=-1,
- help='Analysis mode, default value is -1')
- parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit')
- parser.add_argument(
- '--model_name',
- type=str,
- default=0,
- help='training model_name, transformer_base')
- parser.add_argument(
- '--mission_name', type=str, default=0, help='training mission name')
- parser.add_argument(
- '--direction_id', type=int, default=0, help='training direction_id')
- parser.add_argument(
- '--run_mode',
- type=str,
- default="sp",
- help='multi process or single process')
- parser.add_argument(
- '--index',
- type=int,
- default=1,
- help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}')
- parser.add_argument(
- '--gpu_num', type=int, default=1, help='nums of training gpus')
- parser.add_argument(
- '--use_num', type=int, default=1, help='nums of used recoders')
- args = parser.parse_args()
- args.separator = None if args.separator == "None" else args.separator
- return args
-
-
-def _is_number(num):
- pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
- result = pattern.match(num)
- if result:
- return True
- else:
- return False
-
-
-class TimeAnalyzer(object):
- def __init__(self,
- filename,
- keyword=None,
- separator=None,
- position=None,
- range="-1"):
- if filename is None:
- raise Exception("Please specify the filename!")
-
- if keyword is None:
- raise Exception("Please specify the keyword!")
-
- self.filename = filename
- self.keyword = keyword
- self.separator = separator
- self.position = position
- self.range = range
- self.records = None
- self._distil()
-
- def _distil(self):
- self.records = []
- with open(self.filename, "r") as f_object:
- lines = f_object.readlines()
- for line in lines:
- if self.keyword not in line:
- continue
- try:
- result = None
-
- # Distil the string from a line.
- line = line.strip()
- line_words = line.split(
- self.separator) if self.separator else line.split()
- print("line_words", line_words)
- if args.position:
- result = line_words[self.position]
- else:
- # Distil the string following the keyword.
- for i in range(len(line_words) - 1):
- if line_words[i] == self.keyword:
- result = line_words[i + 1]
- break
-
- # Distil the result from the picked string.
- if not self.range:
- result = result[0:]
- elif _is_number(self.range):
- result = result[0:int(self.range)]
- else:
- result = result[int(self.range.split(":")[0]):int(
- self.range.split(":")[1])]
- self.records.append(float(result))
- except Exception as exc:
- pass
- #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position))
- self.records.sort()
- self.records = self.records[:args.use_num]
- print("records", self.records)
- print("Extract {} records: separator={}; position={}".format(
- len(self.records), self.separator, self.position))
-
- def _get_fps(self,
- mode,
- batch_size,
- gpu_num,
- avg_of_records,
- run_mode,
- unit=None):
- if mode == -1 and run_mode == 'sp':
- assert unit, "Please set the unit when mode is -1."
- fps = gpu_num * avg_of_records
- elif mode == -1 and run_mode == 'mp':
- assert unit, "Please set the unit when mode is -1."
- fps = gpu_num * avg_of_records #temporarily, not used now
- print("------------this is mp")
- elif mode == 0:
- # s/step -> samples/s
- fps = (batch_size * gpu_num) / avg_of_records
- unit = "samples/s"
- elif mode == 1:
- # steps/s -> steps/s
- fps = avg_of_records
- unit = "steps/s"
- elif mode == 2:
- # s/step -> steps/s
- fps = 1 / avg_of_records
- unit = "steps/s"
- elif mode == 3:
- # steps/s -> samples/s
- fps = batch_size * gpu_num * avg_of_records
- unit = "samples/s"
- elif mode == 4:
- # s/epoch -> s/epoch
- fps = avg_of_records
- unit = "s/epoch"
- else:
- ValueError("Unsupported analysis mode.")
-
- return fps, unit
-
- def analysis(self,
- batch_size,
- gpu_num=1,
- skip_steps=0,
- mode=-1,
- run_mode='sp',
- unit=None):
- if batch_size <= 0:
- print("base_batch_size should larger than 0.")
- return 0, ''
-
- if len(
- self.records
- ) <= skip_steps: # to address the condition which item of log equals to skip_steps
- print("no records")
- return 0, ''
-
- sum_of_records = 0
- sum_of_records_skipped = 0
- skip_min = self.records[skip_steps]
- skip_max = self.records[skip_steps]
-
- count = len(self.records)
- for i in range(count):
- sum_of_records += self.records[i]
- if i >= skip_steps:
- sum_of_records_skipped += self.records[i]
- if self.records[i] < skip_min:
- skip_min = self.records[i]
- if self.records[i] > skip_max:
- skip_max = self.records[i]
-
- avg_of_records = sum_of_records / float(count)
- avg_of_records_skipped = sum_of_records_skipped / float(count -
- skip_steps)
-
- fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records,
- run_mode, unit)
- fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num,
- avg_of_records_skipped, run_mode, unit)
- if mode == -1:
- print("average ips of %d steps, skip 0 step:" % count)
- print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
- print("\tFPS: %.3f %s" % (fps, fps_unit))
- if skip_steps > 0:
- print("average ips of %d steps, skip %d steps:" %
- (count, skip_steps))
- print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
- print("\tMin: %.3f %s" % (skip_min, fps_unit))
- print("\tMax: %.3f %s" % (skip_max, fps_unit))
- print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
- elif mode == 1 or mode == 3:
- print("average latency of %d steps, skip 0 step:" % count)
- print("\tAvg: %.3f steps/s" % avg_of_records)
- print("\tFPS: %.3f %s" % (fps, fps_unit))
- if skip_steps > 0:
- print("average latency of %d steps, skip %d steps:" %
- (count, skip_steps))
- print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
- print("\tMin: %.3f steps/s" % skip_min)
- print("\tMax: %.3f steps/s" % skip_max)
- print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
- elif mode == 0 or mode == 2:
- print("average latency of %d steps, skip 0 step:" % count)
- print("\tAvg: %.3f s/step" % avg_of_records)
- print("\tFPS: %.3f %s" % (fps, fps_unit))
- if skip_steps > 0:
- print("average latency of %d steps, skip %d steps:" %
- (count, skip_steps))
- print("\tAvg: %.3f s/step" % avg_of_records_skipped)
- print("\tMin: %.3f s/step" % skip_min)
- print("\tMax: %.3f s/step" % skip_max)
- print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
-
- return round(fps_skipped, 3), fps_unit
-
-
-if __name__ == "__main__":
- args = parse_args()
- run_info = dict()
- run_info["log_file"] = args.filename
- run_info["model_name"] = args.model_name
- run_info["mission_name"] = args.mission_name
- run_info["direction_id"] = args.direction_id
- run_info["run_mode"] = args.run_mode
- run_info["index"] = args.index
- run_info["gpu_num"] = args.gpu_num
- run_info["FINAL_RESULT"] = 0
- run_info["JOB_FAIL_FLAG"] = 0
-
- try:
- if args.index == 1:
- if args.gpu_num == 1:
- run_info["log_with_profiler"] = args.log_with_profiler
- run_info["profiler_path"] = args.profiler_path
- analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator,
- args.position, args.range)
- run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
- batch_size=args.base_batch_size,
- gpu_num=args.gpu_num,
- skip_steps=args.skip_steps,
- mode=args.model_mode,
- run_mode=args.run_mode,
- unit=args.ips_unit)
- # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0:
- # run_info["JOB_FAIL_FLAG"] = 1
- elif args.index == 3:
- run_info["FINAL_RESULT"] = {}
- records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead',
- None, 3, '').records
- records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead',
- None, 5).records
- records_ct_total = TimeAnalyzer(args.filename, 'Computation time',
- None, 3, '').records
- records_gm_total = TimeAnalyzer(args.filename,
- 'GpuMemcpy Calls',
- None, 4, '').records
- records_gm_ratio = TimeAnalyzer(args.filename,
- 'GpuMemcpy Calls',
- None, 6).records
- records_gmas_total = TimeAnalyzer(args.filename,
- 'GpuMemcpyAsync Calls',
- None, 4, '').records
- records_gms_total = TimeAnalyzer(args.filename,
- 'GpuMemcpySync Calls',
- None, 4, '').records
- run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[
- 0] if records_fo_total else 0
- run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[
- 0] if records_fo_ratio else 0
- run_info["FINAL_RESULT"][
- "ComputationTime_Total"] = records_ct_total[
- 0] if records_ct_total else 0
- run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[
- 0] if records_gm_total else 0
- run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[
- 0] if records_gm_ratio else 0
- run_info["FINAL_RESULT"][
- "GpuMemcpyAsync_Total"] = records_gmas_total[
- 0] if records_gmas_total else 0
- run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[
- 0] if records_gms_total else 0
- else:
- print("Not support!")
- except Exception:
- traceback.print_exc()
- print("{}".format(json.dumps(run_info))
- ) # it's required, for the log file path insert to the database
diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh
index 8f03fd1b988fb458a681d7c8612416cf9ef65895..c5fae06a59d41147c9aaa89f3074914e7ea9906f 100644
--- a/tests/benchmark/conformer/prepare.sh
+++ b/tests/benchmark/conformer/prepare.sh
@@ -1,5 +1,6 @@
-source ../../../tools/venv/bin/activate
-
+cd ../../../
+pip install -e . # install paddlespeech
+cd -
#Enter the example dir
pushd ../../../examples/aishell/s1
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index c09bbf09b0547f3f0214f85a437dae23f764df98..79beb4e961fc01d7b1d5a80e81d94289057c0398 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -1,8 +1,12 @@
# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
# 执行目录:需说明
-CUR_DIR=${PWD}
-source ../../../tools/venv/bin/activate
+CUR_DIR=${PWD} # PaddleSpeech/tests/benchmark/conformer
+cd ../../../
+log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # set by the benchmark system; when profiling is not run, log_path points to the directory that stores the speed logs
+cd ${CUR_DIR}
+sed -i '/set\ -xe/d' run_benchmark.sh
+
#cd **
pushd ../../../examples/aishell/s1
# 1 安装该模型需要的依赖 (如需开启优化策略请注明)
@@ -11,26 +15,33 @@ pushd ../../../examples/aishell/s1
source path.sh
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
+mkdir -p conf/benchmark
+#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml
+cp conf/conformer.yaml conf/benchmark/conformer.yaml
+sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
fp_item_list=(fp32)
bs_item=(16 30)
-config_path=conf/conformer.yaml
+config_path=conf/benchmark/conformer.yaml
seed=0
output=exp/conformer
profiler_options=None
+model_item=conformer
for fp_item in ${fp_item_list[@]}; do
- for batch_size in ${bs_item[@]}
+ for bs_item in ${bs_item[@]}
do
rm exp -rf
+        log_name=speech_${model_item}_bs${bs_item}_${fp_item} # e.g. clas_MobileNetv1_mp_bs32_fp32_8
echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
run_mode=mp
ngpu=8
- CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
- rm exp -rf
- echo "index is speed, 1gpus, begin, conformer"
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
+ sleep 60
+        log_name=speech_${model_item}_bs${bs_item}_${fp_item} # e.g. clas_MobileNetv1_mp_bs32_fp32_8
+ echo "index is speed, 1gpus, begin, ${log_name}"
run_mode=sp
ngpu=1
- CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
+ CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
+ sleep 60
done
done
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index c03a08f3b000e4d81649a392124ee1dbb445dace..56b63e76b1f23abf8f36c237dcd2232e20792d39 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -12,17 +12,24 @@ function _set_params(){
profiler_options=${6:-"None"}
batch_size=${7:-"32"}
fp_item=${8:-"fp32"}
- TRAIN_LOG_DIR=${9:-$(pwd)}
-
+ model_item=${9:-"conformer"}
benchmark_max_step=0
-
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
+# parameters required for log parsing
+ base_batch_size=${batch_size}
+ mission_name="语音识别"
+ direction_id="1"
+ ips_unit="sent./sec"
+    skip_steps=10 # for log parsing: the first few steps of some models are slow and must be skipped (required)
+    keyword="ips:" # for log parsing: keyword that marks the lines containing the data (required)
+ index="1"
+ model_name=${model_item}_bs${batch_size}_${fp_item}
# 以下不用修改
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
- log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt
+ log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}
}
function _train(){
@@ -36,11 +43,9 @@ function _train(){
--benchmark-batch-size ${batch_size}
--benchmark-max-step ${benchmark_max_step} "
- echo "run_mode "${run_mode}
-
case ${run_mode} in
- sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
- mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
+ sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
+ mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
echo ${train_cmd}
@@ -61,5 +66,8 @@ function _train(){
fi
}
+source ${BENCHMARK_ROOT}/scripts/run_model.sh # run_model.sh parses performance data from benchmark-compliant logs with analysis.py; for joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; if you only want to produce training logs without joint debugging you may comment this line out, but it must be enabled when submitting
_set_params $@
-_train
+# _train # uncomment if you only want to produce training logs without parsing them
+_run # _run is defined in run_model.sh and calls _train internally; if you only want training logs without joint debugging you may comment this line out, but it must be enabled when submitting
+
diff --git a/tools/extras/install_miniconda.sh b/tools/extras/install_miniconda.sh
index 3d1909af6f4f8a23e261e8983bf9ee6d1275cb4f..c6ee4b361ca7733d46ecb9b6d3b260199c190203 100755
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
@@ -13,6 +13,8 @@ else
fi
bash Miniconda3-latest-Linux-x86_64.sh -b
+$HOME/miniconda3/bin/conda init
+
$HOME/miniconda3/bin/python -m pip install --user tqdm
$HOME/miniconda3/bin/python -m pip install --user scikit-learn
$HOME/miniconda3/bin/python -m pip install --user librosa