diff --git a/README.md b/README.md index 66feb0982025fce8caf819fddca27b0d81598d7a..2f9d992895309f28ccfabc5d0bf83dfa94aaa443 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ If you want to try more functions like training and tuning, please see [Speech-t ## Model List -PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_models.md) with available pretrained models. +PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models. Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details: @@ -344,4 +344,4 @@ year={2021} PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). -PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. \ No newline at end of file +PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index db15b7ef3a6c9baa8bdbf92ba3803c170a134932..5b8ce35139aea0edb084cd3b1d33b702b27d2628 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -13,7 +13,6 @@ # limitations under the License. import argparse from pathlib import Path -from typing import Union import numpy as np import paddle @@ -23,129 +22,12 @@ from yacs.config import CfgNode from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore -class StyleFastSpeech2Inference(FastSpeech2Inference): - def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path): - super().__init__(normalizer, model) - pitch_mean, pitch_std = np.load(pitch_stats_path) - self.pitch_mean = paddle.to_tensor(pitch_mean) - self.pitch_std = paddle.to_tensor(pitch_std) - energy_mean, energy_std = np.load(energy_stats_path) - self.energy_mean = paddle.to_tensor(energy_mean) - self.energy_std = paddle.to_tensor(energy_std) - - def denorm(self, data, mean, std): - return data * std + mean - - def norm(self, data, mean, std): - return (data - mean) / std - - def forward(self, - text: paddle.Tensor, - durations: Union[paddle.Tensor, np.ndarray]=None, - durations_scale: Union[int, float]=None, - durations_bias: Union[int, float]=None, - pitch: Union[paddle.Tensor, np.ndarray]=None, - pitch_scale: Union[int, float]=None, - pitch_bias: Union[int, float]=None, - energy: Union[paddle.Tensor, np.ndarray]=None, - energy_scale: Union[int, float]=None, - energy_bias: Union[int, float]=None, - robot: bool=False): - """ - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). 
- durations : paddle.Tensor/np.ndarray, optional (int64) - Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias - durations_scale: int/float, optional - durations_bias: int/float, optional - pitch : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale: int/float, optional - In denormed HZ domain. - pitch_bias: int/float, optional - In denormed HZ domain. - energy : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale: int/float, optional - In denormed domain. - energy_bias: int/float, optional - In denormed domain. - robot : bool, optional - Weather output robot style - Returns - ---------- - Tensor - Output sequence of features (L, odim). - """ - normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, durations=None, pitch=None, energy=None) - # priority: groundtruth > scale/bias > previous output - # set durations - if isinstance(durations, np.ndarray): - durations = paddle.to_tensor(durations) - elif isinstance(durations, paddle.Tensor): - durations = durations - elif durations_scale or durations_bias: - durations_scale = durations_scale if durations_scale is not None else 1 - durations_bias = durations_bias if durations_bias is not None else 0 - durations = durations_scale * d_outs + durations_bias - else: - durations = d_outs - - if robot: - # set normed pitch to zeros have the same effect with set denormd ones to mean - pitch = paddle.zeros(p_outs.shape) - - # set pitch, can overwrite robot set - if isinstance(pitch, np.ndarray): - pitch = paddle.to_tensor(pitch) - elif isinstance(pitch, paddle.Tensor): - pitch = pitch - elif pitch_scale or pitch_bias: - pitch_scale = pitch_scale if pitch_scale is not None else 1 - pitch_bias = pitch_bias if pitch_bias is not None else 0 - p_Hz = paddle.exp( - self.denorm(p_outs, self.pitch_mean, self.pitch_std)) - p_HZ = pitch_scale * p_Hz + pitch_bias - pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) - else: - pitch = p_outs - - # set energy - if isinstance(energy, np.ndarray): - energy = paddle.to_tensor(energy) - elif isinstance(energy, paddle.Tensor): - energy = energy - elif energy_scale or energy_bias: - energy_scale = energy_scale if energy_scale is not None else 1 - energy_bias = energy_bias if energy_bias is not None else 0 - e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) - e_dnorm = energy_scale * e_dnorm + energy_bias - energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) - else: - energy = e_outs - - normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, - durations=durations, - pitch=pitch, - energy=energy, - use_teacher_forcing=True) - - logmel = self.normalizer.inverse(normalized_mel) - return logmel - - def evaluate(args, fastspeech2_config, pwg_config): # construct dataset for evaluation diff --git a/docs/source/index.rst b/docs/source/index.rst index 53e5d15df5baaf307a3d0c24fce608af0d34a5e2..ea2599abe49e06c2b652488d073bda45a3a3b80e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,7 +23,7 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Speech-To-Text + :caption: Speech-to-Text asr/models_introduction asr/data_preparation @@ -33,7 +33,7 @@ Contents .. 
toctree:: :maxdepth: 1 - :caption: Text-To-Speech + :caption: Text-to-Speech tts/basic_usage tts/advanced_usage diff --git a/docs/source/install.md b/docs/source/install.md index 0700a1667831bee9b303d5590388a5c9a49c0446..d68b990d2a6e3c4b9096808d7712c7792144960e 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -16,6 +16,22 @@ cd DeepSpeech pip install -e . ``` +For users who only need the basic functions of paddlespeech, installing with conda is recommended. +You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or you can use the script below to install the latest miniconda version. + +```bash +pushd tools +bash extras/install_miniconda.sh +popd +bash +``` + +After installing conda, run setup.sh to complete the installation. +```bash +bash setup.sh +``` + + ## Setup (Other Platform) - Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`. diff --git a/docs/source/introduction.md b/docs/source/introduction.md index e7dd2892afe37c1391c04ca2bc9a410ea7754756..e3fc8b9ea9e1c2d9b6d80e8ea6edb1c6dbbf1385 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -1,11 +1,11 @@ # PaddleSpeech ## What is PaddleSpeech? -PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models. +PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models. ## What can PaddleSpeech do? -### Speech-To-Text +### Speech-to-Text PaddleSpeech ASR mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including: - attention decoding (used in Transformer and Conformer) - attention rescoring (used in Transformer and Conformer) -Speech-To-Text helps you training the ASR model very simply. +Speech-to-Text helps you train ASR models with simple commands. -### Text-To-Speech +### Text-to-Speech TTS mainly consists of components below: - Implementation of models and commonly used neural network layers. - Dataset abstraction and common data preprocessing pipelines. @@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including: - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis - GE2E -Text-To-Speech helps you to train TTS models with simple commands. +Text-to-Speech helps you train TTS models with simple commands.
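A quick sanity check after the conda-based setup above (an editor's suggestion rather than part of the added docs; `paddle.utils.run_check()` is PaddlePaddle's built-in installation check) could be:

```python
# confirm that PaddlePaddle is functional and that the paddlespeech package is importable
import paddle
import paddlespeech  # installed by `pip install -e .` / setup.sh

paddle.utils.run_check()
print(paddle.__version__)
```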
diff --git a/docs/source/released_model.md b/docs/source/released_model.md index bb03689c7e3f1712af5f4d0d47c328206765d770..a7c6a036b455410cc7d88947ef6c99d7b867924c 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,7 +1,7 @@ # Released Models -## Speech-To-Text Models +## Speech-to-Text Models ### Acoustic Model Released in paddle 2.X Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- @@ -27,7 +27,7 @@ Language Model | Training Data | Token-based | Size | Descriptions [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings -## Text-To-Speech Models +## Text-to-Speech Models ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e02f3e220147e4ca78fffc1e564efa4c968c9089 --- /dev/null +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -0,0 +1,139 @@ +# This is the hyperparameter configuration file for MelGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V. + +# This configuration is based on full-band MelGAN but the hop size and sampling +# rate is different from the paper (16kHz vs 24kHz). The number of iteraions +# is not shown in the paper so currently we train 1M iterations (not sure enough +# to converge). The optimizer setting is based on @dathudeptrai advice. +# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size. (in samples) +n_shift: 300 # Hop size. (in samples) +win_length: 1200 # Window length. (in samples) + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 4 # Number of output channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + channels: 384 # Initial number of channels for conv layers. + upsample_scales: [5, 5, 3] # List of Upsampling scales. + stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. + stacks: 4 # Number of stacks in a single residual stack module. + use_weight_norm: True # Whether to use weight normalization. + use_causal_conv: False # Whether to use causal convolution. + use_final_nonlinear_activation: True + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + scales: 3 # Number of multi-scales. + downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling. + downsample_pooling_params: # Parameters of the above pooling function. + kernel_size: 4 + stride: 2 + padding: 1 + exclusive: True + kernel_sizes: [5, 3] # List of kernel size. + channels: 16 # Number of channels of the initial conv layer. + max_downsample_channels: 512 # Maximum number of channels of downsampling layers. + downsample_scales: [4, 4, 4] # List of downsampling scales. + nonlinear_activation: "LeakyReLU" # Nonlinear activation function. + nonlinear_activation_params: # Parameters of nonlinear activation function. 
+ negative_slope: 0.2 + use_weight_norm: True # Whether to use weight norm. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: true +stft_loss_params: + fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. + hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss + win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss +use_subband_stft_loss: true +subband_stft_loss_params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. + window: "hann" # Window function for STFT-based loss + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +use_feat_match_loss: false # Whether to use feature matching loss. +lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 64 # Batch size. +batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + epsilon: 1.0e-7 # Generator's epsilon. + weight_decay: 0.0 # Generator's weight decay coefficient. + +generator_grad_norm: -1 # Generator's gradient norm. +generator_scheduler_params: + learning_rate: 1.0e-3 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 +discriminator_optimizer_params: + epsilon: 1.0e-7 # Discriminator's epsilon. + weight_decay: 0.0 # Discriminator's weight decay coefficient. + +discriminator_grad_norm: -1 # Discriminator's gradient norm. +discriminator_scheduler_params: + learning_rate: 1.0e-3 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 100000 + - 200000 + - 300000 + - 400000 + - 500000 + - 600000 + +########################################################### +# INTERVAL SETTING # +########################################################### +discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator. +train_max_steps: 1200000 # Number of training steps. +save_interval_steps: 1000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. 
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 100755 index 0000000000000000000000000000000000000000..42e5a39796acdb46a8104876d8c4086b61866fdb --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ + --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --dur-file=durations.txt \ + --output-dir=dump_finetune \ + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 local/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune + +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi \ No newline at end of file diff --git a/examples/csmsc/voc3/local/link_wav.py b/examples/csmsc/voc3/local/link_wav.py new file mode 100644 index 0000000000000000000000000000000000000000..c81e0d4b83320665b98720d09a940e9de6dc63cd --- /dev/null +++ b/examples/csmsc/voc3/local/link_wav.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import os +from operator import itemgetter +from pathlib import Path + +import jsonlines +import numpy as np + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features .") + + parser.add_argument( + "--old-dump-dir", + default=None, + type=str, + help="directory to dump feature files.") + parser.add_argument( + "--dump-dir", + type=str, + required=True, + help="directory to finetune dump feature files.") + args = parser.parse_args() + + old_dump_dir = Path(args.old_dump_dir).expanduser() + old_dump_dir = old_dump_dir.resolve() + dump_dir = Path(args.dump_dir).expanduser() + # use absolute path + dump_dir = dump_dir.resolve() + dump_dir.mkdir(parents=True, exist_ok=True) + + assert old_dump_dir.is_dir() + assert dump_dir.is_dir() + + for sub in ["train", "dev", "test"]: + # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置 + output_dir = dump_dir / sub + output_dir.mkdir(parents=True, exist_ok=True) + results = [] + for name in os.listdir(output_dir / "raw"): + # 003918_feats.npy + utt_id = name.split("_")[0] + mel_path = output_dir / ("raw/" + name) + gen_mel = np.load(mel_path) + wave_name = utt_id + "_wave.npy" + wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) + os.symlink(old_dump_dir / sub / ("raw/" + wave_name), + output_dir / ("raw/" + wave_name)) + num_sample = wav.shape[0] + num_frames = gen_mel.shape[0] + wav_path = output_dir / ("raw/" + wave_name) + + record = { + "utt_id": utt_id, + "num_samples": num_sample, + "num_frames": num_frames, + "feats": str(mel_path), + "wave": str(wav_path), + } + results.append(record) + + results.sort(key=itemgetter("utt_id")) + + with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + + +if __name__ == "__main__": + main() diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index b2babca7bcde8b6e68480a5dddeb4950d49159ec..d77329f50843e270b52750ef5dcc2e9429bd8617 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,36 +1,6 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/augmentation.json - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - # network architecture model: - cmvn_file: - cmvn_file_type: "json" # encoder related encoder: transformer encoder_conf: @@ -63,6 +33,33 @@ model: lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test-clean + +collator: + vocab_filepath: data/lang_char/train_960_unigram5000_units.txt + unit_type: spm + spm_model_prefix: 
data/lang_char/train_960_unigram5000 + feat_dim: 83 + stride_ms: 10.0 + window_ms: 25.0 + sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs + batch_size: 30 + maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced + maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced + minibatches: 0 # for debug + batch_count: auto + batch_bins: 0 + batch_frames_in: 0 + batch_frames_out: 0 + batch_frames_inout: 0 + augmentation_config: conf/augmentation.json + num_workers: 0 + subsampling_factor: 1 + num_encs: 1 + training: n_epoch: 120 diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index 2de4fb124e1e50a7c5481366c8cec675922d8a98..2e4f740fb6f048dd91a1e799d598261a88a6419c 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -110,10 +110,10 @@ class Clip(object): if len(x) < c.shape[0] * self.hop_size: x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge") elif len(x) > c.shape[0] * self.hop_size: - print( - f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" - ) - x = x[:c.shape[1] * self.hop_size] + # print( + # f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })" + # ) + x = x[:c.shape[0] * self.hop_size] # check the legnth is valid assert len(x) == c.shape[ diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py new file mode 100644 index 0000000000000000000000000000000000000000..8a9ef370c0f2916149b62c50d2425e969b49a5cb --- /dev/null +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -0,0 +1,167 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? 
+import argparse +from pathlib import Path + +import numpy as np +import paddle +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, fastspeech2_config): + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + odim = fastspeech2_config.n_mels + model = FastSpeech2( + idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + + model.set_state_dict( + paddle.load(args.fastspeech2_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.fastspeech2_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + fastspeech2_normalizer = ZScore(mu, std) + + fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer, + model) + fastspeech2_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + for i, utt_id in enumerate(sentences): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # trim the leading and trailing sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + # sentences[utt_id][0] = phones + # sentences[utt_id][1] = durations + + phone_ids = [phone_dict[phn] for phn in phones] + phone_ids = paddle.to_tensor(np.array(phone_ids)) + durations = paddle.to_tensor(np.array(durations)) + # the generated mel may differ from the ground truth by 1 or 2 frames, but batch_fn will fix it + # split data into 3 sections + if args.dataset == "baker": + num_train = 9800 + num_dev = 100 + if i in range(0, num_train): + sub_output_dir = output_dir / ("train/raw") + elif i in range(num_train, num_train + num_dev): + sub_output_dir = output_dir / ("dev/raw") + else: + sub_output_dir = output_dir / ("test/raw") + sub_output_dir.mkdir(parents=True, exist_ok=True) + with paddle.no_grad(): + mel = fastspeech2_inference(phone_ids, durations=durations) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config + parser = argparse.ArgumentParser( + description="Generate ground-truth aligned (GTA) mels with FastSpeech2.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should be in {baker, ljspeech, vctk} now") + parser.add_argument( + "--fastspeech2-config", type=str, help="fastspeech2 config file.") + parser.add_argument( + "--fastspeech2-checkpoint", + type=str, + help="fastspeech2 checkpoint to load.") + parser.add_argument( + "--fastspeech2-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + + evaluate(args, fastspeech2_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 2202d156e85731919c3b44fe8c498230dded740c..2e52c10376e41a6cca508b7be2a6dea1ca4a2943 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -16,23 +16,25 @@ from typing import Dict from typing import Sequence from typing import Tuple +from typing import Union +import numpy as np import paddle import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder class FastSpeech2(nn.Layer): @@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer): return logmel +class StyleFastSpeech2Inference(FastSpeech2Inference): + def __init__(self, + normalizer, + model, + pitch_stats_path=None, + energy_stats_path=None): + super().__init__(normalizer, model) + if pitch_stats_path: + pitch_mean, pitch_std 
= np.load(pitch_stats_path) + self.pitch_mean = paddle.to_tensor(pitch_mean) + self.pitch_std = paddle.to_tensor(pitch_std) + if energy_stats_path: + energy_mean, energy_std = np.load(energy_stats_path) + self.energy_mean = paddle.to_tensor(energy_mean) + self.energy_std = paddle.to_tensor(energy_std) + + def denorm(self, data, mean, std): + return data * std + mean + + def norm(self, data, mean, std): + return (data - mean) / std + + def forward(self, + text: paddle.Tensor, + durations: Union[paddle.Tensor, np.ndarray]=None, + durations_scale: Union[int, float]=None, + durations_bias: Union[int, float]=None, + pitch: Union[paddle.Tensor, np.ndarray]=None, + pitch_scale: Union[int, float]=None, + pitch_bias: Union[int, float]=None, + energy: Union[paddle.Tensor, np.ndarray]=None, + energy_scale: Union[int, float]=None, + energy_bias: Union[int, float]=None, + robot: bool=False): + """ + Parameters + ---------- + text : Tensor(int64) + Input sequence of characters (T,). + speech : Tensor, optional + Feature sequence to extract style (N, idim). + durations : paddle.Tensor/np.ndarray, optional (int64) + Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale: int/float, optional + durations_bias: int/float, optional + pitch : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale: int/float, optional + In denormed HZ domain. + pitch_bias: int/float, optional + In denormed HZ domain. + energy : paddle.Tensor/np.ndarray, optional + Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale: int/float, optional + In denormed domain. + energy_bias: int/float, optional + In denormed domain. + robot : bool, optional + Weather output robot style + Returns + ---------- + Tensor + Output sequence of features (L, odim). 
+ """ + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, durations=None, pitch=None, energy=None) + # priority: groundtruth > scale/bias > previous output + # set durations + if isinstance(durations, np.ndarray): + durations = paddle.to_tensor(durations) + elif isinstance(durations, paddle.Tensor): + durations = durations + elif durations_scale or durations_bias: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + durations = durations_scale * d_outs + durations_bias + else: + durations = d_outs + + if robot: + # set normed pitch to zeros have the same effect with set denormd ones to mean + pitch = paddle.zeros(p_outs.shape) + + # set pitch, can overwrite robot set + if isinstance(pitch, np.ndarray): + pitch = paddle.to_tensor(pitch) + elif isinstance(pitch, paddle.Tensor): + pitch = pitch + elif pitch_scale or pitch_bias: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + p_Hz = paddle.exp( + self.denorm(p_outs, self.pitch_mean, self.pitch_std)) + p_HZ = pitch_scale * p_Hz + pitch_bias + pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) + else: + pitch = p_outs + + # set energy + if isinstance(energy, np.ndarray): + energy = paddle.to_tensor(energy) + elif isinstance(energy, paddle.Tensor): + energy = energy + elif energy_scale or energy_bias: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) + e_dnorm = energy_scale * e_dnorm + energy_bias + energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) + else: + energy = e_outs + + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=durations, + pitch=pitch, + energy=energy, + use_teacher_forcing=True) + + logmel = self.normalizer.inverse(normalized_mel) + return logmel + + class FastSpeech2Loss(nn.Layer): """Loss function module for FastSpeech2.""" diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c766eb0d71ad51a17d7d33c87a8bc2f4da3..03620fd4e0b50ad827508deb8efba4459ea4bf05 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -23,12 +23,6 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from 
paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask class TransformerTTS(nn.Layer): diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 664267895491c47ad0b3ecaaaae9412f3ce5110f..5b569f5d05100fa80587c9a06dc2c16f1d58a936 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .attention import * from .conv import * from .geometry import * from .losses import * from .masking import * from .positional_encoding import * -from .transformer import * diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc3c1d9426ed2d21edc3064798b73ccd3a..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. 
- """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. - """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. 
- """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. 
- """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..25246736b92dfda364cf53a02ed37bb670e99c55 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + Parameters + ---------- + channels : int + The number of channels of conv layers. + kernel_size : int + Kernerl size of conv layers. 
+ """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, channels). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, axis=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..568597ba56d8ac27691577c23a244068240958ae --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder.py @@ -0,0 +1,274 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +import logging + +import paddle + +from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule +from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.nets_utils import get_activation +from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling + + +class Encoder(paddle.nn.Layer): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimension of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) 
+ """ + + def __init__( + self, + idim, + attention_dim=256, + attention_heads=4, + linear_units=2048, + num_blocks=6, + dropout_rate=0.1, + positional_dropout_rate=0.1, + attention_dropout_rate=0.0, + input_layer="conv2d", + normalize_before=True, + concat_after=False, + positionwise_layer_type="linear", + positionwise_conv_kernel_size=1, + macaron_style=False, + pos_enc_layer_type="abs_pos", + selfattention_layer_type="selfattn", + activation_type="swish", + use_cnn_module=False, + zero_triu=False, + cnn_module_kernel=31, + padding_idx=-1, + stochastic_depth_rate=0.0, + intermediate_layers=None, ): + """Construct an Encoder object.""" + super(Encoder, self).__init__() + + activation = get_activation(activation_type) + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "legacy_rel_pos": + pos_enc_class = LegacyRelPositionalEncoding + assert selfattention_layer_type == "legacy_rel_selfattn" + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + self.conv_subsampling_factor = 1 + if input_layer == "linear": + self.embed = paddle.nn.Sequential( + paddle.nn.Linear(idim, attention_dim), + paddle.nn.LayerNorm(attention_dim), + paddle.nn.Dropout(dropout_rate), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + + elif input_layer == "embed": + self.embed = paddle.nn.Sequential( + paddle.nn.Embedding( + idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, paddle.nn.Layer): + self.embed = paddle.nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + self.embed = paddle.nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + logging.info("encoder self-attention layer type = self-attention") + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "legacy_rel_selfattn": + assert pos_enc_layer_type == "legacy_rel_pos" + encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + logging.info( + "encoder self-attention layer type = relative self-attention") + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + + # feed-forward module definition + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation, ) + elif positionwise_layer_type == 
"conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + self.intermediate_layers = intermediate_layers + + def forward(self, xs, masks): + """Encode input sequence. + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks (paddle.Tensor): Mask tensor (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. + encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a4936786f9b47f945740d4b45eb7a2b98101ee --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from espnet(https://github.com/espnet/espnet)
+"""Encoder self-attention layer definition."""
+import paddle
+from paddle import nn
+
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+
+
+class EncoderLayer(nn.Layer):
+    """Encoder layer module.
+    Parameters
+    ----------
+    size : int
+        Input dimension.
+    self_attn : paddle.nn.Layer
+        Self-attention module instance.
+        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+        can be used as the argument.
+    feed_forward : paddle.nn.Layer
+        Feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    feed_forward_macaron : paddle.nn.Layer
+        Additional feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    conv_module : paddle.nn.Layer
+        Convolution module instance.
+        `ConvolutionModule` instance can be used as the argument.
+    dropout_rate : float
+        Dropout rate.
+    normalize_before : bool
+        Whether to use layer_norm before the first block.
+    concat_after : bool
+        Whether to concat attention layer's input and output.
+        if True, additional linear will be applied.
+        i.e. x -> x + linear(concat(x, att(x)))
+        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    stochastic_depth_rate : float
+        Probability to skip this layer.
+        During training, the layer may skip residual computation and return input
+        as-is with given probability.
+    """
+
+    def __init__(
+            self,
+            size,
+            self_attn,
+            feed_forward,
+            feed_forward_macaron,
+            conv_module,
+            dropout_rate,
+            normalize_before=True,
+            concat_after=False,
+            stochastic_depth_rate=0.0, ):
+        """Construct an EncoderLayer object."""
+        super(EncoderLayer, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.conv_module = conv_module
+        self.norm_ff = LayerNorm(size)  # for the FNN module
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        if feed_forward_macaron is not None:
+            self.norm_ff_macaron = LayerNorm(size)
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+        if self.conv_module is not None:
+            self.norm_conv = LayerNorm(size)  # for the CNN module
+            self.norm_final = LayerNorm(
+                size)  # for the final output of the block
+        self.dropout = nn.Dropout(dropout_rate)
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        if self.concat_after:
+            self.concat_linear = nn.Linear(size + size, size)
+        self.stochastic_depth_rate = stochastic_depth_rate
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+        Parameters
+        ----------
+        x_input : Union[Tuple, paddle.Tensor]
+            Input tensor w/ or w/o pos emb.
+            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+            - w/o pos emb: Tensor (#batch, time, size).
+        mask : paddle.Tensor
+            Mask tensor for the input (#batch, time).
+        cache : paddle.Tensor
+            Cache tensor of the input (#batch, time - 1, size).
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor (#batch, time, size).
+        paddle.Tensor
+            Mask tensor (#batch, time).
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        skip_layer = False
+        # with stochastic depth, residual connection `x + f(x)` becomes
+        # `x <- x + 1 / (1 - p) * f(x)` at training time.
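+        # For example (illustration only, not part of the original code):
+        # with stochastic_depth_rate = 0.1, roughly 1 in 10 training passes
+        # skips this layer entirely, and a kept pass scales every residual
+        # branch by 1 / (1 - 0.1), about 1.11, so the expected output matches
+        # inference, where the layer is always applied with coefficient 1.0.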
+        stoch_layer_coeff = 1.0
+        if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
+            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+        if skip_layer:
+            if cache is not None:
+                x = paddle.concat([cache, x], axis=1)
+            if pos_emb is not None:
+                return (x, pos_emb), mask
+            return x, mask
+
+        # whether to use macaron style
+        if self.feed_forward_macaron is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_ff_macaron(x)
+            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+                self.feed_forward_macaron(x))
+            if not self.normalize_before:
+                x = self.norm_ff_macaron(x)
+
+        # multi-headed self-attention module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_mha(x)
+
+        if cache is None:
+            x_q = x
+        else:
+            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            mask = None if mask is None else mask[:, -1:, :]
+
+        if pos_emb is not None:
+            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+        else:
+            x_att = self.self_attn(x_q, x, x, mask)
+
+        if self.concat_after:
+            x_concat = paddle.concat((x, x_att), axis=-1)
+            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
+        else:
+            x = residual + stoch_layer_coeff * self.dropout(x_att)
+        if not self.normalize_before:
+            x = self.norm_mha(x)
+
+        # convolution module
+        if self.conv_module is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_conv(x)
+            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
+            if not self.normalize_before:
+                x = self.norm_conv(x)
+
+        # feed forward module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_ff(x)
+        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+            self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm_ff(x)
+
+        if self.conv_module is not None:
+            x = self.norm_final(x)
+
+        if cache is not None:
+            x = paddle.concat([cache, x], axis=1)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 30d3db86c885a56204562b82f7e0d709a96717e2..fbb3a9a3d65f83fd43b19902c9e97137691f2a2d 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -17,6 +17,14 @@
 from paddle import nn
 from typeguard import check_argument_types


+class Swish(paddle.nn.Layer):
+    """Construct a Swish object."""
+
+    def forward(self, x):
+        """Return the Swish activation, x * sigmoid(x)."""
+        return x * paddle.nn.functional.sigmoid(x)
+
+
 def pad_list(xs, pad_value):
     """Perform padding for the list of tensors.
@@ -150,3 +158,17 @@ def initialize(model: nn.Layer, init: str): nn.initializer.Constant()) else: raise ValueError("Unknown initialization: " + init) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": Swish, + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py rename to paddlespeech/t2s/modules/predictor/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py rename to paddlespeech/t2s/modules/predictor/duration_predictor.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py rename to paddlespeech/t2s/modules/predictor/length_regulator.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py rename to paddlespeech/t2s/modules/predictor/variance_predictor.py diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 868a73a969edb6d3dc1affe6b0e401a88fb7d11b..8a23e85c61dd2f2cbbd06e281335317575dfc5ff 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -19,7 +19,7 @@ import paddle from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention class StyleEncoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py deleted file mode 100644 index e50d58d44bc6663414a7390589d3a8d7ad6f2c5b..0000000000000000000000000000000000000000 --- a/paddlespeech/t2s/modules/transformer.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from paddlespeech.t2s.modules import attention as attn - -__all__ = [ - "PositionwiseFFN", - "TransformerEncoderLayer", - "TransformerDecoderLayer", -] - - -class PositionwiseFFN(nn.Layer): - """A faithful implementation of Position-wise Feed-Forward Network - in `Attention is All You Need `_. 
- It is basically a 2-layer MLP, with relu actication and dropout in between. - - Parameters - ---------- - input_size: int - The feature size of the intput. It is also the feature size of the - output. - hidden_size: int - The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first - layer, by default 0. - """ - - def __init__(self, input_size: int, hidden_size: int, dropout=0.0): - super(PositionwiseFFN, self).__init__() - self.linear1 = nn.Linear(input_size, hidden_size) - self.linear2 = nn.Linear(hidden_size, input_size) - self.dropout = nn.Dropout(dropout) - - self.input_size = input_size - self.hidden_szie = hidden_size - - def forward(self, x): - r"""Forward pass of positionwise feed forward network. - - Parameters - ---------- - x : Tensor [shape=(\*, input_size)] - The input tensor, where ``\*`` means arbitary shape. - - Returns - ------- - Tensor [shape=(\*, input_size)] - The output tensor. - """ - l1 = self.dropout(F.relu(self.linear1(x))) - l2 = self.linear2(l1) - return l2 - - -class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` - layer). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerEncoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, x, mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - x : Tensor [shape=(batch_size, time_steps, d_model)] - The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, - time_steps) or broadcastable shape. - - Returns - ------- - x :Tensor [shape=(batch_size, time_steps, d_model)] - The encoded output. - - attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] - The attention weights of the self attention. - """ - context_vector, attn_weights = self.self_mha(x, x, x, mask) - x = self.layer_norm1( - F.dropout(x + context_vector, self.dropout, training=self.training)) - - x = self.layer_norm2( - F.dropout(x + self.ffn(x), self.dropout, training=self.training)) - return x, attn_weights - - -class TransformerDecoderLayer(nn.Layer): - """A faithful implementation of Transformer decoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` - layers). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. 
- """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerDecoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, q, k, v, encoder_mask, decoder_mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, d_model)] - The values - encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, - time_steps_k)`` or broadcastable shape. - decoder_mask : Tensor - Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` - or broadcastable shape. - - Returns - -------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] - Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] - Decoder-encoder cross attention. - """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) - q = self.layer_norm1( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, - encoder_mask) - q = self.layer_norm2( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - q = self.layer_norm3( - F.dropout(q + self.ffn(q), self.dropout, training=self.training)) - return q, self_attn_weights, cross_attn_weights diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py rename to paddlespeech/t2s/modules/transformer/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py rename to paddlespeech/t2s/modules/transformer/attention.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py similarity index 94% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py rename to paddlespeech/t2s/modules/transformer/decoder.py index 489fda12bc9d5708418ef2b8e3b96ea264f7101e..072fc813737f3963ccfb6536a1e90e033116e7d4 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -23,14 +23,14 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask -from 
paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py rename to paddlespeech/t2s/modules/transformer/decoder_layer.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py rename to paddlespeech/t2s/modules/transformer/embedding.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py similarity index 92% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py rename to paddlespeech/t2s/modules/transformer/encoder.py index f91c76b727e8af153ec82bf70410c3c6cae0f227..f088ac7fad38a2b3fc77b6251cea9dc845ebd813 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -14,13 +14,13 @@ # Modified from espnet(https://github.com/espnet/espnet) from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Encoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py rename to paddlespeech/t2s/modules/transformer/encoder_layer.py diff --git 
a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
rename to paddlespeech/t2s/modules/transformer/lightconv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py
rename to paddlespeech/t2s/modules/transformer/mask.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py
rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py
rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py
rename to paddlespeech/t2s/modules/transformer/repeat.py
diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..300b35beda72dda735629b525a0f00bb25129e94
--- /dev/null
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+# Conv2dSubsampling: tests passed
+"""Subsampling layer definition."""
+import paddle
+
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+
+
+class TooShortUttError(Exception):
+    """Raised when the utt is too short for subsampling.
+ Parameters + ---------- + message : str + Message for error catch + actual_size : int + the short size that cannot pass the subsampling + limit : int + the limit size for subsampling + """ + + def __init__(self, message, actual_size, limit): + """Construct a TooShortUttError for error handler.""" + super().__init__(message) + self.actual_size = actual_size + self.limit = limit + + +def check_short_utt(ins, size): + """Check if the utterance is too short for subsampling.""" + if isinstance(ins, Conv2dSubsampling2) and size < 3: + return True, 3 + if isinstance(ins, Conv2dSubsampling) and size < 7: + return True, 7 + if isinstance(ins, Conv2dSubsampling6) and size < 11: + return True, 11 + if isinstance(ins, Conv2dSubsampling8) and size < 15: + return True, 15 + return False, -1 + + +class Conv2dSubsampling(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/4 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super(Conv2dSubsampling, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 4. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 4. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + # x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] + + +class Conv2dSubsampling2(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/2 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling2 object.""" + super(Conv2dSubsampling2, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 1), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). 
+ Returns + ---------- + paddle.Tensor + ubsampled tensor (#batch, time', odim), + where time' = time // 2. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 2. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:1] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] + + +class Conv2dSubsampling6(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/6 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling6 object.""" + super(Conv2dSubsampling6, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 5, 3), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 6. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 6. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-4:3] + + +class Conv2dSubsampling8(paddle.nn.Layer): + """Convolutional 2D subsampling (to 1/8 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : paddle.nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling8 object.""" + super(Conv2dSubsampling8, self).__init__() + self.conv = paddle.nn.Sequential( + paddle.nn.Conv2D(1, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), + paddle.nn.Conv2D(odim, odim, 3, 2), + paddle.nn.ReLU(), ) + self.out = paddle.nn.Sequential( + paddle.nn.Linear(odim * ((( + (idim - 1) // 2 - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 8. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 8. 
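+        Notes
+        ----------
+        A rough worked example (illustrative figures, not from the original
+        code): three stride-2 convolutions with kernel size 3 map 100 input
+        frames to (100 - 1) // 2 = 49, then 24, then 11 frames, and the three
+        `[:, :, :-2:2]` slices below shrink the mask by the same amounts.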
+ """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] diff --git a/requirements.txt b/requirements.txt index 2b34d36bdb467e0286c5d1e87d2f1383a9356f98..8e2552e7059e24ebbedb1ef2b67530e6780eb0cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ python-dateutil pyworld resampy==0.2.2 sacrebleu -scipy==1.2.1 +scipy sentencepiece snakeviz soundfile~=0.10 @@ -44,3 +44,9 @@ visualdl==2.2.0 webrtcvad yacs yq +pypi-kenlm +GPUtil +psutil +pynvml +distro + diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000000000000000000000000000000000000..0bfacb548bfa6eb61bcb506c1fbc0a5acc185577 --- /dev/null +++ b/setup.sh @@ -0,0 +1,20 @@ +# Install conda dependencies +conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes + +# Install the python lib +pip install -r requirements.txt + +# Install the auto_log +pushd tools/extras +bash install_autolog.sh +popd + +# Install the ctcdecoder +pushd paddlespeech/s2t/decoders/ctcdecoder/swig +bash -e setup.sh +popd + +# Install the python_speech_features +pushd third_party +bash -e install.sh +popd diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md index 71d5f91b8f283fe65afed2cfdf54eb7691e56ed8..22e0009d4445820a9ca6a226a1978ac065d698a9 100644 --- a/tests/benchmark/conformer/README.md +++ b/tests/benchmark/conformer/README.md @@ -43,16 +43,6 @@ bash prepare.sh bash run.sh ``` -### Analyse the sp -``` -bash run_analysis_sp.sh -``` - -### Analyse the mp -``` -bash run_analysis_mp.sh -``` - ### The log ``` {"log_file": "recoder_sp_bs16_fp32_ngpu1.txt", diff --git a/tests/benchmark/conformer/analysis.py b/tests/benchmark/conformer/analysis.py deleted file mode 100644 index 610791c8cf11640a4d1142441cd1d349cf8b3be1..0000000000000000000000000000000000000000 --- a/tests/benchmark/conformer/analysis.py +++ /dev/null @@ -1,345 +0,0 @@ -# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import print_function - -import argparse -import json -import re -import traceback - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--filename", type=str, help="The name of log which need to analysis.") - parser.add_argument( - "--log_with_profiler", - type=str, - help="The path of train log with profiler") - parser.add_argument( - "--profiler_path", type=str, help="The path of profiler timeline log.") - parser.add_argument( - "--keyword", type=str, help="Keyword to specify analysis data") - parser.add_argument( - "--separator", - type=str, - default=None, - help="Separator of different field in log") - parser.add_argument( - '--position', type=int, default=None, help='The position of data field') - parser.add_argument( - '--range', - type=str, - default="", - help='The range of data field to intercept') - parser.add_argument( - '--base_batch_size', type=int, help='base_batch size on gpu') - parser.add_argument( - '--skip_steps', - type=int, - default=0, - help='The number of steps to be skipped') - parser.add_argument( - '--model_mode', - type=int, - default=-1, - help='Analysis mode, default value is -1') - parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit') - parser.add_argument( - '--model_name', - type=str, - default=0, - help='training model_name, transformer_base') - parser.add_argument( - '--mission_name', type=str, default=0, help='training mission name') - parser.add_argument( - '--direction_id', type=int, default=0, help='training direction_id') - parser.add_argument( - '--run_mode', - type=str, - default="sp", - help='multi process or single process') - parser.add_argument( - '--index', - type=int, - default=1, - help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}') - parser.add_argument( - '--gpu_num', type=int, default=1, help='nums of training gpus') - parser.add_argument( - '--use_num', type=int, default=1, help='nums of used recoders') - args = parser.parse_args() - args.separator = None if args.separator == "None" else args.separator - return args - - -def _is_number(num): - pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - result = pattern.match(num) - if result: - return True - else: - return False - - -class TimeAnalyzer(object): - def __init__(self, - filename, - keyword=None, - separator=None, - position=None, - range="-1"): - if filename is None: - raise Exception("Please specify the filename!") - - if keyword is None: - raise Exception("Please specify the keyword!") - - self.filename = filename - self.keyword = keyword - self.separator = separator - self.position = position - self.range = range - self.records = None - self._distil() - - def _distil(self): - self.records = [] - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - for line in lines: - if self.keyword not in line: - continue - try: - result = None - - # Distil the string from a line. - line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - print("line_words", line_words) - if args.position: - result = line_words[self.position] - else: - # Distil the string following the keyword. - for i in range(len(line_words) - 1): - if line_words[i] == self.keyword: - result = line_words[i + 1] - break - - # Distil the result from the picked string. 
- if not self.range: - result = result[0:] - elif _is_number(self.range): - result = result[0:int(self.range)] - else: - result = result[int(self.range.split(":")[0]):int( - self.range.split(":")[1])] - self.records.append(float(result)) - except Exception as exc: - pass - #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) - self.records.sort() - self.records = self.records[:args.use_num] - print("records", self.records) - print("Extract {} records: separator={}; position={}".format( - len(self.records), self.separator, self.position)) - - def _get_fps(self, - mode, - batch_size, - gpu_num, - avg_of_records, - run_mode, - unit=None): - if mode == -1 and run_mode == 'sp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records - elif mode == -1 and run_mode == 'mp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records #temporarily, not used now - print("------------this is mp") - elif mode == 0: - # s/step -> samples/s - fps = (batch_size * gpu_num) / avg_of_records - unit = "samples/s" - elif mode == 1: - # steps/s -> steps/s - fps = avg_of_records - unit = "steps/s" - elif mode == 2: - # s/step -> steps/s - fps = 1 / avg_of_records - unit = "steps/s" - elif mode == 3: - # steps/s -> samples/s - fps = batch_size * gpu_num * avg_of_records - unit = "samples/s" - elif mode == 4: - # s/epoch -> s/epoch - fps = avg_of_records - unit = "s/epoch" - else: - ValueError("Unsupported analysis mode.") - - return fps, unit - - def analysis(self, - batch_size, - gpu_num=1, - skip_steps=0, - mode=-1, - run_mode='sp', - unit=None): - if batch_size <= 0: - print("base_batch_size should larger than 0.") - return 0, '' - - if len( - self.records - ) <= skip_steps: # to address the condition which item of log equals to skip_steps - print("no records") - return 0, '' - - sum_of_records = 0 - sum_of_records_skipped = 0 - skip_min = self.records[skip_steps] - skip_max = self.records[skip_steps] - - count = len(self.records) - for i in range(count): - sum_of_records += self.records[i] - if i >= skip_steps: - sum_of_records_skipped += self.records[i] - if self.records[i] < skip_min: - skip_min = self.records[i] - if self.records[i] > skip_max: - skip_max = self.records[i] - - avg_of_records = sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - - skip_steps) - - fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, - run_mode, unit) - fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, - avg_of_records_skipped, run_mode, unit) - if mode == -1: - print("average ips of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) - print("\tMin: %.3f %s" % (skip_min, fps_unit)) - print("\tMax: %.3f %s" % (skip_max, fps_unit)) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 1 or mode == 3: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f steps/s" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f steps/s" % avg_of_records_skipped) - print("\tMin: %.3f steps/s" % skip_min) - print("\tMax: %.3f steps/s" % skip_max) - print("\tFPS: %.3f %s" % 
(fps_skipped, fps_unit)) - elif mode == 0 or mode == 2: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f s/step" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f s/step" % avg_of_records_skipped) - print("\tMin: %.3f s/step" % skip_min) - print("\tMax: %.3f s/step" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - - return round(fps_skipped, 3), fps_unit - - -if __name__ == "__main__": - args = parse_args() - run_info = dict() - run_info["log_file"] = args.filename - run_info["model_name"] = args.model_name - run_info["mission_name"] = args.mission_name - run_info["direction_id"] = args.direction_id - run_info["run_mode"] = args.run_mode - run_info["index"] = args.index - run_info["gpu_num"] = args.gpu_num - run_info["FINAL_RESULT"] = 0 - run_info["JOB_FAIL_FLAG"] = 0 - - try: - if args.index == 1: - if args.gpu_num == 1: - run_info["log_with_profiler"] = args.log_with_profiler - run_info["profiler_path"] = args.profiler_path - analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, - args.position, args.range) - run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis( - batch_size=args.base_batch_size, - gpu_num=args.gpu_num, - skip_steps=args.skip_steps, - mode=args.model_mode, - run_mode=args.run_mode, - unit=args.ips_unit) - # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0: - # run_info["JOB_FAIL_FLAG"] = 1 - elif args.index == 3: - run_info["FINAL_RESULT"] = {} - records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', - None, 3, '').records - records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', - None, 5).records - records_ct_total = TimeAnalyzer(args.filename, 'Computation time', - None, 3, '').records - records_gm_total = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 4, '').records - records_gm_ratio = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 6).records - records_gmas_total = TimeAnalyzer(args.filename, - 'GpuMemcpyAsync Calls', - None, 4, '').records - records_gms_total = TimeAnalyzer(args.filename, - 'GpuMemcpySync Calls', - None, 4, '').records - run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[ - 0] if records_fo_total else 0 - run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[ - 0] if records_fo_ratio else 0 - run_info["FINAL_RESULT"][ - "ComputationTime_Total"] = records_ct_total[ - 0] if records_ct_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[ - 0] if records_gm_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[ - 0] if records_gm_ratio else 0 - run_info["FINAL_RESULT"][ - "GpuMemcpyAsync_Total"] = records_gmas_total[ - 0] if records_gmas_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[ - 0] if records_gms_total else 0 - else: - print("Not support!") - except Exception: - traceback.print_exc() - print("{}".format(json.dumps(run_info)) - ) # it's required, for the log file path insert to the database diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh index 8f03fd1b988fb458a681d7c8612416cf9ef65895..c5fae06a59d41147c9aaa89f3074914e7ea9906f 100644 --- a/tests/benchmark/conformer/prepare.sh +++ b/tests/benchmark/conformer/prepare.sh @@ -1,5 +1,6 @@ -source ../../../tools/venv/bin/activate - +cd ../../../ +pip install -e . 
# install paddlespeech
+cd -
 #Enter the example dir
 pushd ../../../examples/aishell/s1
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index c09bbf09b0547f3f0214f85a437dae23f764df98..79beb4e961fc01d7b1d5a80e81d94289057c0398 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -1,8 +1,12 @@
 # Script for stable, reproducible benchmark runs; by default it is executed with py37 inside the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
 # Working directory: must be documented
-CUR_DIR=${PWD}
-source ../../../tools/venv/bin/activate
+CUR_DIR=${PWD}    # PaddleSpeech/tests/benchmark/conformer
+cd ../../../
+log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}    # set by the benchmark system; when no profiling run is needed, log_path points to the directory that stores the speed logs
+cd ${CUR_DIR}
+sed -i '/set\ -xe/d' run_benchmark.sh
+
 #cd **
 pushd ../../../examples/aishell/s1
 # 1. install the dependencies required by this model (please note any optimization strategies that are enabled)
@@ -11,26 +15,33 @@ pushd ../../../examples/aishell/s1
 source path.sh
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
+mkdir -p conf/benchmark
+#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml
+cp conf/conformer.yaml conf/benchmark/conformer.yaml
+sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
 bs_item=(16 30)
-config_path=conf/conformer.yaml
+config_path=conf/benchmark/conformer.yaml
 seed=0
 output=exp/conformer
 profiler_options=None
+model_item=conformer
 for fp_item in ${fp_item_list[@]}; do
-    for batch_size in ${bs_item[@]}
+    for bs_item in ${bs_item[@]}
     do
     rm exp -rf
+    log_name=speech_${model_item}_bs${bs_item}_${fp_item}    # e.g. clas_MobileNetv1_mp_bs32_fp32_8
     echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
     run_mode=mp
     ngpu=8
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
-    rm exp -rf
-    echo "index is speed, 1gpus, begin, conformer"
+    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
+    sleep 60
+    log_name=speech_${model_item}_bs${bs_item}_${fp_item}    # e.g. clas_MobileNetv1_mp_bs32_fp32_8
+    echo "index is speed, 1gpus, begin, ${log_name}"
     run_mode=sp
     ngpu=1
-    CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
+    CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1    # (5min)
+    sleep 60
     done
 done
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index c03a08f3b000e4d81649a392124ee1dbb445dace..56b63e76b1f23abf8f36c237dcd2232e20792d39 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -12,17 +12,24 @@ function _set_params(){
     profiler_options=${6:-"None"}
     batch_size=${7:-"32"}
     fp_item=${8:-"fp32"}
-    TRAIN_LOG_DIR=${9:-$(pwd)}
-
+    model_item=${9:-"conformer"}
     benchmark_max_step=0
-    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
+# parameters required for log parsing
+    base_batch_size=${batch_size}
+    mission_name="语音识别"
+    direction_id="1"
+    ips_unit="sent./sec"
+    skip_steps=10                  # for log parsing: the first few steps of some models are slow and must be skipped (required)
+    keyword="ips:"                 # for log parsing: keyword that selects the lines containing the data (required)
+    index="1"
+    model_name=${model_item}_bs${batch_size}_${fp_item}    # no changes needed below
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
-    log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt
+    log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}
 }
 function _train(){
@@ -36,11 +43,9 @@ function _train(){
               --benchmark-batch-size ${batch_size}
               --benchmark-max-step ${benchmark_max_step} "
-    echo "run_mode "${run_mode}
-
     case ${run_mode} in
-    sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
-    mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
     *) echo "choose run_mode(sp or mp)"; exit 1;
     esac
     echo ${train_cmd}
@@ -61,5 +66,8 @@ function _train(){
     fi
 }
+source ${BENCHMARK_ROOT}/scripts/run_model.sh    # this script parses performance data from benchmark-compliant logs with analysis.py; for joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; if you only want to produce training logs without parsing, you may comment this line out, but it must be enabled before submitting
 _set_params $@
-_train
+# _train    # uncomment if you only want to produce training logs without parsing
+_run    # this function is defined in run_model.sh and calls _train when executed; if you only want training logs without joint debugging, you may comment this line out, but it must be enabled before submitting
+
diff --git a/tools/extras/install_miniconda.sh b/tools/extras/install_miniconda.sh
index 3d1909af6f4f8a23e261e8983bf9ee6d1275cb4f..c6ee4b361ca7733d46ecb9b6d3b260199c190203 100755
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
@@ -13,6 +13,8 @@
 else
 fi
 bash Miniconda3-latest-Linux-x86_64.sh -b
+$HOME/miniconda3/bin/conda init
+
 $HOME/miniconda3/bin/python -m pip install --user tqdm
 $HOME/miniconda3/bin/python -m pip install --user scikit-learn
 $HOME/miniconda3/bin/python -m pip install --user librosa