Commit bc0dd511 authored by 小湉湉, committed by root

Merge branch 'develop' of github.com:PaddlePaddle/PaddleSpeech into HEAD

......@@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
......
......@@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
num_workers: 2
###########################################################
......@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
......@@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
......
......@@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=${input}/wav \
--output=${preprocess_path}/embed \
--checkpoint_path=${ge2e_ckpt_path}
......
......@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
encoder_type: conformer # encoder type
decoder_type: conformer # decoder type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 1000
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
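As a rough sketch of how a config file like this is consumed (hedged: it assumes the yacs CfgNode wrapper used elsewhere in PaddleSpeech t2s, and the file name is only illustrative), the training entry points read the YAML into an attribute-style config and later splat the model section into FastSpeech2, as the train_sp diff further below shows:
import yaml
from yacs.config import CfgNode

with open("conf/default.yaml") as f:     # illustrative path; see conf_path in run.sh
    config = CfgNode(yaml.safe_load(f))

print(config.n_mels)                                  # 80
print(config.model.adim, config.model.encoder_type)   # 384 conformer
# later: FastSpeech2(idim=vocab_size, odim=config.n_mels, spk_num=spk_num, **config["model"])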
......@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
......@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory: true # Whether to pin memory in Pytorch DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
......
......@@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file
......
# https://yaml.org/type/float.html
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
......
......@@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# export ckpt avg_n
./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# # export ckpt avg_n
# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/cacu_perplexity.sh || exit -1
......
......@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
......@@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}
......@@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
......@@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
......@@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
......@@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
......
......@@ -126,8 +126,12 @@ decoders_module = [
]
setup(
name='swig_decoders',
version='1.1',
description="""CTC decoders""",
name='paddlespeech_ctcdecoders',
version='0.0.1a',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
url="https://github.com/PaddlePaddle/PaddleSpeech",
license='Apache 2.0',
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
py_modules=['swig_decoders'])
......@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
......
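The widened guard above makes cmvn_file optional: with the old check, a config that omits the key entirely raises a KeyError before the None test is ever reached. A dict.get-based equivalent, shown only to illustrate the intent of the new condition:
cmvn_file = configs.get('cmvn_file')    # None when the key is absent or set to null in YAML
if cmvn_file is not None:
    mean, istd = load_cmvn(cmvn_file, configs['cmvn_file_type'])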
......@@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def fastspeech2_multi_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
......@@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
text = batch_sequences(text)
pitch = batch_sequences(pitch)
......@@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
energy = paddle.to_tensor(energy)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
spk_id = paddle.to_tensor(spk_id)
batch = {
"text": text,
......@@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
"speech": speech,
"speech_lengths": speech_lengths,
"pitch": pitch,
"energy": energy,
"spk_id": spk_id
"energy": energy
}
# spk_emb has a higher priority than spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = batch_sequences(spk_emb)
spk_emb = paddle.to_tensor(spk_emb)
batch["spk_emb"] = spk_emb
elif "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
......
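The branch order above is what gives spk_emb priority: when an example carries both keys, only spk_emb reaches the batch. A self-contained toy check of that rule (illustration only; the real collate function also batches text, mel, pitch and energy):
import numpy as np

def pick_speaker_field(example):
    # mirrors the priority in fastspeech2_multi_spk_batch_fn: spk_emb wins over spk_id
    if "spk_emb" in example:
        return "spk_emb", np.asarray(example["spk_emb"], dtype=np.float32)
    elif "spk_id" in example:
        return "spk_id", np.asarray(example["spk_id"], dtype=np.int64)
    return None, None

print(pick_speaker_field({"spk_id": 3, "spk_emb": [0.1] * 256})[0])  # -> spk_emb
print(pick_speaker_field({"spk_id": 3})[0])                          # -> spk_id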
......@@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
......
......@@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
......
......@@ -167,6 +167,10 @@ def main():
"pitch": str(pitch_path),
"energy": str(energy_path)
}
# add spk_emb for voice cloning
if "spk_emb" in item:
record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
......
......@@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
......@@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
"energy": str(energy_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
......@@ -127,13 +136,14 @@ def process_sentences(config,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, pitch_extractor,
energy_extractor, cut_sil)
energy_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
......@@ -144,7 +154,7 @@ def process_sentences(config,
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil)
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
......@@ -202,6 +212,11 @@ def main():
default=True,
help="whether cut sil in the edge of audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory to speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
......@@ -211,6 +226,11 @@ def main():
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
......@@ -251,6 +271,7 @@ def main():
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
......@@ -317,7 +338,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
......@@ -327,7 +349,8 @@ def main():
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
......@@ -338,7 +361,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":
......
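For the spk_emb lookup in process_sentence above, the expected layout is one sub-directory per speaker under --spk_emb_dir, each holding <utt_id>.npy files; utterances without a matching embedding are dropped because the function returns None. A small sketch of that check in isolation, with a hypothetical helper name:
import os
from pathlib import Path

def find_spk_emb(spk_emb_dir: Path, speaker: str, utt_id: str):
    # hypothetical helper mirroring the lookup above: <spk_emb_dir>/<speaker>/<utt_id>.npy
    if spk_emb_dir and speaker in os.listdir(spk_emb_dir):
        embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
        if embed_path.is_file():
            return str(embed_path)
    return None  # caller skips this utterance, as process_sentence does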
......@@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields = ["utt_id", "text"]
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
test_dataset = DataTable(data=test_metadata, fields=fields)
......@@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
......@@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for datum in test_dataset:
utt_id = datum["utt_id"]
text = paddle.to_tensor(datum["text"])
if "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
else:
spk_emb = None
spk_id = None
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
with paddle.no_grad():
wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
wav = pwg_inference(
fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
......@@ -142,6 +148,15 @@ def main():
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(str):
return True if str.lower() == 'true' else False
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether training voice cloning model.")
parser.add_argument("--test-metadata", type=str, help="test metadata.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
......
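The local str2bool helper is there because argparse's type=bool would turn any non-empty string, including "False", into True; parsing the text explicitly avoids that pitfall. A quick demonstration (same behaviour as the helper in the diff, just without shadowing the builtin name):
import argparse

def str2bool(s):
    return s.lower() == 'true'

parser = argparse.ArgumentParser()
parser.add_argument("--voice-cloning", type=str2bool, default=False)
print(parser.parse_args(["--voice-cloning", "False"]).voice_cloning)  # False
print(bool("False"))                                                  # True, which is why type=bool is wrong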
......@@ -61,18 +61,24 @@ def train_sp(args, config):
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
# dataloader has been too verbose
logging.getLogger("DataLoader").disabled = True
......@@ -83,17 +89,13 @@ def train_sp(args, config):
train_dataset = DataTable(
data=train_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
# collate function and dataloader
......@@ -127,10 +129,7 @@ def train_sp(args, config):
odim = config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
**config["model"])
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
......@@ -184,6 +183,15 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(str):
return True if str.lower() == 'true' else False
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether training voice cloning model.")
args = parser.parse_args()
with open(args.config) as f:
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
def cycle(iterable):
# cycle('ABCD') --> A B C D A B C D A B C D ...
saved = []
for element in iterable:
yield element
saved.append(element)
while saved:
for element in saved:
yield element
def random_cycle(iterable):
# cycle('ABCD') --> A B C D B C D A A D B C ...
saved = []
for element in iterable:
yield element
saved.append(element)
random.shuffle(saved)
while saved:
for element in saved:
yield element
random.shuffle(saved)
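To see the difference between the two generators, iterate a few elements from each (a short illustration; cycle and random_cycle live in paddlespeech.t2s.exps.ge2e.random_cycle, as the import further below shows):
from itertools import islice
from paddlespeech.t2s.exps.ge2e.random_cycle import cycle, random_cycle

print(list(islice(cycle("ABCD"), 8)))         # ['A', 'B', 'C', 'D', 'A', 'B', 'C', 'D']
print(list(islice(random_cycle("ABCD"), 8)))  # first pass in order, every later pass reshuffled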
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset
from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):
"""A 2 layer directory thatn contains mel spectrograms in *.npy format.
An Example file structure tree is shown below. We prefer to preprocess
raw datasets and organized them like this.
dataset_root/
speaker1/
utterance1.npy
utterance2.npy
utterance3.npy
speaker2/
utterance1.npy
utterance2.npy
utterance3.npy
"""
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
speaker_utterances = {
speaker_dir: list(speaker_dir.glob("*.npy"))
for speaker_dir in speaker_dirs
}
self.speaker_dirs = speaker_dirs
self.speaker_to_utterances = speaker_utterances
# meta data
self.num_speakers = len(self.speaker_dirs)
self.num_utterances = np.sum(
len(utterances)
for speaker, utterances in self.speaker_to_utterances.items())
def get_example_by_index(self, speaker_index, utterance_index):
speaker_dir = self.speaker_dirs[speaker_index]
fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
return self[fpath]
def __getitem__(self, fpath):
return np.load(fpath)
def __len__(self):
return int(self.num_utterances)
class MultiSpeakerSampler(BatchSampler):
"""A multi-stratal sampler designed for speaker verification task.
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def __init__(self,
dataset: MultiSpeakerMelDataset,
speakers_per_batch: int,
utterances_per_speaker: int):
self._speakers = list(dataset.speaker_dirs)
self._speaker_to_utterances = dataset.speaker_to_utterances
self.speakers_per_batch = speakers_per_batch
self.utterances_per_speaker = utterances_per_speaker
def __iter__(self):
# yield list of Paths
speaker_generator = iter(random_cycle(self._speakers))
speaker_utterances_generator = {
s: iter(random_cycle(us))
for s, us in self._speaker_to_utterances.items()
}
while True:
speakers = []
for _ in range(self.speakers_per_batch):
speakers.append(next(speaker_generator))
utterances = []
for s in speakers:
us = speaker_utterances_generator[s]
for _ in range(self.utterances_per_speaker):
utterances.append(next(us))
yield utterances
class RandomClip(object):
def __init__(self, frames):
self.frames = frames
def __call__(self, spec):
# spec [T, C]
T = spec.shape[0]
start = random.randint(0, T - self.frames)
return spec[start:start + self.frames, :]
class Collate(object):
def __init__(self, num_frames):
self.random_crop = RandomClip(num_frames)
def __call__(self, examples):
frame_clips = [self.random_crop(mel) for mel in examples]
batched_clips = np.stack(frame_clips)
return batched_clips
if __name__ == "__main__":
mydataset = MultiSpeakerMelDataset(
Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
print(mydataset.get_example_by_index(0, 10))
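Usage-wise, each batch produced by this sampler holds speakers_per_batch * utterances_per_speaker clips, so after Collate the speaker encoder sees an array of shape (N * M, num_frames, n_mels). A hedged sketch with made-up numbers and a made-up data path (the Ge2eExperiment below wires the same pieces together with values from its config):
from pathlib import Path
from paddle.io import DataLoader
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler

dataset = MultiSpeakerMelDataset(Path("~/datasets/SV2TTS/encoder"))   # made-up path
sampler = MultiSpeakerSampler(dataset, speakers_per_batch=4, utterances_per_speaker=5)
loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=Collate(num_frames=160))
# each batch: a numpy array of shape (4 * 5, 160, n_mels)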
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase
class Ge2eExperiment(ExperimentBase):
def setup_model(self):
config = self.config
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
config.model.hidden_size,
config.model.embedding_size)
optimizer = Adam(
config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
self.model = DataParallel(model) if self.parallel else model
self.model_core = model
self.optimizer = optimizer
def setup_dataloader(self):
config = self.config
train_dataset = MultiSpeakerMelDataset(self.args.data)
sampler = MultiSpeakerSampler(train_dataset,
config.training.speakers_per_batch,
config.training.utterances_per_speaker)
train_loader = DataLoader(
train_dataset,
batch_sampler=sampler,
collate_fn=Collate(config.data.partial_n_frames),
num_workers=16)
self.train_dataset = train_dataset
self.train_loader = train_loader
def train_batch(self):
start = time.time()
batch = self.read_batch()
data_loader_time = time.time() - start
self.optimizer.clear_grad()
self.model.train()
specs = batch
loss, eer = self.model(specs, self.config.training.speakers_per_batch)
loss.backward()
self.model_core.do_gradient_ops()
self.optimizer.step()
iteration_time = time.time() - start
# logging
loss_value = float(loss)
msg = "Rank: {}, ".format(dist.get_rank())
msg += "step: {}, ".format(self.iteration)
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
iteration_time)
msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar("param/w",
float(self.model_core.similarity_weight),
self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)
def valid(self):
pass
def main_sp(config, args):
exp = Ge2eExperiment(config, args)
exp.setup()
exp.resume_or_load()
exp.run()
def main(config, args):
if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else:
main_sp(config, args)
if __name__ == "__main__":
config = get_cfg_defaults()
parser = default_argument_parser()
args = parser.parse_args()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
print(args)
main(config, args)
......@@ -20,14 +20,14 @@ import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def voice_cloning(args):
......
......@@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
# No explicit speaker identifier labels are used during voice cloning training.
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
......@@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,
......@@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
......@@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,
......
......@@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self.padding_idx = 0
# set_global_initializer affects everything that follows globally, including create_parameter
initialize(self, init_type)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
# get positional encoding layer type
transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define transformer encoder
if eprenet_conv_layers != 0:
......@@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
......@@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn.Linear(dprenet_units, adim), )
else:
decoder_input_layer = "linear"
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
self.decoder = Decoder(
odim=odim, # odim is needed when no prenet is used
attention_dim=adim,
......@@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
......@@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
......@@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
spembs)
spk_emb)
# modify the mod part of the ground truth
......@@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spembs: paddle.Tensor,
spk_emb: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# forward encoder
x_masks = self._source_mask(ilens)
......@@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
......@@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
self,
text: paddle.Tensor,
speech: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
......@@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
Threshold in inference.
......@@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
"""
# input of embedding must be int64
y = speech
spemb = spembs
# add eos at the last of sequence
text = numpy.pad(
......@@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spembs = None if spemb is None else spemb.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.to_tensor(
[xs.shape[1]], dtype=paddle.int64, place=xs.place)
olens = paddle.to_tensor(
[ys.shape[1]], dtype=paddle.int64, place=ys.place)
outs, *_ = self._forward(xs, ilens, ys, olens, spembs)
outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
# get attention weights
att_ws = []
......@@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
hs = hs + style_embs.unsqueeze(1)
# integrate speaker embedding
if self.spk_embed_dim is not None:
spembs = spemb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spembs)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# set limits of length
maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
......@@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spembs: paddle.Tensor) -> paddle.Tensor:
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
......@@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1],
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spembs], axis=-1))
hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
......
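To make the two integration modes in _integrate_with_spk_embed concrete: "add" projects the (B, spk_embed_dim) embedding to adim and broadcast-adds it to every frame of hs, while "concat" tiles the normalized embedding along time, concatenates on the feature axis and projects back to adim. A shape-only sketch (hedged; the real model uses its own self.projection layer set up in the constructor):
import paddle
import paddle.nn.functional as F

B, Tmax, adim, spk_embed_dim = 2, 7, 384, 256
hs = paddle.randn([B, Tmax, adim])
spk_emb = paddle.randn([B, spk_embed_dim])

# "add": project spk_emb to adim, then broadcast-add over time
proj_add = paddle.nn.Linear(spk_embed_dim, adim)
hs_add = hs + proj_add(F.normalize(spk_emb)).unsqueeze(1)

# "concat": tile over time, concat on the feature axis, project back to adim
proj_cat = paddle.nn.Linear(adim + spk_embed_dim, adim)
tiled = F.normalize(spk_emb).unsqueeze(1).expand([-1, Tmax, -1])
hs_cat = proj_cat(paddle.concat([hs, tiled], axis=-1))

print(hs_add.shape, hs_cat.shape)   # [2, 7, 384] [2, 7, 384]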
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(paddle.nn.Layer):
"""Conformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of encoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of the convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes return type
signature.)
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
macaron_style=False,
pos_enc_layer_type="abs_pos",
selfattention_layer_type="selfattn",
activation_type="swish",
use_cnn_module=False,
zero_triu=False,
cnn_module_kernel=31,
padding_idx=-1,
stochastic_depth_rate=0.0,
intermediate_layers=None, ):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
activation = get_activation(activation_type)
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "legacy_rel_pos":
pos_enc_class = LegacyRelPositionalEncoding
assert selfattention_layer_type == "legacy_rel_selfattn"
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = paddle.nn.Sequential(
paddle.nn.Linear(idim, attention_dim),
paddle.nn.LayerNorm(attention_dim),
paddle.nn.Dropout(dropout_rate),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
self.embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
self.embed = paddle.nn.Sequential(
paddle.nn.Embedding(
idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, paddle.nn.Layer):
self.embed = paddle.nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = paddle.nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
# self-attention module definition
if selfattention_layer_type == "selfattn":
logging.info("encoder self-attention layer type = self-attention")
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "legacy_rel_selfattn":
assert pos_enc_layer_type == "legacy_rel_pos"
encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
logging.info(
"encoder self-attention layer type = relative self-attention")
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
# feed-forward module definition
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation, )
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
self.intermediate_layers = intermediate_layers
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
......@@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an MultiHeadedAttention object."""
super(MultiHeadedAttention, self).__init__()
super().__init__()
assert n_feat % n_head == 0
# We assume d_v always equals d_k
self.d_k = n_feat // n_head
......@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = query.shape[0]
n_batch = paddle.shape(query)[0]
q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
......@@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
"""
n_batch = value.shape[0]
n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1)
if mask is not None:
mask = mask.unsqueeze(1)
......@@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model)
x = (paddle.reshape(
x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k)))
return self.linear_out(x) # (batch, time1, d_model)
# (batch, time1, d_model)
return self.linear_out(x)
def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention.
......@@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(0, 1, 3, 2))) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
self.zero_triu = zero_triu
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
self.pos_bias_v = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
def rel_shift(self, x):
"""Compute relative positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, head, time1, 2*time1-1).
time1 means the length of query vector.
Returns
----------
paddle.Tensor
Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
return x
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
pos_emb : paddle.Tensor
Positional embedding tensor
(#batch, 2*time1-1, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or
(#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
# (batch, head, 2*time1-1, d_k)
p = p.transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
# compute matrix b and matrix d
# (batch, head, time1, 2*time1-1)
matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
matrix_bd = self.rel_shift(matrix_bd)
# (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
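A worked example of the rel_shift trick, with the batch and head axes dropped and numpy standing in for paddle: before the shift, every row of the (time1, 2*time1-1) score matrix is indexed by relative offsets time1-1, ..., 0, ..., -(time1-1); after the zero-pad, reshape and slice, row i column j holds the score for relative offset i - j, which is exactly what matrix_bd needs before being added to matrix_ac.
import numpy as np

t1 = 3
rel_pos = np.arange(t1 - 1, -t1, -1)              # [ 2  1  0 -1 -2]: offset carried by each column
scores = np.tile(rel_pos, (t1, 1)).astype(float)  # every query row sees the same offsets at first

padded = np.concatenate([np.zeros((t1, 1)), scores], axis=-1)             # prepend a zero column
shifted = padded.reshape(2 * t1, t1)[1:].reshape(t1, 2 * t1 - 1)[:, :t1]  # same steps as rel_shift
print(shifted)
# [[ 0. -1. -2.]
#  [ 1.  0. -1.]
#  [ 2.  1.  0.]]   row i, column j now holds relative offset i - j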
......@@ -139,3 +139,76 @@ class ScaledPositionalEncoding(PositionalEncoding):
T = paddle.shape(x)[1]
x = x + self.alpha * self.pe[:, :T]
return self.dropout(x)
class RelPositionalEncoding(paddle.nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
"""Construct an PositionalEncoding object."""
super(RelPositionalEncoding, self).__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = paddle.nn.Dropout(p=dropout_rate)
self.pe = None
self.dtype = dtype
self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len)))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1:
return
# Suppose `i` is the position of the query vector and `j` is the
# position of the key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
x_shape = paddle.shape(x)
pe_positive = paddle.zeros([x_shape[1], self.d_model])
pe_negative = paddle.zeros([x_shape[1], self.d_model])
position = paddle.arange(0, x_shape[1], dtype=self.dtype).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
-(math.log(10000.0) / self.d_model))
pe_positive[:, 0::2] = paddle.sin(position * div_term)
pe_positive[:, 1::2] = paddle.cos(position * div_term)
pe_negative[:, 0::2] = paddle.sin(-1 * position * div_term)
pe_negative[:, 1::2] = paddle.cos(-1 * position * div_term)
# Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive = paddle.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = paddle.concat([pe_positive, pe_negative], axis=1)
self.pe = pe
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
return self.dropout(x), self.dropout(pos_emb)
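# Minimal usage sketch (not from this patch), assuming the import path that the
# encoder below uses for this class; batch/time/d_model values are illustrative.
import paddle
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding

rel_pos_enc = RelPositionalEncoding(d_model=256, dropout_rate=0.1)
feat = paddle.randn([2, 50, 256])          # (batch, time, d_model)
scaled_feat, pos_emb = rel_pos_enc(feat)
print(scaled_feat.shape)                   # [2, 50, 256]  (x * sqrt(d_model), then dropout)
print(pos_emb.shape)                       # [1, 99, 256]  (2 * time - 1 relative positions)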
......@@ -12,15 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union
from paddle import nn
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(nn.Layer):
......@@ -46,9 +57,6 @@ class Encoder(nn.Layer):
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
......@@ -60,98 +68,137 @@ class Encoder(nn.Layer):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
Indices of intermediate CTC layers (1-based).
If not None, intermediate outputs are also returned, which changes the
return signature.
encoder_type : str
Encoder architecture type: "transformer" or "conformer".
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
pos_enc_class=PositionalEncoding,
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
selfattention_layer_type="selfattn",
padding_idx=-1, ):
def __init__(self,
idim: int,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None,
encoder_type: str="transformer"):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
super().__init__()
activation = get_activation(activation_type)
pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
selfattention_layer_type)
self.encoder_type = encoder_type
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = nn.Sequential(
nn.Linear(idim, attention_dim, bias_attr=True),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.embed = self.get_embed(
idim=idim,
input_layer=input_layer,
attention_dim=attention_dim,
pos_enc_class=pos_enc_class,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
padding_idx=padding_idx)
self.normalize_before = normalize_before
# self-attention module definition
encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
selfattention_layer_type=selfattention_layer_type,
attention_heads=attention_heads,
attention_dim=attention_dim,
attention_dropout_rate=attention_dropout_rate,
zero_triu=zero_triu,
pos_enc_layer_type=pos_enc_layer_type)
# feed-forward module definition
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type,
attention_dim,
linear_units,
dropout_rate,
positionwise_conv_kernel_size, )
if selfattention_layer_type in [
"selfattn",
"rel_selfattn",
"legacy_rel_selfattn",
]:
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )
] * num_blocks
positionwise_layer_type, attention_dim, linear_units, dropout_rate,
positionwise_conv_kernel_size, activation)
else:
raise NotImplementedError(selfattention_layer_type)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
if self.encoder_type == "transformer":
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
elif self.encoder_type == "conformer":
self.encoders = repeat(
num_blocks,
lambda lnum: ConformerEncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.intermediate_layers = intermediate_layers
else:
raise NotImplementedError("Support only linear or conv1d.")
if self.normalize_before:
self.after_norm = nn.LayerNorm(attention_dim)
def get_positionwise_layer(
self,
positionwise_layer_type="linear",
attention_dim=256,
linear_units=2048,
dropout_rate=0.1,
positionwise_conv_kernel_size=1, ):
self.after_norm = LayerNorm(attention_dim)
def get_positionwise_layer(self,
positionwise_layer_type: str="linear",
attention_dim: int=256,
linear_units: int=2048,
dropout_rate: float=0.1,
positionwise_conv_kernel_size: int=1,
activation: nn.Layer=nn.ReLU()):
"""Define positionwise layer."""
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate)
dropout_rate, activation)
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
......@@ -166,6 +213,81 @@ class Encoder(nn.Layer):
raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args
def get_encoder_selfattn_layer(self,
selfattention_layer_type: str="selfattn",
attention_heads: int=4,
attention_dim: int=256,
attention_dropout_rate: float=0.0,
zero_triu: bool=False,
pos_enc_layer_type: str="abs_pos"):
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
return encoder_selfattn_layer, encoder_selfattn_layer_args
def get_pos_enc_class(self,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn"):
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
return pos_enc_class
def get_embed(self,
idim,
input_layer="conv2d",
attention_dim: int=256,
pos_enc_class=PositionalEncoding,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
padding_idx: int=-1):
if input_layer == "linear":
embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
return embed
def forward(self, xs, masks):
"""Encode input sequence.
......@@ -174,21 +296,55 @@ class Encoder(nn.Layer):
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
"""
if self.encoder_type == "transformer":
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
elif self.encoder_type == "conformer":
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
else:
raise ValueError(f"{self.encoder_type} is not supported.")
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.
......
......@@ -18,38 +18,6 @@ import paddle
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class TooShortUttError(Exception):
"""Raised when the utt is too short for subsampling.
Parameters
----------
message : str
Message for error catch
actual_size : int
the short size that cannot pass the subsampling
limit : int
the limit size for subsampling
"""
def __init__(self, message, actual_size, limit):
"""Construct a TooShortUttError for error handler."""
super().__init__(message)
self.actual_size = actual_size
self.limit = limit
def check_short_utt(ins, size):
"""Check if the utterance is too short for subsampling."""
if isinstance(ins, Conv2dSubsampling2) and size < 3:
return True, 3
if isinstance(ins, Conv2dSubsampling) and size < 7:
return True, 7
if isinstance(ins, Conv2dSubsampling6) and size < 11:
return True, 11
if isinstance(ins, Conv2dSubsampling8) and size < 15:
return True, 15
return False, -1
class Conv2dSubsampling(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
Parameters
......@@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling2(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/2 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling2 object."""
super(Conv2dSubsampling2, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 1),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 2.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 2.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:1]
def __getitem__(self, key):
"""Get item.
When reset_parameters() is called, if use_scaled_pos_enc is used,
return the positional encoding.
"""
if key != -1:
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling6(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/6 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling6 object."""
super(Conv2dSubsampling6, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 5, 3),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 6.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 6.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-4:3]
class Conv2dSubsampling8(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/8 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling8 object."""
super(Conv2dSubsampling8, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 8.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 8.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
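# Quick check (not from this patch): the mask slicings in the three forward()
# methods above shrink an illustrative 100-frame mask in step with the
# convolutional length reduction (1/2, 1/6 and 1/8 respectively).
import paddle

demo_mask = paddle.ones([1, 1, 100], dtype="bool")                 # (batch, 1, time)
print(demo_mask[:, :, :-2:2][:, :, :-2:1].shape)                   # Conv2dSubsampling2 -> [1, 1, 47]
print(demo_mask[:, :, :-2:2][:, :, :-4:3].shape)                   # Conv2dSubsampling6 -> [1, 1, 15]
print(demo_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2].shape)      # Conv2dSubsampling8 -> [1, 1, 11]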