Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
bc0dd511
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
bc0dd511
编写于
11月 19, 2021
作者:
小湉湉
提交者:
root
11月 24, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of github.com:PaddlePaddle/PaddleSpeech into HEAD
上级
4370c5cf
b8ead782
变更
38
隐藏空白更改
内联
并排
Showing
38 changed file
with
970 addition
and
1034 deletion
+970
-1034
examples/aishell/s1/run.sh
examples/aishell/s1/run.sh
+4
-4
examples/aishell3/tts3/conf/default.yaml
examples/aishell3/tts3/conf/default.yaml
+1
-2
examples/aishell3/tts3/run.sh
examples/aishell3/tts3/run.sh
+0
-1
examples/aishell3/vc0/local/preprocess.sh
examples/aishell3/vc0/local/preprocess.sh
+1
-1
examples/aishell3/vc1/conf/default.yaml
examples/aishell3/vc1/conf/default.yaml
+0
-1
examples/csmsc/tts3/conf/conformer.yaml
examples/csmsc/tts3/conf/conformer.yaml
+109
-0
examples/csmsc/tts3/conf/default.yaml
examples/csmsc/tts3/conf/default.yaml
+0
-1
examples/csmsc/voc1/conf/default.yaml
examples/csmsc/voc1/conf/default.yaml
+1
-1
examples/librispeech/s1/run.sh
examples/librispeech/s1/run.sh
+4
-4
examples/librispeech/s2/conf/transformer.yaml
examples/librispeech/s2/conf/transformer.yaml
+2
-0
examples/librispeech/s2/run.sh
examples/librispeech/s2/run.sh
+4
-4
examples/ljspeech/tts3/conf/default.yaml
examples/ljspeech/tts3/conf/default.yaml
+0
-1
examples/other/ge2e/path.sh
examples/other/ge2e/path.sh
+1
-1
examples/ted_en_zh/t0/run.sh
examples/ted_en_zh/t0/run.sh
+4
-4
examples/timit/s1/run.sh
examples/timit/s1/run.sh
+4
-4
examples/tiny/s1/run.sh
examples/tiny/s1/run.sh
+4
-4
examples/vctk/tts3/conf/default.yaml
examples/vctk/tts3/conf/default.yaml
+0
-1
paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py
paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py
+8
-4
paddlespeech/s2t/models/u2/u2.py
paddlespeech/s2t/models/u2/u2.py
+1
-1
paddlespeech/t2s/datasets/am_batch_fn.py
paddlespeech/t2s/datasets/am_batch_fn.py
+14
-5
paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
...lespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+3
-3
paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
...peech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+3
-3
paddlespeech/t2s/exps/fastspeech2/normalize.py
paddlespeech/t2s/exps/fastspeech2/normalize.py
+4
-0
paddlespeech/t2s/exps/fastspeech2/preprocess.py
paddlespeech/t2s/exps/fastspeech2/preprocess.py
+31
-7
paddlespeech/t2s/exps/fastspeech2/synthesize.py
paddlespeech/t2s/exps/fastspeech2/synthesize.py
+23
-8
paddlespeech/t2s/exps/fastspeech2/train.py
paddlespeech/t2s/exps/fastspeech2/train.py
+21
-13
paddlespeech/t2s/exps/ge2e/random_cycle.py
paddlespeech/t2s/exps/ge2e/random_cycle.py
+0
-38
paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
+0
-131
paddlespeech/t2s/exps/ge2e/train.py
paddlespeech/t2s/exps/ge2e/train.py
+0
-123
paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
...ch/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
+2
-2
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+245
-55
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
+11
-2
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+26
-24
paddlespeech/t2s/modules/conformer/encoder.py
paddlespeech/t2s/modules/conformer/encoder.py
+0
-274
paddlespeech/t2s/modules/transformer/attention.py
paddlespeech/t2s/modules/transformer/attention.py
+115
-5
paddlespeech/t2s/modules/transformer/embedding.py
paddlespeech/t2s/modules/transformer/embedding.py
+85
-12
paddlespeech/t2s/modules/transformer/encoder.py
paddlespeech/t2s/modules/transformer/encoder.py
+239
-83
paddlespeech/t2s/modules/transformer/subsampling.py
paddlespeech/t2s/modules/transformer/subsampling.py
+0
-207
未找到文件。
examples/aishell/s1/run.sh
浏览文件 @
bc0dd511
...
@@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...
@@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
#
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
0 ./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
# Optionally, you can add LM and test it with runtime.
# Optionally, you can add LM and test it with runtime.
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
...
...
examples/aishell3/tts3/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
...
@@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction.
# DATA SETTING #
# DATA SETTING #
###########################################################
###########################################################
batch_size
:
64
batch_size
:
64
num_workers
:
4
num_workers
:
2
###########################################################
###########################################################
...
@@ -45,7 +45,6 @@ model:
...
@@ -45,7 +45,6 @@ model:
postnet_layers
:
5
# number of layers of postnset
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
use_masking
:
True
# whether to apply masking for padded part in loss calculation
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
...
...
examples/aishell3/tts3/run.sh
浏览文件 @
bc0dd511
...
@@ -7,7 +7,6 @@ gpus=0,1
...
@@ -7,7 +7,6 @@ gpus=0,1
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
conf_path
=
conf/default.yaml
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_482.pdz
ckpt_name
=
snapshot_iter_482.pdz
...
...
examples/aishell3/vc0/local/preprocess.sh
浏览文件 @
bc0dd511
...
@@ -9,7 +9,7 @@ alignment=$3
...
@@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path
=
$4
ge2e_ckpt_path
=
$4
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
python3
${
BIN_DIR
}
/../..
/ge2e/inference.py
\
python3
${
MAIN_ROOT
}
/paddlespeech/vector/exps
/ge2e/inference.py
\
--input
=
${
input
}
/wav
\
--input
=
${
input
}
/wav
\
--output
=
${
preprocess_path
}
/embed
\
--output
=
${
preprocess_path
}
/embed
\
--checkpoint_path
=
${
ge2e_ckpt_path
}
--checkpoint_path
=
${
ge2e_ckpt_path
}
...
...
examples/aishell3/vc1/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -45,7 +45,6 @@ model:
...
@@ -45,7 +45,6 @@ model:
postnet_layers
:
5
# number of layers of postnset
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
use_masking
:
True
# whether to apply masking for padded part in loss calculation
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
...
...
examples/csmsc/tts3/conf/conformer.yaml
0 → 100644
浏览文件 @
bc0dd511
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# sr
n_fft
:
2048
# FFT size.
n_shift
:
300
# Hop size.
win_length
:
1200
# Window length.
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
# Only used for feats_type != raw
fmin
:
80
# Minimum frequency of Mel basis.
fmax
:
7600
# Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min
:
80
# Maximum f0 for pitch extraction.
f0max
:
400
# Minimum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size
:
64
num_workers
:
4
###########################################################
# MODEL SETTING #
###########################################################
model
:
adim
:
384
# attention dimension
aheads
:
2
# number of attention heads
elayers
:
4
# number of encoder layers
eunits
:
1536
# number of encoder ff units
dlayers
:
4
# number of decoder layers
dunits
:
1536
# number of decoder ff units
positionwise_layer_type
:
conv1d
# type of position-wise layer
positionwise_conv_kernel_size
:
3
# kernel size of position wise conv layer
duration_predictor_layers
:
2
# number of layers of duration predictor
duration_predictor_chans
:
256
# number of channels of duration predictor
duration_predictor_kernel_size
:
3
# filter size of duration predictor
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
reduction_factor
:
1
# reduction factor
encoder_type
:
conformer
# encoder type
decoder_type
:
conformer
# decoder type
conformer_pos_enc_layer_type
:
rel_pos
# conformer positional encoding type
conformer_self_attn_layer_type
:
rel_selfattn
# conformer self-attention type
conformer_activation_type
:
swish
# conformer activation type
use_macaron_style_in_conformer
:
true
# whether to use macaron style in conformer
use_cnn_in_conformer
:
true
# whether to use CNN in conformer
conformer_enc_kernel_size
:
7
# kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size
:
31
# kernel size in CNN module of conformer-based decoder
init_type
:
xavier_uniform
# initialization type
transformer_enc_dropout_rate
:
0.2
# dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate
:
0.2
# dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate
:
0.2
# dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate
:
0.2
# dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate
:
0.2
# dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate
:
0.2
# dropout rate for transformer decoder attention layer
pitch_predictor_layers
:
5
# number of conv layers in pitch predictor
pitch_predictor_chans
:
256
# number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size
:
5
# kernel size of conv leyers in pitch predictor
pitch_predictor_dropout
:
0.5
# dropout rate in pitch predictor
pitch_embed_kernel_size
:
1
# kernel size of conv embedding layer for pitch
pitch_embed_dropout
:
0.0
# dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor
:
true
# whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers
:
2
# number of conv layers in energy predictor
energy_predictor_chans
:
256
# number of channels of conv layers in energy predictor
energy_predictor_kernel_size
:
3
# kernel size of conv leyers in energy predictor
energy_predictor_dropout
:
0.5
# dropout rate in energy predictor
energy_embed_kernel_size
:
1
# kernel size of conv embedding layer for energy
energy_embed_dropout
:
0.0
# dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor
:
false
# whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater
:
use_masking
:
True
# whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch
:
1000
num_snapshots
:
5
###########################################################
# OTHER SETTING #
###########################################################
seed
:
10086
examples/csmsc/tts3/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -45,7 +45,6 @@ model:
...
@@ -45,7 +45,6 @@ model:
postnet_layers
:
5
# number of layers of postnset
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
use_masking
:
True
# whether to apply masking for padded part in loss calculation
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
...
...
examples/csmsc/voc1/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
...
@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size
:
8
# Batch size.
batch_size
:
8
# Batch size.
batch_max_steps
:
25500
# Length of each audio in batch. Make sure dividable by hop_size.
batch_max_steps
:
25500
# Length of each audio in batch. Make sure dividable by hop_size.
pin_memory
:
true
# Whether to pin memory in Pytorch DataLoader.
pin_memory
:
true
# Whether to pin memory in Pytorch DataLoader.
num_workers
:
4
# Number of workers in Pytorch DataLoader.
num_workers
:
2
# Number of workers in Pytorch DataLoader.
remove_short_samples
:
true
# Whether to remove samples the length of which are less than batch_max_steps.
remove_short_samples
:
true
# Whether to remove samples the length of which are less than batch_max_steps.
allow_cache
:
true
# Whether to allow cache in dataset. If true, it requires cpu memory.
allow_cache
:
true
# Whether to allow cache in dataset. If true, it requires cpu memory.
...
...
examples/librispeech/s1/run.sh
浏览文件 @
bc0dd511
...
@@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...
@@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
#
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
# test a single .wav file
# test a single .wav file
...
...
examples/librispeech/s2/conf/transformer.yaml
浏览文件 @
bc0dd511
# https://yaml.org/type/float.html
# https://yaml.org/type/float.html
# network architecture
# network architecture
model
:
model
:
cmvn_file
:
cmvn_file_type
:
"
json"
# encoder related
# encoder related
encoder
:
transformer
encoder
:
transformer
encoder_conf
:
encoder_conf
:
...
...
examples/librispeech/s2/run.sh
浏览文件 @
bc0dd511
...
@@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
...
@@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
dict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
${
dict_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
6
]
&&
[
${
stop_stage
}
-ge
6
]
;
then
#
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
if
[
${
stage
}
-le
7
]
&&
[
${
stop_stage
}
-ge
7
]
;
then
if
[
${
stage
}
-le
7
]
&&
[
${
stop_stage
}
-ge
7
]
;
then
./local/cacu_perplexity.sh
||
exit
-1
./local/cacu_perplexity.sh
||
exit
-1
...
...
examples/ljspeech/tts3/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -45,7 +45,6 @@ model:
...
@@ -45,7 +45,6 @@ model:
postnet_layers
:
5
# number of layers of postnset
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
use_masking
:
True
# whether to apply masking for padded part in loss calculation
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
...
...
examples/other/ge2e/path.sh
浏览文件 @
bc0dd511
...
@@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
...
@@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
MODEL
=
ge2e
MODEL
=
ge2e
export
BIN_DIR
=
${
MAIN_ROOT
}
/paddlespeech/
t2s
/exps/
${
MODEL
}
export
BIN_DIR
=
${
MAIN_ROOT
}
/paddlespeech/
vector
/exps/
${
MODEL
}
examples/ted_en_zh/t0/run.sh
浏览文件 @
bc0dd511
...
@@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
...
@@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
#
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
examples/timit/s1/run.sh
浏览文件 @
bc0dd511
...
@@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...
@@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0 ./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
#
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
examples/tiny/s1/run.sh
浏览文件 @
bc0dd511
...
@@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
...
@@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/align.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
fi
if
[
${
stage
}
-le
5
]
&&
[
${
stop_stage
}
-ge
5
]
;
then
#
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
#
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
#
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
#
fi
examples/vctk/tts3/conf/default.yaml
浏览文件 @
bc0dd511
...
@@ -45,7 +45,6 @@ model:
...
@@ -45,7 +45,6 @@ model:
postnet_layers
:
5
# number of layers of postnset
postnet_layers
:
5
# number of layers of postnset
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_filts
:
5
# filter size of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
postnet_chans
:
256
# number of channels of conv layers in postnet
use_masking
:
True
# whether to apply masking for padded part in loss calculation
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
use_scaled_pos_enc
:
True
# whether to use scaled positional encoding
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
encoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
decoder_normalize_before
:
True
# whether to perform layer normalization before the input
...
...
paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py
浏览文件 @
bc0dd511
...
@@ -126,8 +126,12 @@ decoders_module = [
...
@@ -126,8 +126,12 @@ decoders_module = [
]
]
setup
(
setup
(
name
=
'swig_decoders'
,
name
=
'paddlespeech_ctcdecoders'
,
version
=
'1.1'
,
version
=
'0.0.1a'
,
description
=
"""CTC decoders"""
,
description
=
"CTC decoders in paddlespeech"
,
author
=
"PaddlePaddle Speech and Language Team"
,
author_email
=
"paddlesl@baidu.com"
,
url
=
"https://github.com/PaddlePaddle/PaddleSpeech"
,
license
=
'Apache 2.0'
,
ext_modules
=
decoders_module
,
ext_modules
=
decoders_module
,
py_modules
=
[
'swig_decoders'
]
,
)
py_modules
=
[
'swig_decoders'
])
paddlespeech/s2t/models/u2/u2.py
浏览文件 @
bc0dd511
...
@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
...
@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
"""
# cmvn
# cmvn
if
configs
[
'cmvn_file'
]
is
not
None
:
if
'cmvn_file'
in
configs
and
configs
[
'cmvn_file'
]
is
not
None
:
mean
,
istd
=
load_cmvn
(
configs
[
'cmvn_file'
],
mean
,
istd
=
load_cmvn
(
configs
[
'cmvn_file'
],
configs
[
'cmvn_file_type'
])
configs
[
'cmvn_file_type'
])
global_cmvn
=
GlobalCMVN
(
global_cmvn
=
GlobalCMVN
(
...
...
paddlespeech/t2s/datasets/am_batch_fn.py
浏览文件 @
bc0dd511
...
@@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
...
@@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def
fastspeech2_multi_spk_batch_fn
(
examples
):
def
fastspeech2_multi_spk_batch_fn
(
examples
):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"
/"spk_emb"
]
text
=
[
np
.
array
(
item
[
"text"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
text
=
[
np
.
array
(
item
[
"text"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
speech
=
[
np
.
array
(
item
[
"speech"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
speech
=
[
np
.
array
(
item
[
"speech"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
pitch
=
[
np
.
array
(
item
[
"pitch"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
pitch
=
[
np
.
array
(
item
[
"pitch"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
...
@@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
...
@@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
speech_lengths
=
[
speech_lengths
=
[
np
.
array
(
item
[
"speech_lengths"
],
dtype
=
np
.
int64
)
for
item
in
examples
np
.
array
(
item
[
"speech_lengths"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
]
spk_id
=
[
np
.
array
(
item
[
"spk_id"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
text
=
batch_sequences
(
text
)
text
=
batch_sequences
(
text
)
pitch
=
batch_sequences
(
pitch
)
pitch
=
batch_sequences
(
pitch
)
...
@@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
...
@@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
energy
=
paddle
.
to_tensor
(
energy
)
energy
=
paddle
.
to_tensor
(
energy
)
text_lengths
=
paddle
.
to_tensor
(
text_lengths
)
text_lengths
=
paddle
.
to_tensor
(
text_lengths
)
speech_lengths
=
paddle
.
to_tensor
(
speech_lengths
)
speech_lengths
=
paddle
.
to_tensor
(
speech_lengths
)
spk_id
=
paddle
.
to_tensor
(
spk_id
)
batch
=
{
batch
=
{
"text"
:
text
,
"text"
:
text
,
...
@@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
...
@@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
"speech"
:
speech
,
"speech"
:
speech
,
"speech_lengths"
:
speech_lengths
,
"speech_lengths"
:
speech_lengths
,
"pitch"
:
pitch
,
"pitch"
:
pitch
,
"energy"
:
energy
,
"energy"
:
energy
"spk_id"
:
spk_id
}
}
# spk_emb has a higher priority than spk_id
if
"spk_emb"
in
examples
[
0
]:
spk_emb
=
[
np
.
array
(
item
[
"spk_emb"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
spk_emb
=
batch_sequences
(
spk_emb
)
spk_emb
=
paddle
.
to_tensor
(
spk_emb
)
batch
[
"spk_emb"
]
=
spk_emb
elif
"spk_id"
in
examples
[
0
]:
spk_id
=
[
np
.
array
(
item
[
"spk_id"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
spk_id
=
paddle
.
to_tensor
(
spk_id
)
batch
[
"spk_id"
]
=
spk_id
return
batch
return
batch
...
...
paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
浏览文件 @
bc0dd511
...
@@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
...
@@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print
(
"vocab_size:"
,
vocab_size
)
print
(
"vocab_size:"
,
vocab_size
)
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
num_speakers
=
len
(
spk_id
)
spk_num
=
len
(
spk_id
)
print
(
"
num_speakers:"
,
num_speakers
)
print
(
"
spk_num:"
,
spk_num
)
odim
=
fastspeech2_config
.
n_mels
odim
=
fastspeech2_config
.
n_mels
model
=
FastSpeech2
(
model
=
FastSpeech2
(
idim
=
vocab_size
,
idim
=
vocab_size
,
odim
=
odim
,
odim
=
odim
,
num_speakers
=
num_speakers
,
spk_num
=
spk_num
,
**
fastspeech2_config
[
"model"
])
**
fastspeech2_config
[
"model"
])
model
.
set_state_dict
(
model
.
set_state_dict
(
...
...
paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
浏览文件 @
bc0dd511
...
@@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
...
@@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print
(
"vocab_size:"
,
vocab_size
)
print
(
"vocab_size:"
,
vocab_size
)
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
num_speakers
=
len
(
spk_id
)
spk_num
=
len
(
spk_id
)
print
(
"
num_speakers:"
,
num_speakers
)
print
(
"
spk_num:"
,
spk_num
)
odim
=
fastspeech2_config
.
n_mels
odim
=
fastspeech2_config
.
n_mels
model
=
FastSpeech2
(
model
=
FastSpeech2
(
idim
=
vocab_size
,
idim
=
vocab_size
,
odim
=
odim
,
odim
=
odim
,
num_speakers
=
num_speakers
,
spk_num
=
spk_num
,
**
fastspeech2_config
[
"model"
])
**
fastspeech2_config
[
"model"
])
model
.
set_state_dict
(
model
.
set_state_dict
(
...
...
paddlespeech/t2s/exps/fastspeech2/normalize.py
浏览文件 @
bc0dd511
...
@@ -167,6 +167,10 @@ def main():
...
@@ -167,6 +167,10 @@ def main():
"pitch"
:
str
(
pitch_path
),
"pitch"
:
str
(
pitch_path
),
"energy"
:
str
(
energy_path
)
"energy"
:
str
(
energy_path
)
}
}
# add spk_emb for voice cloning
if
"spk_emb"
in
item
:
record
[
"spk_emb"
]
=
str
(
item
[
"spk_emb"
])
output_metadata
.
append
(
record
)
output_metadata
.
append
(
record
)
output_metadata
.
sort
(
key
=
itemgetter
(
'utt_id'
))
output_metadata
.
sort
(
key
=
itemgetter
(
'utt_id'
))
output_metadata_path
=
Path
(
args
.
dumpdir
)
/
"metadata.jsonl"
output_metadata_path
=
Path
(
args
.
dumpdir
)
/
"metadata.jsonl"
...
...
paddlespeech/t2s/exps/fastspeech2/preprocess.py
浏览文件 @
bc0dd511
...
@@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
...
@@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
mel_extractor
=
None
,
mel_extractor
=
None
,
pitch_extractor
=
None
,
pitch_extractor
=
None
,
energy_extractor
=
None
,
energy_extractor
=
None
,
cut_sil
:
bool
=
True
):
cut_sil
:
bool
=
True
,
spk_emb_dir
:
Path
=
None
):
utt_id
=
fp
.
stem
utt_id
=
fp
.
stem
# for vctk
# for vctk
if
utt_id
.
endswith
(
"_mic2"
):
if
utt_id
.
endswith
(
"_mic2"
):
...
@@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
...
@@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
"energy"
:
str
(
energy_path
),
"energy"
:
str
(
energy_path
),
"speaker"
:
speaker
"speaker"
:
speaker
}
}
if
spk_emb_dir
:
if
speaker
in
os
.
listdir
(
spk_emb_dir
):
embed_name
=
utt_id
+
".npy"
embed_path
=
spk_emb_dir
/
speaker
/
embed_name
if
embed_path
.
is_file
():
record
[
"spk_emb"
]
=
str
(
embed_path
)
else
:
return
None
return
record
return
record
...
@@ -127,13 +136,14 @@ def process_sentences(config,
...
@@ -127,13 +136,14 @@ def process_sentences(config,
pitch_extractor
=
None
,
pitch_extractor
=
None
,
energy_extractor
=
None
,
energy_extractor
=
None
,
nprocs
:
int
=
1
,
nprocs
:
int
=
1
,
cut_sil
:
bool
=
True
):
cut_sil
:
bool
=
True
,
spk_emb_dir
:
Path
=
None
):
if
nprocs
==
1
:
if
nprocs
==
1
:
results
=
[]
results
=
[]
for
fp
in
fps
:
for
fp
in
fps
:
record
=
process_sentence
(
config
,
fp
,
sentences
,
output_dir
,
record
=
process_sentence
(
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
)
energy_extractor
,
cut_sil
,
spk_emb_dir
)
if
record
:
if
record
:
results
.
append
(
record
)
results
.
append
(
record
)
else
:
else
:
...
@@ -144,7 +154,7 @@ def process_sentences(config,
...
@@ -144,7 +154,7 @@ def process_sentences(config,
future
=
pool
.
submit
(
process_sentence
,
config
,
fp
,
future
=
pool
.
submit
(
process_sentence
,
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
)
cut_sil
,
spk_emb_dir
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
(
future
)
futures
.
append
(
future
)
...
@@ -202,6 +212,11 @@ def main():
...
@@ -202,6 +212,11 @@ def main():
default
=
True
,
default
=
True
,
help
=
"whether cut sil in the edge of audio"
)
help
=
"whether cut sil in the edge of audio"
)
parser
.
add_argument
(
"--spk_emb_dir"
,
default
=
None
,
type
=
str
,
help
=
"directory to speaker embedding files."
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
rootdir
=
Path
(
args
.
rootdir
).
expanduser
()
rootdir
=
Path
(
args
.
rootdir
).
expanduser
()
...
@@ -211,6 +226,11 @@ def main():
...
@@ -211,6 +226,11 @@ def main():
dumpdir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
dumpdir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
dur_file
=
Path
(
args
.
dur_file
).
expanduser
()
dur_file
=
Path
(
args
.
dur_file
).
expanduser
()
if
args
.
spk_emb_dir
:
spk_emb_dir
=
Path
(
args
.
spk_emb_dir
).
expanduser
().
resolve
()
else
:
spk_emb_dir
=
None
assert
rootdir
.
is_dir
()
assert
rootdir
.
is_dir
()
assert
dur_file
.
is_file
()
assert
dur_file
.
is_file
()
...
@@ -251,6 +271,7 @@ def main():
...
@@ -251,6 +271,7 @@ def main():
test_wav_files
+=
wav_files
[
-
sub_num_dev
:]
test_wav_files
+=
wav_files
[
-
sub_num_dev
:]
else
:
else
:
train_wav_files
+=
wav_files
train_wav_files
+=
wav_files
elif
args
.
dataset
==
"ljspeech"
:
elif
args
.
dataset
==
"ljspeech"
:
wav_files
=
sorted
(
list
((
rootdir
/
"wavs"
).
rglob
(
"*.wav"
)))
wav_files
=
sorted
(
list
((
rootdir
/
"wavs"
).
rglob
(
"*.wav"
)))
# split data into 3 sections
# split data into 3 sections
...
@@ -317,7 +338,8 @@ def main():
...
@@ -317,7 +338,8 @@ def main():
pitch_extractor
,
pitch_extractor
,
energy_extractor
,
energy_extractor
,
nprocs
=
args
.
num_cpu
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
)
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
if
dev_wav_files
:
if
dev_wav_files
:
process_sentences
(
process_sentences
(
config
,
config
,
...
@@ -327,7 +349,8 @@ def main():
...
@@ -327,7 +349,8 @@ def main():
mel_extractor
,
mel_extractor
,
pitch_extractor
,
pitch_extractor
,
energy_extractor
,
energy_extractor
,
cut_sil
=
args
.
cut_sil
)
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
if
test_wav_files
:
if
test_wav_files
:
process_sentences
(
process_sentences
(
config
,
config
,
...
@@ -338,7 +361,8 @@ def main():
...
@@ -338,7 +361,8 @@ def main():
pitch_extractor
,
pitch_extractor
,
energy_extractor
,
energy_extractor
,
nprocs
=
args
.
num_cpu
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
)
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
paddlespeech/t2s/exps/fastspeech2/synthesize.py
浏览文件 @
bc0dd511
...
@@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
...
@@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields
=
[
"utt_id"
,
"text"
]
fields
=
[
"utt_id"
,
"text"
]
spk_num
=
None
if
args
.
speaker_dict
is
not
None
:
if
args
.
speaker_dict
is
not
None
:
print
(
"multiple speaker fastspeech2!"
)
print
(
"multiple speaker fastspeech2!"
)
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
num_speakers
=
len
(
spk_id
)
spk_num
=
len
(
spk_id
)
fields
+=
[
"spk_id"
]
fields
+=
[
"spk_id"
]
elif
args
.
voice_cloning
:
print
(
"voice cloning!"
)
fields
+=
[
"spk_emb"
]
else
:
else
:
print
(
"single speaker fastspeech2!"
)
print
(
"single speaker fastspeech2!"
)
num_speakers
=
None
print
(
"spk_num:"
,
spk_num
)
print
(
"num_speakers:"
,
num_speakers
)
test_dataset
=
DataTable
(
data
=
test_metadata
,
fields
=
fields
)
test_dataset
=
DataTable
(
data
=
test_metadata
,
fields
=
fields
)
...
@@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
...
@@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
model
=
FastSpeech2
(
model
=
FastSpeech2
(
idim
=
vocab_size
,
idim
=
vocab_size
,
odim
=
odim
,
odim
=
odim
,
num_speakers
=
num_speakers
,
spk_num
=
spk_num
,
**
fastspeech2_config
[
"model"
])
**
fastspeech2_config
[
"model"
])
model
.
set_state_dict
(
model
.
set_state_dict
(
...
@@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
...
@@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for
datum
in
test_dataset
:
for
datum
in
test_dataset
:
utt_id
=
datum
[
"utt_id"
]
utt_id
=
datum
[
"utt_id"
]
text
=
paddle
.
to_tensor
(
datum
[
"text"
])
text
=
paddle
.
to_tensor
(
datum
[
"text"
])
if
"spk_id"
in
datum
:
spk_emb
=
None
spk_id
=
None
if
args
.
voice_cloning
and
"spk_emb"
in
datum
:
spk_emb
=
paddle
.
to_tensor
(
np
.
load
(
datum
[
"spk_emb"
]))
elif
"spk_id"
in
datum
:
spk_id
=
paddle
.
to_tensor
(
datum
[
"spk_id"
])
spk_id
=
paddle
.
to_tensor
(
datum
[
"spk_id"
])
else
:
spk_id
=
None
with
paddle
.
no_grad
():
with
paddle
.
no_grad
():
wav
=
pwg_inference
(
fastspeech2_inference
(
text
,
spk_id
=
spk_id
))
wav
=
pwg_inference
(
fastspeech2_inference
(
text
,
spk_id
=
spk_id
,
spk_emb
=
spk_emb
))
sf
.
write
(
sf
.
write
(
str
(
output_dir
/
(
utt_id
+
".wav"
)),
str
(
output_dir
/
(
utt_id
+
".wav"
)),
wav
.
numpy
(),
wav
.
numpy
(),
...
@@ -142,6 +148,15 @@ def main():
...
@@ -142,6 +148,15 @@ def main():
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
"speaker id map file for multiple speaker model."
)
help
=
"speaker id map file for multiple speaker model."
)
def
str2bool
(
str
):
return
True
if
str
.
lower
()
==
'true'
else
False
parser
.
add_argument
(
"--voice-cloning"
,
type
=
str2bool
,
default
=
False
,
help
=
"whether training voice cloning model."
)
parser
.
add_argument
(
"--test-metadata"
,
type
=
str
,
help
=
"test metadata."
)
parser
.
add_argument
(
"--test-metadata"
,
type
=
str
,
help
=
"test metadata."
)
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"output dir."
)
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"output dir."
)
parser
.
add_argument
(
parser
.
add_argument
(
...
...
paddlespeech/t2s/exps/fastspeech2/train.py
浏览文件 @
bc0dd511
...
@@ -61,18 +61,24 @@ def train_sp(args, config):
...
@@ -61,18 +61,24 @@ def train_sp(args, config):
"text"
,
"text_lengths"
,
"speech"
,
"speech_lengths"
,
"durations"
,
"text"
,
"text_lengths"
,
"speech"
,
"speech_lengths"
,
"durations"
,
"pitch"
,
"energy"
"pitch"
,
"energy"
]
]
converters
=
{
"speech"
:
np
.
load
,
"pitch"
:
np
.
load
,
"energy"
:
np
.
load
}
spk_num
=
None
if
args
.
speaker_dict
is
not
None
:
if
args
.
speaker_dict
is
not
None
:
print
(
"multiple speaker fastspeech2!"
)
print
(
"multiple speaker fastspeech2!"
)
collate_fn
=
fastspeech2_multi_spk_batch_fn
collate_fn
=
fastspeech2_multi_spk_batch_fn
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
num_speakers
=
len
(
spk_id
)
spk_num
=
len
(
spk_id
)
fields
+=
[
"spk_id"
]
fields
+=
[
"spk_id"
]
elif
args
.
voice_cloning
:
print
(
"Training voice cloning!"
)
collate_fn
=
fastspeech2_multi_spk_batch_fn
fields
+=
[
"spk_emb"
]
converters
[
"spk_emb"
]
=
np
.
load
else
:
else
:
print
(
"single speaker fastspeech2!"
)
print
(
"single speaker fastspeech2!"
)
collate_fn
=
fastspeech2_single_spk_batch_fn
collate_fn
=
fastspeech2_single_spk_batch_fn
num_speakers
=
None
print
(
"spk_num:"
,
spk_num
)
print
(
"num_speakers:"
,
num_speakers
)
# dataloader has been too verbose
# dataloader has been too verbose
logging
.
getLogger
(
"DataLoader"
).
disabled
=
True
logging
.
getLogger
(
"DataLoader"
).
disabled
=
True
...
@@ -83,17 +89,13 @@ def train_sp(args, config):
...
@@ -83,17 +89,13 @@ def train_sp(args, config):
train_dataset
=
DataTable
(
train_dataset
=
DataTable
(
data
=
train_metadata
,
data
=
train_metadata
,
fields
=
fields
,
fields
=
fields
,
converters
=
{
"speech"
:
np
.
load
,
converters
=
converters
,
)
"pitch"
:
np
.
load
,
"energy"
:
np
.
load
},
)
with
jsonlines
.
open
(
args
.
dev_metadata
,
'r'
)
as
reader
:
with
jsonlines
.
open
(
args
.
dev_metadata
,
'r'
)
as
reader
:
dev_metadata
=
list
(
reader
)
dev_metadata
=
list
(
reader
)
dev_dataset
=
DataTable
(
dev_dataset
=
DataTable
(
data
=
dev_metadata
,
data
=
dev_metadata
,
fields
=
fields
,
fields
=
fields
,
converters
=
{
"speech"
:
np
.
load
,
converters
=
converters
,
)
"pitch"
:
np
.
load
,
"energy"
:
np
.
load
},
)
# collate function and dataloader
# collate function and dataloader
...
@@ -127,10 +129,7 @@ def train_sp(args, config):
...
@@ -127,10 +129,7 @@ def train_sp(args, config):
odim
=
config
.
n_mels
odim
=
config
.
n_mels
model
=
FastSpeech2
(
model
=
FastSpeech2
(
idim
=
vocab_size
,
idim
=
vocab_size
,
odim
=
odim
,
spk_num
=
spk_num
,
**
config
[
"model"
])
odim
=
odim
,
num_speakers
=
num_speakers
,
**
config
[
"model"
])
if
world_size
>
1
:
if
world_size
>
1
:
model
=
DataParallel
(
model
)
model
=
DataParallel
(
model
)
print
(
"model done!"
)
print
(
"model done!"
)
...
@@ -184,6 +183,15 @@ def main():
...
@@ -184,6 +183,15 @@ def main():
default
=
None
,
default
=
None
,
help
=
"speaker id map file for multiple speaker model."
)
help
=
"speaker id map file for multiple speaker model."
)
def
str2bool
(
str
):
return
True
if
str
.
lower
()
==
'true'
else
False
parser
.
add_argument
(
"--voice-cloning"
,
type
=
str2bool
,
default
=
False
,
help
=
"whether training voice cloning model."
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
with
open
(
args
.
config
)
as
f
:
with
open
(
args
.
config
)
as
f
:
...
...
paddlespeech/t2s/exps/ge2e/random_cycle.py
已删除
100644 → 0
浏览文件 @
4370c5cf
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
random
def
cycle
(
iterable
):
# cycle('ABCD') --> A B C D A B C D A B C D ...
saved
=
[]
for
element
in
iterable
:
yield
element
saved
.
append
(
element
)
while
saved
:
for
element
in
saved
:
yield
element
def
random_cycle
(
iterable
):
# cycle('ABCD') --> A B C D B C D A A D B C ...
saved
=
[]
for
element
in
iterable
:
yield
element
saved
.
append
(
element
)
random
.
shuffle
(
saved
)
while
saved
:
for
element
in
saved
:
yield
element
random
.
shuffle
(
saved
)
paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
已删除
100644 → 0
浏览文件 @
4370c5cf
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
random
from
pathlib
import
Path
import
numpy
as
np
from
paddle.io
import
BatchSampler
from
paddle.io
import
Dataset
from
paddlespeech.t2s.exps.ge2e.random_cycle
import
random_cycle
class
MultiSpeakerMelDataset
(
Dataset
):
"""A 2 layer directory thatn contains mel spectrograms in *.npy format.
An Example file structure tree is shown below. We prefer to preprocess
raw datasets and organized them like this.
dataset_root/
speaker1/
utterance1.npy
utterance2.npy
utterance3.npy
speaker2/
utterance1.npy
utterance2.npy
utterance3.npy
"""
def
__init__
(
self
,
dataset_root
:
Path
):
self
.
root
=
Path
(
dataset_root
).
expanduser
()
speaker_dirs
=
[
f
for
f
in
self
.
root
.
glob
(
"*"
)
if
f
.
is_dir
()]
speaker_utterances
=
{
speaker_dir
:
list
(
speaker_dir
.
glob
(
"*.npy"
))
for
speaker_dir
in
speaker_dirs
}
self
.
speaker_dirs
=
speaker_dirs
self
.
speaker_to_utterances
=
speaker_utterances
# meta data
self
.
num_speakers
=
len
(
self
.
speaker_dirs
)
self
.
num_utterances
=
np
.
sum
(
len
(
utterances
)
for
speaker
,
utterances
in
self
.
speaker_to_utterances
.
items
())
def
get_example_by_index
(
self
,
speaker_index
,
utterance_index
):
speaker_dir
=
self
.
speaker_dirs
[
speaker_index
]
fpath
=
self
.
speaker_to_utterances
[
speaker_dir
][
utterance_index
]
return
self
[
fpath
]
def
__getitem__
(
self
,
fpath
):
return
np
.
load
(
fpath
)
def
__len__
(
self
):
return
int
(
self
.
num_utterances
)
class
MultiSpeakerSampler
(
BatchSampler
):
"""A multi-stratal sampler designed for speaker verification task.
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def
__init__
(
self
,
dataset
:
MultiSpeakerMelDataset
,
speakers_per_batch
:
int
,
utterances_per_speaker
:
int
):
self
.
_speakers
=
list
(
dataset
.
speaker_dirs
)
self
.
_speaker_to_utterances
=
dataset
.
speaker_to_utterances
self
.
speakers_per_batch
=
speakers_per_batch
self
.
utterances_per_speaker
=
utterances_per_speaker
def
__iter__
(
self
):
# yield list of Paths
speaker_generator
=
iter
(
random_cycle
(
self
.
_speakers
))
speaker_utterances_generator
=
{
s
:
iter
(
random_cycle
(
us
))
for
s
,
us
in
self
.
_speaker_to_utterances
.
items
()
}
while
True
:
speakers
=
[]
for
_
in
range
(
self
.
speakers_per_batch
):
speakers
.
append
(
next
(
speaker_generator
))
utterances
=
[]
for
s
in
speakers
:
us
=
speaker_utterances_generator
[
s
]
for
_
in
range
(
self
.
utterances_per_speaker
):
utterances
.
append
(
next
(
us
))
yield
utterances
class
RandomClip
(
object
):
def
__init__
(
self
,
frames
):
self
.
frames
=
frames
def
__call__
(
self
,
spec
):
# spec [T, C]
T
=
spec
.
shape
[
0
]
start
=
random
.
randint
(
0
,
T
-
self
.
frames
)
return
spec
[
start
:
start
+
self
.
frames
,
:]
class
Collate
(
object
):
def
__init__
(
self
,
num_frames
):
self
.
random_crop
=
RandomClip
(
num_frames
)
def
__call__
(
self
,
examples
):
frame_clips
=
[
self
.
random_crop
(
mel
)
for
mel
in
examples
]
batced_clips
=
np
.
stack
(
frame_clips
)
return
batced_clips
if
__name__
==
"__main__"
:
mydataset
=
MultiSpeakerMelDataset
(
Path
(
"/home/chenfeiyu/datasets/SV2TTS/encoder"
))
print
(
mydataset
.
get_example_by_index
(
0
,
10
))
paddlespeech/t2s/exps/ge2e/train.py
已删除
100644 → 0
浏览文件 @
4370c5cf
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
from
paddle
import
DataParallel
from
paddle
import
distributed
as
dist
from
paddle.io
import
DataLoader
from
paddle.nn.clip
import
ClipGradByGlobalNorm
from
paddle.optimizer
import
Adam
from
paddlespeech.t2s.exps.ge2e.config
import
get_cfg_defaults
from
paddlespeech.t2s.exps.ge2e.speaker_verification_dataset
import
Collate
from
paddlespeech.t2s.exps.ge2e.speaker_verification_dataset
import
MultiSpeakerMelDataset
from
paddlespeech.t2s.exps.ge2e.speaker_verification_dataset
import
MultiSpeakerSampler
from
paddlespeech.t2s.models.lstm_speaker_encoder
import
LSTMSpeakerEncoder
from
paddlespeech.t2s.training
import
default_argument_parser
from
paddlespeech.t2s.training
import
ExperimentBase
class
Ge2eExperiment
(
ExperimentBase
):
def
setup_model
(
self
):
config
=
self
.
config
model
=
LSTMSpeakerEncoder
(
config
.
data
.
n_mels
,
config
.
model
.
num_layers
,
config
.
model
.
hidden_size
,
config
.
model
.
embedding_size
)
optimizer
=
Adam
(
config
.
training
.
learning_rate_init
,
parameters
=
model
.
parameters
(),
grad_clip
=
ClipGradByGlobalNorm
(
3
))
self
.
model
=
DataParallel
(
model
)
if
self
.
parallel
else
model
self
.
model_core
=
model
self
.
optimizer
=
optimizer
def
setup_dataloader
(
self
):
config
=
self
.
config
train_dataset
=
MultiSpeakerMelDataset
(
self
.
args
.
data
)
sampler
=
MultiSpeakerSampler
(
train_dataset
,
config
.
training
.
speakers_per_batch
,
config
.
training
.
utterances_per_speaker
)
train_loader
=
DataLoader
(
train_dataset
,
batch_sampler
=
sampler
,
collate_fn
=
Collate
(
config
.
data
.
partial_n_frames
),
num_workers
=
16
)
self
.
train_dataset
=
train_dataset
self
.
train_loader
=
train_loader
def
train_batch
(
self
):
start
=
time
.
time
()
batch
=
self
.
read_batch
()
data_loader_time
=
time
.
time
()
-
start
self
.
optimizer
.
clear_grad
()
self
.
model
.
train
()
specs
=
batch
loss
,
eer
=
self
.
model
(
specs
,
self
.
config
.
training
.
speakers_per_batch
)
loss
.
backward
()
self
.
model_core
.
do_gradient_ops
()
self
.
optimizer
.
step
()
iteration_time
=
time
.
time
()
-
start
# logging
loss_value
=
float
(
loss
)
msg
=
"Rank: {}, "
.
format
(
dist
.
get_rank
())
msg
+=
"step: {}, "
.
format
(
self
.
iteration
)
msg
+=
"time: {:>.3f}s/{:>.3f}s, "
.
format
(
data_loader_time
,
iteration_time
)
msg
+=
'loss: {:>.6f} err: {:>.6f}'
.
format
(
loss_value
,
eer
)
self
.
logger
.
info
(
msg
)
if
dist
.
get_rank
()
==
0
:
self
.
visualizer
.
add_scalar
(
"train/loss"
,
loss_value
,
self
.
iteration
)
self
.
visualizer
.
add_scalar
(
"train/eer"
,
eer
,
self
.
iteration
)
self
.
visualizer
.
add_scalar
(
"param/w"
,
float
(
self
.
model_core
.
similarity_weight
),
self
.
iteration
)
self
.
visualizer
.
add_scalar
(
"param/b"
,
float
(
self
.
model_core
.
similarity_bias
),
self
.
iteration
)
def
valid
(
self
):
pass
def
main_sp
(
config
,
args
):
exp
=
Ge2eExperiment
(
config
,
args
)
exp
.
setup
()
exp
.
resume_or_load
()
exp
.
run
()
def
main
(
config
,
args
):
if
args
.
ngpu
>
1
:
dist
.
spawn
(
main_sp
,
args
=
(
config
,
args
),
nprocs
=
args
.
ngpu
)
else
:
main_sp
(
config
,
args
)
if
__name__
==
"__main__"
:
config
=
get_cfg_defaults
()
parser
=
default_argument_parser
()
args
=
parser
.
parse_args
()
if
args
.
config
:
config
.
merge_from_file
(
args
.
config
)
if
args
.
opts
:
config
.
merge_from_list
(
args
.
opts
)
config
.
freeze
()
print
(
config
)
print
(
args
)
main
(
config
,
args
)
paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
浏览文件 @
bc0dd511
...
@@ -20,14 +20,14 @@ import paddle
...
@@ -20,14 +20,14 @@ import paddle
import
soundfile
as
sf
import
soundfile
as
sf
from
matplotlib
import
pyplot
as
plt
from
matplotlib
import
pyplot
as
plt
from
paddlespeech.t2s.exps.ge2e.audio_processor
import
SpeakerVerificationPreprocessor
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3
import
voc_phones
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3
import
voc_phones
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3
import
voc_tones
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3
import
voc_tones
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p
import
convert_sentence
from
paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p
import
convert_sentence
from
paddlespeech.t2s.models.lstm_speaker_encoder
import
LSTMSpeakerEncoder
from
paddlespeech.t2s.models.tacotron2
import
Tacotron2
from
paddlespeech.t2s.models.tacotron2
import
Tacotron2
from
paddlespeech.t2s.models.waveflow
import
ConditionalWaveFlow
from
paddlespeech.t2s.models.waveflow
import
ConditionalWaveFlow
from
paddlespeech.t2s.utils
import
display
from
paddlespeech.t2s.utils
import
display
from
paddlespeech.vector.exps.ge2e.audio_processor
import
SpeakerVerificationPreprocessor
from
paddlespeech.vector.models.lstm_speaker_encoder
import
LSTMSpeakerEncoder
def
voice_cloning
(
args
):
def
voice_cloning
(
args
):
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
浏览文件 @
bc0dd511
...
@@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
...
@@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
from
paddlespeech.t2s.modules.predictor.length_regulator
import
LengthRegulator
from
paddlespeech.t2s.modules.predictor.length_regulator
import
LengthRegulator
from
paddlespeech.t2s.modules.predictor.variance_predictor
import
VariancePredictor
from
paddlespeech.t2s.modules.predictor.variance_predictor
import
VariancePredictor
from
paddlespeech.t2s.modules.tacotron2.decoder
import
Postnet
from
paddlespeech.t2s.modules.tacotron2.decoder
import
Postnet
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
from
paddlespeech.t2s.modules.transformer.encoder
import
Encoder
from
paddlespeech.t2s.modules.transformer.embedding
import
ScaledPositionalEncoding
from
paddlespeech.t2s.modules.transformer.encoder
import
Encoder
as
TransformerEncoder
class
FastSpeech2
(
nn
.
Layer
):
class
FastSpeech2
(
nn
.
Layer
):
...
@@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer):
...
@@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer):
postnet_layers
:
int
=
5
,
postnet_layers
:
int
=
5
,
postnet_chans
:
int
=
512
,
postnet_chans
:
int
=
512
,
postnet_filts
:
int
=
5
,
postnet_filts
:
int
=
5
,
postnet_dropout_rate
:
float
=
0.5
,
positionwise_layer_type
:
str
=
"conv1d"
,
positionwise_layer_type
:
str
=
"conv1d"
,
positionwise_conv_kernel_size
:
int
=
1
,
positionwise_conv_kernel_size
:
int
=
1
,
use_scaled_pos_enc
:
bool
=
True
,
use_scaled_pos_enc
:
bool
=
True
,
...
@@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer):
...
@@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer):
reduction_factor
:
int
=
1
,
reduction_factor
:
int
=
1
,
encoder_type
:
str
=
"transformer"
,
encoder_type
:
str
=
"transformer"
,
decoder_type
:
str
=
"transformer"
,
decoder_type
:
str
=
"transformer"
,
# for transformer
transformer_enc_dropout_rate
:
float
=
0.1
,
transformer_enc_positional_dropout_rate
:
float
=
0.1
,
transformer_enc_attn_dropout_rate
:
float
=
0.1
,
transformer_dec_dropout_rate
:
float
=
0.1
,
transformer_dec_positional_dropout_rate
:
float
=
0.1
,
transformer_dec_attn_dropout_rate
:
float
=
0.1
,
# for conformer
conformer_pos_enc_layer_type
:
str
=
"rel_pos"
,
conformer_self_attn_layer_type
:
str
=
"rel_selfattn"
,
conformer_activation_type
:
str
=
"swish"
,
use_macaron_style_in_conformer
:
bool
=
True
,
use_cnn_in_conformer
:
bool
=
True
,
zero_triu
:
bool
=
False
,
conformer_enc_kernel_size
:
int
=
7
,
conformer_dec_kernel_size
:
int
=
31
,
# duration predictor
# duration predictor
duration_predictor_layers
:
int
=
2
,
duration_predictor_layers
:
int
=
2
,
duration_predictor_chans
:
int
=
384
,
duration_predictor_chans
:
int
=
384
,
duration_predictor_kernel_size
:
int
=
3
,
duration_predictor_kernel_size
:
int
=
3
,
duration_predictor_dropout_rate
:
float
=
0.1
,
# energy predictor
# energy predictor
energy_predictor_layers
:
int
=
2
,
energy_predictor_layers
:
int
=
2
,
energy_predictor_chans
:
int
=
384
,
energy_predictor_chans
:
int
=
384
,
...
@@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer):
...
@@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout
:
float
=
0.5
,
pitch_embed_dropout
:
float
=
0.5
,
stop_gradient_from_pitch_predictor
:
bool
=
False
,
stop_gradient_from_pitch_predictor
:
bool
=
False
,
# spk emb
# spk emb
num_speakers
:
int
=
None
,
spk_num
:
int
=
None
,
spk_embed_dim
:
int
=
None
,
spk_embed_dim
:
int
=
None
,
spk_embed_integration_type
:
str
=
"add"
,
spk_embed_integration_type
:
str
=
"add"
,
#
tone emb
# tone emb
num_tones
:
int
=
None
,
tone_num
:
int
=
None
,
tone_embed_dim
:
int
=
None
,
tone_embed_dim
:
int
=
None
,
tone_embed_integration_type
:
str
=
"add"
,
tone_embed_integration_type
:
str
=
"add"
,
# training related
# training related
transformer_enc_dropout_rate
:
float
=
0.1
,
transformer_enc_positional_dropout_rate
:
float
=
0.1
,
transformer_enc_attn_dropout_rate
:
float
=
0.1
,
transformer_dec_dropout_rate
:
float
=
0.1
,
transformer_dec_positional_dropout_rate
:
float
=
0.1
,
transformer_dec_attn_dropout_rate
:
float
=
0.1
,
duration_predictor_dropout_rate
:
float
=
0.1
,
postnet_dropout_rate
:
float
=
0.5
,
init_type
:
str
=
"xavier_uniform"
,
init_type
:
str
=
"xavier_uniform"
,
init_enc_alpha
:
float
=
1.0
,
init_enc_alpha
:
float
=
1.0
,
init_dec_alpha
:
float
=
1.0
,
init_dec_alpha
:
float
=
1.0
,
):
use_masking
:
bool
=
False
,
"""Initialize FastSpeech2 module.
use_weighted_masking
:
bool
=
False
,
):
Parameters
"""Initialize FastSpeech2 module."""
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
adim : int
Attention dimension.
aheads : int
Number of attention heads.
elayers : int
Number of encoder layers.
eunits : int
Number of encoder hidden units.
dlayers : int
Number of decoder layers.
dunits : int
Number of decoder hidden units.
postnet_layers : int
Number of postnet layers.
postnet_chans : int
Number of postnet channels.
postnet_filts : int
Kernel size of postnet.
postnet_dropout_rate : float
Dropout rate in postnet.
use_scaled_pos_enc : bool
Whether to use trainable scaled pos encoding.
use_batch_norm : bool
Whether to use batch normalization in encoder prenet.
encoder_normalize_before : bool
Whether to apply layernorm layer before encoder block.
decoder_normalize_before : bool
Whether to apply layernorm layer before
decoder block.
encoder_concat_after : bool
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after : bool
Whether to concatenate attention layer's input and output in decoder.
reduction_factor : int
Reduction factor.
encoder_type : str
Encoder type ("transformer" or "conformer").
decoder_type : str
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate : float
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
positional encoding.
transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
self-attention module.
transformer_dec_dropout_rate (float): Dropout rate in decoder except
attention & positional encoding.
transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
positional encoding.
transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
self-attention module.
conformer_pos_enc_layer_type : str
Pos encoding layer type in conformer.
conformer_self_attn_layer_type : str
Self-attention layer type in conformer
conformer_activation_type : str
Activation function type in conformer.
use_macaron_style_in_conformer : bool
Whether to use macaron style FFN.
use_cnn_in_conformer : bool
Whether to use CNN in conformer.
zero_triu : bool
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size : int
Kernel size of encoder conformer.
conformer_dec_kernel_size : int
Kernel size of decoder conformer.
duration_predictor_layers : int
Number of duration predictor layers.
duration_predictor_chans : int
Number of duration predictor channels.
duration_predictor_kernel_size : int
Kernel size of duration predictor.
duration_predictor_dropout_rate : float
Dropout rate in duration predictor.
pitch_predictor_layers : int
Number of pitch predictor layers.
pitch_predictor_chans : int
Number of pitch predictor channels.
pitch_predictor_kernel_size : int
Kernel size of pitch predictor.
pitch_predictor_dropout_rate : float
Dropout rate in pitch predictor.
pitch_embed_kernel_size : float
Kernel size of pitch embedding.
pitch_embed_dropout_rate : float
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor : bool
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers : int
Number of energy predictor layers.
energy_predictor_chans : int
Number of energy predictor channels.
energy_predictor_kernel_size : int
Kernel size of energy predictor.
energy_predictor_dropout_rate : float
Dropout rate in energy predictor.
energy_embed_kernel_size : float
Kernel size of energy embedding.
energy_embed_dropout_rate : float
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor : bool
Whether to stop gradient from energy predictor to encoder.
spk_num : Optional[int]
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type : str
How to integrate speaker embedding.
tone_num : Optional[int]
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim : Optional[int]
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type : str
How to integrate tone embedding.
init_type : str
How to initialize transformer parameters.
init_enc_alpha : float
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float
Initial value of alpha in scaled pos encoding of the decoder.
"""
assert
check_argument_types
()
assert
check_argument_types
()
super
().
__init__
()
super
().
__init__
()
...
@@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer):
...
@@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer):
# initialize parameters
# initialize parameters
initialize
(
self
,
init_type
)
initialize
(
self
,
init_type
)
if
s
elf
.
spk_embed_dim
is
not
None
:
if
s
pk_num
and
self
.
spk_embed_dim
:
self
.
spk_embedding_table
=
nn
.
Embedding
(
self
.
spk_embedding_table
=
nn
.
Embedding
(
num_embeddings
=
num_speakers
,
num_embeddings
=
spk_num
,
embedding_dim
=
self
.
spk_embed_dim
,
embedding_dim
=
self
.
spk_embed_dim
,
padding_idx
=
self
.
padding_idx
)
padding_idx
=
self
.
padding_idx
)
if
self
.
tone_embed_dim
is
not
None
:
if
self
.
tone_embed_dim
is
not
None
:
self
.
tone_embedding_table
=
nn
.
Embedding
(
self
.
tone_embedding_table
=
nn
.
Embedding
(
num_embeddings
=
num_tones
,
num_embeddings
=
tone_num
,
embedding_dim
=
self
.
tone_embed_dim
,
embedding_dim
=
self
.
tone_embed_dim
,
padding_idx
=
self
.
padding_idx
)
padding_idx
=
self
.
padding_idx
)
# get positional encoding class
# get positional encoding layer type
pos_enc_class
=
(
ScaledPositionalEncoding
transformer_pos_enc_layer_type
=
"scaled_abs_pos"
if
self
.
use_scaled_pos_enc
else
"abs_pos"
if
self
.
use_scaled_pos_enc
else
PositionalEncoding
)
# define encoder
# define encoder
encoder_input_layer
=
nn
.
Embedding
(
encoder_input_layer
=
nn
.
Embedding
(
num_embeddings
=
idim
,
num_embeddings
=
idim
,
embedding_dim
=
adim
,
embedding_dim
=
adim
,
padding_idx
=
self
.
padding_idx
)
padding_idx
=
self
.
padding_idx
)
# add encoder type here
# 测试模型还能跑通不
# 记得改 transformer tts
if
encoder_type
==
"transformer"
:
if
encoder_type
==
"transformer"
:
self
.
encoder
=
TransformerEncoder
(
print
(
"encoder_type is transformer"
)
self
.
encoder
=
Encoder
(
idim
=
idim
,
attention_dim
=
adim
,
attention_heads
=
aheads
,
linear_units
=
eunits
,
num_blocks
=
elayers
,
input_layer
=
encoder_input_layer
,
dropout_rate
=
transformer_enc_dropout_rate
,
positional_dropout_rate
=
transformer_enc_positional_dropout_rate
,
attention_dropout_rate
=
transformer_enc_attn_dropout_rate
,
pos_enc_layer_type
=
transformer_pos_enc_layer_type
,
normalize_before
=
encoder_normalize_before
,
concat_after
=
encoder_concat_after
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
encoder_type
=
encoder_type
)
elif
encoder_type
==
"conformer"
:
print
(
"encoder_type is conformer"
)
self
.
encoder
=
Encoder
(
idim
=
idim
,
idim
=
idim
,
attention_dim
=
adim
,
attention_dim
=
adim
,
attention_heads
=
aheads
,
attention_heads
=
aheads
,
...
@@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer):
...
@@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer):
dropout_rate
=
transformer_enc_dropout_rate
,
dropout_rate
=
transformer_enc_dropout_rate
,
positional_dropout_rate
=
transformer_enc_positional_dropout_rate
,
positional_dropout_rate
=
transformer_enc_positional_dropout_rate
,
attention_dropout_rate
=
transformer_enc_attn_dropout_rate
,
attention_dropout_rate
=
transformer_enc_attn_dropout_rate
,
pos_enc_class
=
pos_enc_class
,
normalize_before
=
encoder_normalize_before
,
normalize_before
=
encoder_normalize_before
,
concat_after
=
encoder_concat_after
,
concat_after
=
encoder_concat_after
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
)
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
macaron_style
=
use_macaron_style_in_conformer
,
pos_enc_layer_type
=
conformer_pos_enc_layer_type
,
selfattention_layer_type
=
conformer_self_attn_layer_type
,
activation_type
=
conformer_activation_type
,
use_cnn_module
=
use_cnn_in_conformer
,
cnn_module_kernel
=
conformer_enc_kernel_size
,
zero_triu
=
zero_triu
,
encoder_type
=
encoder_type
)
else
:
else
:
raise
ValueError
(
f
"
{
encoder_type
}
is not supported."
)
raise
ValueError
(
f
"
{
encoder_type
}
is not supported."
)
...
@@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer):
...
@@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer):
# NOTE: we use encoder as decoder
# NOTE: we use encoder as decoder
# because fastspeech's decoder is the same as encoder
# because fastspeech's decoder is the same as encoder
if
decoder_type
==
"transformer"
:
if
decoder_type
==
"transformer"
:
self
.
decoder
=
TransformerEncoder
(
print
(
"decoder_type is transformer"
)
self
.
decoder
=
Encoder
(
idim
=
0
,
idim
=
0
,
attention_dim
=
adim
,
attention_dim
=
adim
,
attention_heads
=
aheads
,
attention_heads
=
aheads
,
...
@@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer):
...
@@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer):
dropout_rate
=
transformer_dec_dropout_rate
,
dropout_rate
=
transformer_dec_dropout_rate
,
positional_dropout_rate
=
transformer_dec_positional_dropout_rate
,
positional_dropout_rate
=
transformer_dec_positional_dropout_rate
,
attention_dropout_rate
=
transformer_dec_attn_dropout_rate
,
attention_dropout_rate
=
transformer_dec_attn_dropout_rate
,
pos_enc_
class
=
pos_enc_class
,
pos_enc_
layer_type
=
transformer_pos_enc_layer_type
,
normalize_before
=
decoder_normalize_before
,
normalize_before
=
decoder_normalize_before
,
concat_after
=
decoder_concat_after
,
concat_after
=
decoder_concat_after
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
)
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
encoder_type
=
decoder_type
)
elif
decoder_type
==
"conformer"
:
print
(
"decoder_type is conformer"
)
self
.
decoder
=
Encoder
(
idim
=
0
,
attention_dim
=
adim
,
attention_heads
=
aheads
,
linear_units
=
dunits
,
num_blocks
=
dlayers
,
input_layer
=
None
,
dropout_rate
=
transformer_dec_dropout_rate
,
positional_dropout_rate
=
transformer_dec_positional_dropout_rate
,
attention_dropout_rate
=
transformer_dec_attn_dropout_rate
,
normalize_before
=
decoder_normalize_before
,
concat_after
=
decoder_concat_after
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_conv_kernel_size
=
positionwise_conv_kernel_size
,
macaron_style
=
use_macaron_style_in_conformer
,
pos_enc_layer_type
=
conformer_pos_enc_layer_type
,
selfattention_layer_type
=
conformer_self_attn_layer_type
,
activation_type
=
conformer_activation_type
,
use_cnn_module
=
use_cnn_in_conformer
,
cnn_module_kernel
=
conformer_dec_kernel_size
,
encoder_type
=
decoder_type
)
else
:
else
:
raise
ValueError
(
f
"
{
decoder_type
}
is not supported."
)
raise
ValueError
(
f
"
{
decoder_type
}
is not supported."
)
...
@@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer):
...
@@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer):
pitch
:
paddle
.
Tensor
,
pitch
:
paddle
.
Tensor
,
energy
:
paddle
.
Tensor
,
energy
:
paddle
.
Tensor
,
tone_id
:
paddle
.
Tensor
=
None
,
tone_id
:
paddle
.
Tensor
=
None
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
spk_id
:
paddle
.
Tensor
=
None
spk_id
:
paddle
.
Tensor
=
None
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
"""Calculate forward propagation.
"""Calculate forward propagation.
...
@@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer):
...
@@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1).
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64)
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
Batch of padded tone ids (B, Tmax).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tnesor, optional(int64)
spk_id : Tnesor, optional(int64)
Batch of speaker ids (B,)
Batch of speaker ids (B,)
...
@@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer):
...
@@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer):
ps
,
ps
,
es
,
es
,
is_inference
=
False
,
is_inference
=
False
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
spk_id
=
spk_id
,
tone_id
=
tone_id
)
tone_id
=
tone_id
)
# modify mod part of groundtruth
# modify mod part of groundtruth
...
@@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer):
...
@@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer):
es
:
paddle
.
Tensor
=
None
,
es
:
paddle
.
Tensor
=
None
,
is_inference
:
bool
=
False
,
is_inference
:
bool
=
False
,
alpha
:
float
=
1.0
,
alpha
:
float
=
1.0
,
sp
embs
=
None
,
sp
k_emb
=
None
,
spk_id
=
None
,
spk_id
=
None
,
tone_id
=
None
)
->
Sequence
[
paddle
.
Tensor
]:
tone_id
=
None
)
->
Sequence
[
paddle
.
Tensor
]:
# forward encoder
# forward encoder
...
@@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer):
...
@@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding
# integrate speaker embedding
if
self
.
spk_embed_dim
is
not
None
:
if
self
.
spk_embed_dim
is
not
None
:
if
spembs
is
not
None
:
# spk_emb has a higher priority than spk_id
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
spembs
)
if
spk_emb
is
not
None
:
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
spk_emb
)
elif
spk_id
is
not
None
:
elif
spk_id
is
not
None
:
sp
embs
=
self
.
spk_embedding_table
(
spk_id
)
sp
k_emb
=
self
.
spk_embedding_table
(
spk_id
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# integrate tone embedding
# integrate tone embedding
if
self
.
tone_embed_dim
is
not
None
:
if
self
.
tone_embed_dim
is
not
None
:
...
@@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer):
...
@@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer):
energy
:
paddle
.
Tensor
=
None
,
energy
:
paddle
.
Tensor
=
None
,
alpha
:
float
=
1.0
,
alpha
:
float
=
1.0
,
use_teacher_forcing
:
bool
=
False
,
use_teacher_forcing
:
bool
=
False
,
sp
embs
=
None
,
sp
k_emb
=
None
,
spk_id
=
None
,
spk_id
=
None
,
tone_id
=
None
,
tone_id
=
None
,
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
...
@@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer):
...
@@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
If true, groundtruth of duration, pitch and energy will be used.
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
peaker embedding vector (spk_embed_dim,).
peaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,).
Batch of padded spk ids (1,).
...
@@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer):
...
@@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer):
# input of embedding must be int64
# input of embedding must be int64
x
=
paddle
.
cast
(
text
,
'int64'
)
x
=
paddle
.
cast
(
text
,
'int64'
)
y
=
speech
y
=
speech
spemb
=
spembs
d
,
p
,
e
=
durations
,
pitch
,
energy
d
,
p
,
e
=
durations
,
pitch
,
energy
# setup batch axis
# setup batch axis
ilens
=
paddle
.
shape
(
x
)[
0
]
ilens
=
paddle
.
shape
(
x
)[
0
]
...
@@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer):
...
@@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer):
if
y
is
not
None
:
if
y
is
not
None
:
ys
=
y
.
unsqueeze
(
0
)
ys
=
y
.
unsqueeze
(
0
)
if
spemb
is
not
None
:
if
sp
k_
emb
is
not
None
:
sp
embs
=
sp
emb
.
unsqueeze
(
0
)
sp
k_emb
=
spk_
emb
.
unsqueeze
(
0
)
if
tone_id
is
not
None
:
if
tone_id
is
not
None
:
tone_id
=
tone_id
.
unsqueeze
(
0
)
tone_id
=
tone_id
.
unsqueeze
(
0
)
...
@@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer):
...
@@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer):
ds
=
d
.
unsqueeze
(
0
)
if
d
is
not
None
else
None
ds
=
d
.
unsqueeze
(
0
)
if
d
is
not
None
else
None
ps
=
p
.
unsqueeze
(
0
)
if
p
is
not
None
else
None
ps
=
p
.
unsqueeze
(
0
)
if
p
is
not
None
else
None
es
=
e
.
unsqueeze
(
0
)
if
e
is
not
None
else
None
es
=
e
.
unsqueeze
(
0
)
if
e
is
not
None
else
None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
# (1, L, odim)
_
,
outs
,
d_outs
,
p_outs
,
e_outs
=
self
.
_forward
(
_
,
outs
,
d_outs
,
p_outs
,
e_outs
=
self
.
_forward
(
xs
,
xs
,
...
@@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer):
...
@@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer):
ds
=
ds
,
ds
=
ds
,
ps
=
ps
,
ps
=
ps
,
es
=
es
,
es
=
es
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
spk_id
=
spk_id
,
tone_id
=
tone_id
,
tone_id
=
tone_id
,
is_inference
=
True
)
is_inference
=
True
)
...
@@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer):
...
@@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer):
ys
,
ys
,
is_inference
=
True
,
is_inference
=
True
,
alpha
=
alpha
,
alpha
=
alpha
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
spk_id
=
spk_id
,
tone_id
=
tone_id
)
tone_id
=
tone_id
)
return
outs
[
0
],
d_outs
[
0
],
p_outs
[
0
],
e_outs
[
0
]
return
outs
[
0
],
d_outs
[
0
],
p_outs
[
0
],
e_outs
[
0
]
def
_integrate_with_spk_embed
(
self
,
hs
,
sp
embs
):
def
_integrate_with_spk_embed
(
self
,
hs
,
sp
k_emb
):
"""Integrate speaker embedding with hidden states.
"""Integrate speaker embedding with hidden states.
Parameters
Parameters
----------
----------
hs : Tensor
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
Batch of hidden state sequences (B, Tmax, adim).
sp
embs
: Tensor
sp
k_emb
: Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Batch of speaker embeddings (B, spk_embed_dim).
Returns
Returns
...
@@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer):
...
@@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer):
"""
"""
if
self
.
spk_embed_integration_type
==
"add"
:
if
self
.
spk_embed_integration_type
==
"add"
:
# apply projection and then add to hidden states
# apply projection and then add to hidden states
sp
embs
=
self
.
spk_projection
(
F
.
normalize
(
spembs
))
sp
k_emb
=
self
.
spk_projection
(
F
.
normalize
(
spk_emb
))
hs
=
hs
+
sp
embs
.
unsqueeze
(
1
)
hs
=
hs
+
sp
k_emb
.
unsqueeze
(
1
)
elif
self
.
spk_embed_integration_type
==
"concat"
:
elif
self
.
spk_embed_integration_type
==
"concat"
:
# concat hidden states with spk embeds and then apply projection
# concat hidden states with spk embeds and then apply projection
sp
embs
=
F
.
normalize
(
spembs
).
unsqueeze
(
1
).
expand
(
sp
k_emb
=
F
.
normalize
(
spk_emb
).
unsqueeze
(
1
).
expand
(
shape
=
[
-
1
,
hs
.
shape
[
1
],
-
1
])
shape
=
[
-
1
,
hs
.
shape
[
1
],
-
1
])
hs
=
self
.
spk_projection
(
paddle
.
concat
([
hs
,
sp
embs
],
axis
=-
1
))
hs
=
self
.
spk_projection
(
paddle
.
concat
([
hs
,
sp
k_emb
],
axis
=-
1
))
else
:
else
:
raise
NotImplementedError
(
"support only add or concat."
)
raise
NotImplementedError
(
"support only add or concat."
)
...
@@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer):
...
@@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer):
self
.
normalizer
=
normalizer
self
.
normalizer
=
normalizer
self
.
acoustic_model
=
model
self
.
acoustic_model
=
model
def
forward
(
self
,
text
,
spk_id
=
None
):
def
forward
(
self
,
text
,
spk_id
=
None
,
spk_emb
=
None
):
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
spk_id
=
spk_id
)
text
,
spk_id
=
spk_id
,
spk_emb
=
spk_emb
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
return
logmel
return
logmel
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
浏览文件 @
bc0dd511
...
@@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
...
@@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict
=
{}
losses_dict
=
{}
# spk_id!=None in multiple spk fastspeech2
# spk_id!=None in multiple spk fastspeech2
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
spk_emb
=
batch
[
"spk_emb"
]
if
"spk_emb"
in
batch
else
None
# No explicit speaker identifier labels are used during voice cloning training.
if
spk_emb
is
not
None
:
spk_id
=
None
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
text
=
batch
[
"text"
],
text
=
batch
[
"text"
],
...
@@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
...
@@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations
=
batch
[
"durations"
],
durations
=
batch
[
"durations"
],
pitch
=
batch
[
"pitch"
],
pitch
=
batch
[
"pitch"
],
energy
=
batch
[
"energy"
],
energy
=
batch
[
"energy"
],
spk_id
=
spk_id
)
spk_id
=
spk_id
,
spk_emb
=
spk_emb
)
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
after_outs
=
after_outs
,
after_outs
=
after_outs
,
...
@@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
...
@@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict
=
{}
losses_dict
=
{}
# spk_id!=None in multiple spk fastspeech2
# spk_id!=None in multiple spk fastspeech2
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
spk_emb
=
batch
[
"spk_emb"
]
if
"spk_emb"
in
batch
else
None
if
spk_emb
is
not
None
:
spk_id
=
None
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
text
=
batch
[
"text"
],
text
=
batch
[
"text"
],
...
@@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
...
@@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations
=
batch
[
"durations"
],
durations
=
batch
[
"durations"
],
pitch
=
batch
[
"pitch"
],
pitch
=
batch
[
"pitch"
],
energy
=
batch
[
"energy"
],
energy
=
batch
[
"energy"
],
spk_id
=
spk_id
)
spk_id
=
spk_id
,
spk_emb
=
spk_emb
)
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
after_outs
=
after_outs
,
after_outs
=
after_outs
,
...
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
浏览文件 @
bc0dd511
...
@@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
...
@@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self
.
padding_idx
=
0
self
.
padding_idx
=
0
# set_global_initializer 会影响后面的全局,包括 create_parameter
# set_global_initializer 会影响后面的全局,包括 create_parameter
initialize
(
self
,
init_type
)
initialize
(
self
,
init_type
)
# get positional encoding class
pos_enc_class
=
(
ScaledPositionalEncoding
# get positional encoding layer type
if
self
.
use_scaled_pos_enc
else
PositionalEncoding
)
transformer_pos_enc_layer_type
=
"scaled_abs_pos"
if
self
.
use_scaled_pos_enc
else
"abs_pos"
# define transformer encoder
# define transformer encoder
if
eprenet_conv_layers
!=
0
:
if
eprenet_conv_layers
!=
0
:
...
@@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
...
@@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate
=
transformer_enc_dropout_rate
,
dropout_rate
=
transformer_enc_dropout_rate
,
positional_dropout_rate
=
transformer_enc_positional_dropout_rate
,
positional_dropout_rate
=
transformer_enc_positional_dropout_rate
,
attention_dropout_rate
=
transformer_enc_attn_dropout_rate
,
attention_dropout_rate
=
transformer_enc_attn_dropout_rate
,
pos_enc_
class
=
pos_enc_class
,
pos_enc_
layer_type
=
transformer_pos_enc_layer_type
,
normalize_before
=
encoder_normalize_before
,
normalize_before
=
encoder_normalize_before
,
concat_after
=
encoder_concat_after
,
concat_after
=
encoder_concat_after
,
positionwise_layer_type
=
positionwise_layer_type
,
positionwise_layer_type
=
positionwise_layer_type
,
...
@@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
...
@@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn
.
Linear
(
dprenet_units
,
adim
),
)
nn
.
Linear
(
dprenet_units
,
adim
),
)
else
:
else
:
decoder_input_layer
=
"linear"
decoder_input_layer
=
"linear"
# get positional encoding class
pos_enc_class
=
(
ScaledPositionalEncoding
if
self
.
use_scaled_pos_enc
else
PositionalEncoding
)
self
.
decoder
=
Decoder
(
self
.
decoder
=
Decoder
(
odim
=
odim
,
# odim is needed when no prenet is used
odim
=
odim
,
# odim is needed when no prenet is used
attention_dim
=
adim
,
attention_dim
=
adim
,
...
@@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
...
@@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
text_lengths
:
paddle
.
Tensor
,
text_lengths
:
paddle
.
Tensor
,
speech
:
paddle
.
Tensor
,
speech
:
paddle
.
Tensor
,
speech_lengths
:
paddle
.
Tensor
,
speech_lengths
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
"""Calculate forward propagation.
"""Calculate forward propagation.
...
@@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
...
@@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
Batch of the lengths of each target (B,).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Batch of speaker embeddings (B, spk_embed_dim).
Returns
Returns
...
@@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
...
@@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
# calculate transformer outputs
after_outs
,
before_outs
,
logits
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
after_outs
,
before_outs
,
logits
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
embs
)
sp
k_emb
)
# modifiy mod part of groundtruth
# modifiy mod part of groundtruth
...
@@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
...
@@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
ilens
:
paddle
.
Tensor
,
ilens
:
paddle
.
Tensor
,
ys
:
paddle
.
Tensor
,
ys
:
paddle
.
Tensor
,
olens
:
paddle
.
Tensor
,
olens
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
,
sp
k_emb
:
paddle
.
Tensor
,
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
# forward encoder
# forward encoder
x_masks
=
self
.
_source_mask
(
ilens
)
x_masks
=
self
.
_source_mask
(
ilens
)
...
@@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
...
@@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
# integrate speaker embedding
if
self
.
spk_embed_dim
is
not
None
:
if
self
.
spk_embed_dim
is
not
None
:
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if
self
.
reduction_factor
>
1
:
if
self
.
reduction_factor
>
1
:
...
@@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
...
@@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
self
,
self
,
text
:
paddle
.
Tensor
,
text
:
paddle
.
Tensor
,
speech
:
paddle
.
Tensor
=
None
,
speech
:
paddle
.
Tensor
=
None
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
threshold
:
float
=
0.5
,
threshold
:
float
=
0.5
,
minlenratio
:
float
=
0.0
,
minlenratio
:
float
=
0.0
,
maxlenratio
:
float
=
10.0
,
maxlenratio
:
float
=
10.0
,
...
@@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
...
@@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
Input sequence of characters (T,).
speech : Tensor, optional
speech : Tensor, optional
Feature sequence to extract style (N, idim).
Feature sequence to extract style (N, idim).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Speaker embedding vector (spk_embed_dim,).
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
threshold : float, optional
Threshold in inference.
Threshold in inference.
...
@@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
...
@@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
"""
"""
# input of embedding must be int64
# input of embedding must be int64
y
=
speech
y
=
speech
spemb
=
spembs
# add eos at the last of sequence
# add eos at the last of sequence
text
=
numpy
.
pad
(
text
=
numpy
.
pad
(
...
@@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
...
@@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
# get teacher forcing outputs
xs
,
ys
=
x
.
unsqueeze
(
0
),
y
.
unsqueeze
(
0
)
xs
,
ys
=
x
.
unsqueeze
(
0
),
y
.
unsqueeze
(
0
)
sp
embs
=
None
if
spemb
is
None
else
sp
emb
.
unsqueeze
(
0
)
sp
k_emb
=
None
if
spk_emb
is
None
else
spk_
emb
.
unsqueeze
(
0
)
ilens
=
paddle
.
to_tensor
(
ilens
=
paddle
.
to_tensor
(
[
xs
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
xs
.
place
)
[
xs
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
xs
.
place
)
olens
=
paddle
.
to_tensor
(
olens
=
paddle
.
to_tensor
(
[
ys
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
ys
.
place
)
[
ys
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
ys
.
place
)
outs
,
*
_
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
embs
)
outs
,
*
_
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
k_emb
)
# get attention weights
# get attention weights
att_ws
=
[]
att_ws
=
[]
...
@@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
...
@@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
hs
=
hs
+
style_embs
.
unsqueeze
(
1
)
hs
=
hs
+
style_embs
.
unsqueeze
(
1
)
# integrate speaker embedding
# integrate speaker embedding
if
s
elf
.
spk_embed_dim
is
not
None
:
if
s
pk_emb
is
not
None
:
sp
embs
=
sp
emb
.
unsqueeze
(
0
)
sp
k_emb
=
spk_
emb
.
unsqueeze
(
0
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# set limits of length
# set limits of length
maxlen
=
int
(
hs
.
shape
[
1
]
*
maxlenratio
/
self
.
reduction_factor
)
maxlen
=
int
(
hs
.
shape
[
1
]
*
maxlenratio
/
self
.
reduction_factor
)
...
@@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
...
@@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
def
_integrate_with_spk_embed
(
self
,
def
_integrate_with_spk_embed
(
self
,
hs
:
paddle
.
Tensor
,
hs
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
sp
k_emb
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
"""Integrate speaker embedding with hidden states.
"""Integrate speaker embedding with hidden states.
Parameters
Parameters
----------
----------
hs : Tensor
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
Batch of hidden state sequences (B, Tmax, adim).
sp
embs
: Tensor
sp
k_emb
: Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Batch of speaker embeddings (B, spk_embed_dim).
Returns
Returns
...
@@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
...
@@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
"""
"""
if
self
.
spk_embed_integration_type
==
"add"
:
if
self
.
spk_embed_integration_type
==
"add"
:
# apply projection and then add to hidden states
# apply projection and then add to hidden states
sp
embs
=
self
.
projection
(
F
.
normalize
(
spembs
))
sp
k_emb
=
self
.
projection
(
F
.
normalize
(
spk_emb
))
hs
=
hs
+
sp
embs
.
unsqueeze
(
1
)
hs
=
hs
+
sp
k_emb
.
unsqueeze
(
1
)
elif
self
.
spk_embed_integration_type
==
"concat"
:
elif
self
.
spk_embed_integration_type
==
"concat"
:
# concat hidden states with spk embeds and then apply projection
# concat hidden states with spk embeds and then apply projection
sp
embs
=
F
.
normalize
(
spembs
).
unsqueeze
(
1
).
expand
(
-
1
,
hs
.
shape
[
1
],
sp
k_emb
=
F
.
normalize
(
spk_emb
).
unsqueeze
(
1
).
expand
(
-
1
,
hs
.
shape
[
1
],
-
1
)
-
1
)
hs
=
self
.
projection
(
paddle
.
concat
([
hs
,
sp
embs
],
axis
=-
1
))
hs
=
self
.
projection
(
paddle
.
concat
([
hs
,
sp
k_emb
],
axis
=-
1
))
else
:
else
:
raise
NotImplementedError
(
"support only add or concat."
)
raise
NotImplementedError
(
"support only add or concat."
)
...
...
paddlespeech/t2s/modules/conformer/encoder.py
已删除
100644 → 0
浏览文件 @
4370c5cf
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import
logging
import
paddle
from
paddlespeech.t2s.modules.conformer.convolution
import
ConvolutionModule
from
paddlespeech.t2s.modules.conformer.encoder_layer
import
EncoderLayer
from
paddlespeech.t2s.modules.layer_norm
import
LayerNorm
from
paddlespeech.t2s.modules.nets_utils
import
get_activation
from
paddlespeech.t2s.modules.transformer.attention
import
LegacyRelPositionMultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.attention
import
MultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.attention
import
RelPositionMultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.embedding
import
LegacyRelPositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
RelPositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
ScaledPositionalEncoding
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
Conv1dLinear
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
MultiLayeredConv1d
from
paddlespeech.t2s.modules.transformer.positionwise_feed_forward
import
PositionwiseFeedForward
from
paddlespeech.t2s.modules.transformer.repeat
import
repeat
from
paddlespeech.t2s.modules.transformer.subsampling
import
Conv2dSubsampling
class Encoder(paddle.nn.Layer):
    """Conformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type ("linear", "conv2d", "embed"), a ready-made layer,
        or None to apply only the positional encoding.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type
        ("abs_pos", "scaled_abs_pos", "rel_pos" or "legacy_rel_pos").
    selfattention_layer_type : str
        Encoder attention layer type
        ("selfattn", "rel_selfattn" or "legacy_rel_selfattn").
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        indices of intermediate CTC layer.
        indices start from 1.
        if not None, intermediate outputs are returned (which changes return type
        signature.)
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            macaron_style=False,
            pos_enc_layer_type="abs_pos",
            selfattention_layer_type="selfattn",
            activation_type="swish",
            use_cnn_module=False,
            zero_triu=False,
            cnn_module_kernel=31,
            padding_idx=-1,
            stochastic_depth_rate=0.0,
            intermediate_layers=None, ):
        """Construct an Encoder object.

        Raises
        ----------
        ValueError
            If ``pos_enc_layer_type``, ``input_layer`` or
            ``selfattention_layer_type`` is not a supported value.
        NotImplementedError
            If ``positionwise_layer_type`` is not a supported value.
        """
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)

        # positional encoding class; relative variants must be paired with
        # the matching relative self-attention type
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            # str() keeps the intended ValueError even for non-string values
            raise ValueError("unknown pos_enc_layer: " +
                             str(pos_enc_layer_type))

        # input (embedding) layer definition
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Linear(idim, attention_dim),
                paddle.nn.LayerNorm(attention_dim),
                paddle.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Embedding(
                    idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, paddle.nn.Layer):
            self.embed = paddle.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = paddle.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            # input_layer may be any object here; a bare `str + obj` would
            # raise TypeError and hide the real problem
            raise ValueError("unknown input_layer: " + str(input_layer))
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info(
                "encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                attention_dim,
                attention_dropout_rate,
                zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             str(selfattention_layer_type))

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                dropout_rate,
                activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        # Each block gets freshly constructed sub-layers; the skip
        # probability passed as the last argument grows linearly with the
        # block index up to `stochastic_depth_rate` for the last block.
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args)
                if macaron_style else None,
                convolution_layer(*convolution_layer_args)
                if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

        self.intermediate_layers = intermediate_layers

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor as returned by the input layer / encoder blocks.
        List[paddle.Tensor]
            Outputs of the blocks listed in ``intermediate_layers``; only
            returned when ``intermediate_layers`` is not None.
        """
        # only the conv2d subsampling front-end consumes and returns the mask
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                # intermediate_layers uses 1-based block indices
                if layer_idx + 1 in self.intermediate_layers:
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    # NOTE(review): the flattened source lost whether this
                    # normalization was nested under the tuple check above;
                    # it is applied unconditionally here to mirror the
                    # final-output path below -- confirm.
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        # xs may be a tuple; only its first element is the encoded sequence
        # (presumably (x, pos_emb) for relative positional encodings)
        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks
paddlespeech/t2s/modules/transformer/attention.py
浏览文件 @
bc0dd511
...
@@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
...
@@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def
__init__
(
self
,
n_head
,
n_feat
,
dropout_rate
):
def
__init__
(
self
,
n_head
,
n_feat
,
dropout_rate
):
"""Construct an MultiHeadedAttention object."""
"""Construct an MultiHeadedAttention object."""
super
(
MultiHeadedAttention
,
self
).
__init__
()
super
().
__init__
()
assert
n_feat
%
n_head
==
0
assert
n_feat
%
n_head
==
0
# We assume d_v always equals d_k
# We assume d_v always equals d_k
self
.
d_k
=
n_feat
//
n_head
self
.
d_k
=
n_feat
//
n_head
...
@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
...
@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
Transformed value tensor (#batch, n_head, time2, d_k).
"""
"""
n_batch
=
query
.
shape
[
0
]
n_batch
=
paddle
.
shape
(
query
)
[
0
]
q
=
paddle
.
reshape
(
q
=
paddle
.
reshape
(
self
.
linear_q
(
query
),
[
n_batch
,
-
1
,
self
.
h
,
self
.
d_k
])
self
.
linear_q
(
query
),
[
n_batch
,
-
1
,
self
.
h
,
self
.
d_k
])
...
@@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
...
@@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model)
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
weighted by the attention score (#batch, time1, time2).
"""
"""
n_batch
=
value
.
shape
[
0
]
n_batch
=
paddle
.
shape
(
value
)
[
0
]
softmax
=
paddle
.
nn
.
Softmax
(
axis
=-
1
)
softmax
=
paddle
.
nn
.
Softmax
(
axis
=-
1
)
if
mask
is
not
None
:
if
mask
is
not
None
:
mask
=
mask
.
unsqueeze
(
1
)
mask
=
mask
.
unsqueeze
(
1
)
...
@@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
...
@@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model)
# (batch, time1, d_model)
x
=
(
paddle
.
reshape
(
x
=
(
paddle
.
reshape
(
x
.
transpose
((
0
,
2
,
1
,
3
)),
(
n_batch
,
-
1
,
self
.
h
*
self
.
d_k
)))
x
.
transpose
((
0
,
2
,
1
,
3
)),
(
n_batch
,
-
1
,
self
.
h
*
self
.
d_k
)))
# (batch, time1, d_model)
return
self
.
linear_out
(
x
)
# (batch, time1, d_model)
return
self
.
linear_out
(
x
)
def
forward
(
self
,
query
,
key
,
value
,
mask
=
None
):
def
forward
(
self
,
query
,
key
,
value
,
mask
=
None
):
"""Compute scaled dot product attention.
"""Compute scaled dot product attention.
...
@@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
...
@@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(
0
,
1
,
3
,
2
)))
/
math
.
sqrt
(
self
.
d_k
)
(
0
,
1
,
3
,
2
)))
/
math
.
sqrt
(
self
.
d_k
)
return
self
.
forward_attention
(
v
,
scores
,
mask
)
return
self
.
forward_attention
(
v
,
scores
,
mask
)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding (new implementation).
    Details can be found in https://github.com/espnet/espnet/pull/2816.
    Paper: https://arxiv.org/abs/1901.02860

    Parameters
    ----------
    n_head : int
        The number of heads.
    n_feat : int
        The number of features.
    dropout_rate : float
        Dropout rate.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    """

    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate)
        self.zero_triu = zero_triu
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = paddle.create_parameter(
            shape=(self.h, self.d_k),
            dtype='float32',
            default_initializer=paddle.nn.initializer.XavierUniform())
        self.pos_bias_v = paddle.create_parameter(
            shape=(self.h, self.d_k),
            dtype='float32',
            default_initializer=paddle.nn.initializer.XavierUniform())

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (batch, head, time1, 2*time1-1).
            time1 means the length of query vector.

        Returns
        ----------
        paddle.Tensor
            Output tensor (batch, head, time1, t2 // 2 + 1), where t2 is the
            size of the last axis of the input.
        """
        b, h, t1, t2 = paddle.shape(x)
        # pad one zero column, then reinterpret the buffer so that every row
        # ends up shifted by one position relative to the previous row
        zero_pad = paddle.zeros((b, h, t1, 1))
        x_padded = paddle.concat([zero_pad, x], axis=-1)
        x_padded = x_padded.reshape([b, h, t2 + 1, t1])
        # only keep the positions from 0 to time2
        kept = t2 // 2 + 1
        x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :kept]

        if self.zero_triu:
            # The mask has to match the *sliced* scores (t1, kept), not the
            # pre-slice (t1, t2) shape, otherwise the product cannot
            # broadcast. Equivalent to tril(ones(t1, kept), kept - t1):
            # keep entry (i, j) iff j - i <= kept - t1. Built with arange
            # because the sizes here are shape tensors, not Python ints.
            rows = paddle.arange(0, t1, dtype="int64").unsqueeze(1)
            cols = paddle.arange(0, kept, dtype="int64").unsqueeze(0)
            offset = paddle.cast(kept - t1, "int64")
            keep = paddle.cast(cols - rows <= offset, x.dtype)
            # (t1, kept) broadcasts over the batch and head axes
            x = x * keep

        return x

    def forward(self, query, key, value, pos_emb, mask):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Parameters
        ----------
        query : paddle.Tensor
            Query tensor (#batch, time1, size).
        key : paddle.Tensor
            Key tensor (#batch, time2, size).
        value : paddle.Tensor
            Value tensor (#batch, time2, size).
        pos_emb : paddle.Tensor
            Positional embedding tensor
            (#batch, 2*time1-1, size).
        mask : paddle.Tensor
            Mask tensor (#batch, 1, time2) or
            (#batch, time1, time2).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        # (batch, time1, head, d_k), so the (head, d_k) biases broadcast
        q = q.transpose([0, 2, 1, 3])

        n_batch_pos = paddle.shape(pos_emb)[0]
        p = self.linear_pos(pos_emb).reshape(
            [n_batch_pos, -1, self.h, self.d_k])
        # (batch, head, 2*time1-1, d_k)
        p = p.transpose([0, 2, 1, 3])

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))

        # compute matrix b and matrix d
        # (batch, head, time1, 2*time1-1)
        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
        matrix_bd = self.rel_shift(matrix_bd)
        # (batch, head, time1, time2)
        scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)

        return self.forward_attention(v, scores, mask)
paddlespeech/t2s/modules/transformer/embedding.py
浏览文件 @
bc0dd511
...
@@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
...
@@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
Parameters
----------
----------
d_model : int
d_model : int
Embedding dimension.
Embedding dimension.
dropout_rate : float
dropout_rate : float
Dropout rate.
Dropout rate.
max_len : int
max_len : int
Maximum input length.
Maximum input length.
dtype : str
dtype : str
dtype of param
dtype of param
"""
"""
def
__init__
(
self
,
d_model
,
dropout_rate
,
max_len
=
5000
,
dtype
=
"float32"
):
def
__init__
(
self
,
d_model
,
dropout_rate
,
max_len
=
5000
,
dtype
=
"float32"
):
...
@@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding):
...
@@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
Parameters
----------
----------
x : paddle.Tensor
x : paddle.Tensor
Input tensor (batch, time, `*`).
Input tensor (batch, time, `*`).
Returns
Returns
----------
----------
paddle.Tensor
paddle.Tensor
Encoded tensor (batch, time, `*`).
Encoded tensor (batch, time, `*`).
"""
"""
self
.
extend_pe
(
x
)
self
.
extend_pe
(
x
)
T
=
paddle
.
shape
(
x
)[
1
]
T
=
paddle
.
shape
(
x
)[
1
]
x
=
x
+
self
.
alpha
*
self
.
pe
[:,
:
T
]
x
=
x
+
self
.
alpha
*
self
.
pe
[:,
:
T
]
return
self
.
dropout
(
x
)
return
self
.
dropout
(
x
)
class RelPositionalEncoding(paddle.nn.Layer):
    """Relative positional encoding module (new implementation).
    Details can be found in https://github.com/espnet/espnet/pull/2816.
    See : Appendix B in https://arxiv.org/abs/1901.02860

    Parameters
    ----------
    d_model : int
        Embedding dimension.
    dropout_rate : float
        Dropout rate.
    max_len : int
        Maximum input length.
    dtype : str
        dtype used for the position and frequency ranges.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
        """Construct a RelPositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = paddle.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.dtype = dtype
        # pre-compute the table for `max_len` positions using a dummy
        # (1, max_len) input; only its second axis is read
        dummy = paddle.expand(paddle.zeros([1]), (1, max_len))
        self.extend_pe(dummy)

    def extend_pe(self, x):
        """Reset the positional encodings.

        Rebuilds ``self.pe`` unless the cached table already covers the
        length of ``x`` along axis 1.
        """
        # self.pe contains both positive and negative parts
        # the length of self.pe is 2 * input_len - 1
        if self.pe is not None and (
                paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1):
            return

        # Suppose `i` means the position of the query vector and `j` means
        # the position of the key vector. We use positive relative positions
        # when keys are to the left (i>j) and negative relative positions
        # otherwise (i<j).
        x_shape = paddle.shape(x)
        length = x_shape[1]
        forward_table = paddle.zeros([length, self.d_model])
        backward_table = paddle.zeros([length, self.d_model])

        position = paddle.arange(0, length, dtype=self.dtype).unsqueeze(1)
        div_term = paddle.exp(
            paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
            -(math.log(10000.0) / self.d_model))

        # even feature columns hold the sine, odd columns the cosine
        forward_table[:, 0::2] = paddle.sin(position * div_term)
        forward_table[:, 1::2] = paddle.cos(position * div_term)
        backward_table[:, 0::2] = paddle.sin(-1 * position * div_term)
        backward_table[:, 1::2] = paddle.cos(-1 * position * div_term)

        # Reverse the order of positive indices and concat both positive and
        # negative indices. This is used to support the shifting trick
        # as in https://arxiv.org/abs/1901.02860
        positive_part = paddle.flip(forward_table, [0]).unsqueeze(0)
        # row 0 (relative position 0) is already present in the positive part
        negative_part = backward_table[1:].unsqueeze(0)
        self.pe = paddle.concat([positive_part, negative_part], axis=1)

    def forward(self, x: paddle.Tensor):
        """Add positional encoding.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (batch, time, `*`).

        Returns
        ----------
        paddle.Tensor
            Scaled input tensor after dropout (batch, time, `*`).
        paddle.Tensor
            Slice of the positional table centred on its middle entry,
            after dropout (2*time-1 positions along axis 1).
        """
        self.extend_pe(x)
        x = x * self.xscale
        T = paddle.shape(x)[1]
        # the middle of the table is relative position 0; take T-1 entries
        # on the left of it and T-1 on the right
        center = paddle.shape(self.pe)[1] // 2
        pos_emb = self.pe[:, center - T + 1:center + T, ]
        return self.dropout(x), self.dropout(pos_emb)
paddlespeech/t2s/modules/transformer/encoder.py
浏览文件 @
bc0dd511
...
@@ -12,15 +12,26 @@
...
@@ -12,15 +12,26 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
# Modified from espnet(https://github.com/espnet/espnet)
from
typing
import
List
from
typing
import
Union
from
paddle
import
nn
from
paddle
import
nn
from
paddlespeech.t2s.modules.conformer.convolution
import
ConvolutionModule
from
paddlespeech.t2s.modules.conformer.encoder_layer
import
EncoderLayer
as
ConformerEncoderLayer
from
paddlespeech.t2s.modules.layer_norm
import
LayerNorm
from
paddlespeech.t2s.modules.nets_utils
import
get_activation
from
paddlespeech.t2s.modules.transformer.attention
import
MultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.attention
import
MultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.attention
import
RelPositionMultiHeadedAttention
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
RelPositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
ScaledPositionalEncoding
from
paddlespeech.t2s.modules.transformer.encoder_layer
import
EncoderLayer
from
paddlespeech.t2s.modules.transformer.encoder_layer
import
EncoderLayer
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
Conv1dLinear
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
Conv1dLinear
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
MultiLayeredConv1d
from
paddlespeech.t2s.modules.transformer.multi_layer_conv
import
MultiLayeredConv1d
from
paddlespeech.t2s.modules.transformer.positionwise_feed_forward
import
PositionwiseFeedForward
from
paddlespeech.t2s.modules.transformer.positionwise_feed_forward
import
PositionwiseFeedForward
from
paddlespeech.t2s.modules.transformer.repeat
import
repeat
from
paddlespeech.t2s.modules.transformer.repeat
import
repeat
from
paddlespeech.t2s.modules.transformer.subsampling
import
Conv2dSubsampling
class
Encoder
(
nn
.
Layer
):
class
Encoder
(
nn
.
Layer
):
...
@@ -46,9 +57,6 @@ class Encoder(nn.Layer):
...
@@ -46,9 +57,6 @@ class Encoder(nn.Layer):
Dropout rate in attention.
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding `or `ScaledPositionalEncoding`
normalize_before : bool
normalize_before : bool
Whether to use layer_norm before the first block.
Whether to use layer_norm before the first block.
concat_after : bool
concat_after : bool
...
@@ -60,98 +68,137 @@ class Encoder(nn.Layer):
...
@@ -60,98 +68,137 @@ class Encoder(nn.Layer):
"linear", "conv1d", or "conv1d-linear".
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
selfattention_layer_type : str
Encoder attention layer type.
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernerl size of convolution module.
padding_idx : int
padding_idx : int
Padding idx for input_layer=embed.
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes return type
signature.)
encoder_type: str
"transformer", or "conformer".
"""
"""
def
__init__
(
def
__init__
(
self
,
self
,
idim
:
int
,
idim
,
attention_dim
:
int
=
256
,
attention_dim
=
256
,
attention_heads
:
int
=
4
,
attention_heads
=
4
,
linear_units
:
int
=
2048
,
linear_units
=
2048
,
num_blocks
:
int
=
6
,
num_blocks
=
6
,
dropout_rate
:
float
=
0.1
,
dropout_rate
=
0.1
,
positional_dropout_rate
:
float
=
0.1
,
positional_dropout_rate
=
0.1
,
attention_dropout_rate
:
float
=
0.0
,
attention_dropout_rate
=
0.0
,
input_layer
:
str
=
"conv2d"
,
input_layer
=
"conv2d"
,
normalize_before
:
bool
=
True
,
pos_enc_class
=
PositionalEncoding
,
concat_after
:
bool
=
False
,
normalize_before
=
True
,
positionwise_layer_type
:
str
=
"linear"
,
concat_after
=
False
,
positionwise_conv_kernel_size
:
int
=
1
,
positionwise_layer_type
=
"linear"
,
macaron_style
:
bool
=
False
,
positionwise_conv_kernel_size
=
1
,
pos_enc_layer_type
:
str
=
"abs_pos"
,
selfattention_layer_type
=
"selfattn"
,
selfattention_layer_type
:
str
=
"selfattn"
,
padding_idx
=-
1
,
):
activation_type
:
str
=
"swish"
,
use_cnn_module
:
bool
=
False
,
zero_triu
:
bool
=
False
,
cnn_module_kernel
:
int
=
31
,
padding_idx
:
int
=-
1
,
stochastic_depth_rate
:
float
=
0.0
,
intermediate_layers
:
Union
[
List
[
int
],
None
]
=
None
,
encoder_type
:
str
=
"transformer"
):
"""Construct an Encoder object."""
"""Construct an Encoder object."""
super
(
Encoder
,
self
).
__init__
()
super
().
__init__
()
activation
=
get_activation
(
activation_type
)
pos_enc_class
=
self
.
get_pos_enc_class
(
pos_enc_layer_type
,
selfattention_layer_type
)
self
.
encoder_type
=
encoder_type
self
.
conv_subsampling_factor
=
1
self
.
conv_subsampling_factor
=
1
if
input_layer
==
"linear"
:
self
.
embed
=
self
.
get_embed
(
self
.
embed
=
nn
.
Sequential
(
idim
=
idim
,
nn
.
Linear
(
idim
,
attention_dim
,
bias_attr
=
True
),
input_layer
=
input_layer
,
nn
.
LayerNorm
(
attention_dim
),
attention_dim
=
attention_dim
,
nn
.
Dropout
(
dropout_rate
),
pos_enc_class
=
pos_enc_class
,
nn
.
ReLU
(),
dropout_rate
=
dropout_rate
,
pos_enc_class
(
attention_dim
,
positional_dropout_rate
),
)
positional_dropout_rate
=
positional_dropout_rate
,
elif
input_layer
==
"embed"
:
padding_idx
=
padding_idx
)
self
.
embed
=
nn
.
Sequential
(
nn
.
Embedding
(
idim
,
attention_dim
,
padding_idx
=
padding_idx
),
pos_enc_class
(
attention_dim
,
positional_dropout_rate
),
)
elif
isinstance
(
input_layer
,
nn
.
Layer
):
self
.
embed
=
nn
.
Sequential
(
input_layer
,
pos_enc_class
(
attention_dim
,
positional_dropout_rate
),
)
elif
input_layer
is
None
:
self
.
embed
=
nn
.
Sequential
(
pos_enc_class
(
attention_dim
,
positional_dropout_rate
))
else
:
raise
ValueError
(
"unknown input_layer: "
+
input_layer
)
self
.
normalize_before
=
normalize_before
self
.
normalize_before
=
normalize_before
# self-attention module definition
encoder_selfattn_layer
,
encoder_selfattn_layer_args
=
self
.
get_encoder_selfattn_layer
(
selfattention_layer_type
=
selfattention_layer_type
,
attention_heads
=
attention_heads
,
attention_dim
=
attention_dim
,
attention_dropout_rate
=
attention_dropout_rate
,
zero_triu
=
zero_triu
,
pos_enc_layer_type
=
pos_enc_layer_type
)
# feed-forward module definition
positionwise_layer
,
positionwise_layer_args
=
self
.
get_positionwise_layer
(
positionwise_layer
,
positionwise_layer_args
=
self
.
get_positionwise_layer
(
positionwise_layer_type
,
positionwise_layer_type
,
attention_dim
,
linear_units
,
dropout_rate
,
attention_dim
,
positionwise_conv_kernel_size
,
activation
)
linear_units
,
dropout_rate
,
positionwise_conv_kernel_size
,
)
if
selfattention_layer_type
in
[
"selfattn"
,
"rel_selfattn"
,
"legacy_rel_selfattn"
,
]:
encoder_selfattn_layer
=
MultiHeadedAttention
encoder_selfattn_layer_args
=
[
(
attention_heads
,
attention_dim
,
attention_dropout_rate
,
)
]
*
num_blocks
# convolution module definition
convolution_layer
=
ConvolutionModule
convolution_layer_args
=
(
attention_dim
,
cnn_module_kernel
,
activation
)
if
self
.
encoder_type
==
"transformer"
:
self
.
encoders
=
repeat
(
num_blocks
,
lambda
lnum
:
EncoderLayer
(
attention_dim
,
encoder_selfattn_layer
(
*
encoder_selfattn_layer_args
),
positionwise_layer
(
*
positionwise_layer_args
),
dropout_rate
,
normalize_before
,
concat_after
,
),
)
elif
self
.
encoder_type
==
"conformer"
:
self
.
encoders
=
repeat
(
num_blocks
,
lambda
lnum
:
ConformerEncoderLayer
(
attention_dim
,
encoder_selfattn_layer
(
*
encoder_selfattn_layer_args
),
positionwise_layer
(
*
positionwise_layer_args
),
positionwise_layer
(
*
positionwise_layer_args
)
if
macaron_style
else
None
,
convolution_layer
(
*
convolution_layer_args
)
if
use_cnn_module
else
None
,
dropout_rate
,
normalize_before
,
concat_after
,
stochastic_depth_rate
*
float
(
1
+
lnum
)
/
num_blocks
,
),
)
self
.
intermediate_layers
=
intermediate_layers
else
:
else
:
raise
NotImplementedError
(
selfattention_layer_type
)
raise
NotImplementedError
(
"Support only linear or conv1d."
)
self
.
encoders
=
repeat
(
num_blocks
,
lambda
lnum
:
EncoderLayer
(
attention_dim
,
encoder_selfattn_layer
(
*
encoder_selfattn_layer_args
[
lnum
]),
positionwise_layer
(
*
positionwise_layer_args
),
dropout_rate
,
normalize_before
,
concat_after
,
),
)
if
self
.
normalize_before
:
if
self
.
normalize_before
:
self
.
after_norm
=
nn
.
LayerNorm
(
attention_dim
)
self
.
after_norm
=
LayerNorm
(
attention_dim
)
def
get_positionwise_layer
(
def
get_positionwise_layer
(
self
,
self
,
positionwise_layer_type
:
str
=
"linear"
,
positionwise_layer_type
=
"linear"
,
attention_dim
:
int
=
256
,
attention_dim
=
256
,
linear_units
:
int
=
2048
,
linear_units
=
2048
,
dropout_rate
:
float
=
0.1
,
dropout_rate
=
0.
1
,
positionwise_conv_kernel_size
:
int
=
1
,
positionwise_conv_kernel_size
=
1
,
):
activation
:
nn
.
Layer
=
nn
.
ReLU
()
):
"""Define positionwise layer."""
"""Define positionwise layer."""
if
positionwise_layer_type
==
"linear"
:
if
positionwise_layer_type
==
"linear"
:
positionwise_layer
=
PositionwiseFeedForward
positionwise_layer
=
PositionwiseFeedForward
positionwise_layer_args
=
(
attention_dim
,
linear_units
,
positionwise_layer_args
=
(
attention_dim
,
linear_units
,
dropout_rate
)
dropout_rate
,
activation
)
elif
positionwise_layer_type
==
"conv1d"
:
elif
positionwise_layer_type
==
"conv1d"
:
positionwise_layer
=
MultiLayeredConv1d
positionwise_layer
=
MultiLayeredConv1d
positionwise_layer_args
=
(
attention_dim
,
linear_units
,
positionwise_layer_args
=
(
attention_dim
,
linear_units
,
...
@@ -166,6 +213,81 @@ class Encoder(nn.Layer):
...
@@ -166,6 +213,81 @@ class Encoder(nn.Layer):
raise
NotImplementedError
(
"Support only linear or conv1d."
)
raise
NotImplementedError
(
"Support only linear or conv1d."
)
return
positionwise_layer
,
positionwise_layer_args
return
positionwise_layer
,
positionwise_layer_args
def get_encoder_selfattn_layer(self,
                               selfattention_layer_type: str="selfattn",
                               attention_heads: int=4,
                               attention_dim: int=256,
                               attention_dropout_rate: float=0.0,
                               zero_triu: bool=False,
                               pos_enc_layer_type: str="abs_pos"):
    """Pick the self-attention layer class and its constructor arguments.

    Parameters
    ----------
    selfattention_layer_type : str
        Either "selfattn" or "rel_selfattn".
    attention_heads : int
        Forwarded as the first constructor argument.
    attention_dim : int
        Forwarded as the second constructor argument.
    attention_dropout_rate : float
        Forwarded as the third constructor argument.
    zero_triu : bool
        Forwarded as the fourth constructor argument, "rel_selfattn" only.
    pos_enc_layer_type : str
        Must be "rel_pos" when "rel_selfattn" is requested.

    Returns
    ----------
    tuple
        ``(layer_class, layer_args)``; the class is not instantiated here.

    Raises
    ----------
    ValueError
        If ``selfattention_layer_type`` is not one of the supported names.
    AssertionError
        If "rel_selfattn" is requested without "rel_pos".
    """
    # Reject unsupported names first so the happy paths can return early.
    if selfattention_layer_type not in ("selfattn", "rel_selfattn"):
        raise ValueError("unknown encoder_attn_layer: " +
                         selfattention_layer_type)

    shared_args = (attention_heads, attention_dim, attention_dropout_rate)
    if selfattention_layer_type == "selfattn":
        return MultiHeadedAttention, shared_args

    # Relative-position attention is only valid together with "rel_pos".
    assert pos_enc_layer_type == "rel_pos"
    return RelPositionMultiHeadedAttention, shared_args + (zero_triu, )
def get_pos_enc_class(self,
                      pos_enc_layer_type: str="abs_pos",
                      selfattention_layer_type: str="selfattn"):
    """Map a positional-encoding name to the class implementing it.

    Parameters
    ----------
    pos_enc_layer_type : str
        One of "abs_pos", "scaled_abs_pos" or "rel_pos".
    selfattention_layer_type : str
        Must be "rel_selfattn" when "rel_pos" is requested.

    Returns
    ----------
    type
        The positional encoding class (not an instance).

    Raises
    ----------
    ValueError
        If ``pos_enc_layer_type`` is not a supported name.
    AssertionError
        If "rel_pos" is requested without "rel_selfattn".
    """
    if pos_enc_layer_type == "abs_pos":
        return PositionalEncoding
    if pos_enc_layer_type == "scaled_abs_pos":
        return ScaledPositionalEncoding
    if pos_enc_layer_type == "rel_pos":
        # Relative encoding is only valid with relative self-attention.
        assert selfattention_layer_type == "rel_selfattn"
        return RelPositionalEncoding
    raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
def get_embed(self,
              idim,
              input_layer="conv2d",
              attention_dim: int=256,
              pos_enc_class=PositionalEncoding,
              dropout_rate: float=0.1,
              positional_dropout_rate: float=0.1,
              padding_idx: int=-1):
    """Build the input embedding layer followed by positional encoding.

    Parameters
    ----------
    idim : int
        Input size handed to the first layer of the embedding.
    input_layer : str or nn.Layer or None
        "linear", "conv2d", "embed", an ``nn.Layer`` instance that is used
        as-is, or None to apply positional encoding only.
    attention_dim : int
        Output size of the embedding.
    pos_enc_class : type
        Class instantiated as
        ``pos_enc_class(attention_dim, positional_dropout_rate)``.
    dropout_rate : float
        Dropout rate used by the "linear" and "conv2d" embeddings.
    positional_dropout_rate : float
        Dropout rate handed to the positional encoding.
    padding_idx : int
        ``padding_idx`` of ``nn.Embedding`` for the "embed" variant.

    Returns
    ----------
    nn.Layer
        The embedding layer.

    Raises
    ----------
    ValueError
        If ``input_layer`` is none of the supported values.

    Notes
    ----------
    The "conv2d" variant also sets ``self.conv_subsampling_factor = 4``.
    """
    if input_layer == "linear":
        embed = nn.Sequential(
            nn.Linear(idim, attention_dim),
            nn.LayerNorm(attention_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate), )
    elif input_layer == "conv2d":
        embed = Conv2dSubsampling(
            idim,
            attention_dim,
            dropout_rate,
            pos_enc_class(attention_dim, positional_dropout_rate), )
        self.conv_subsampling_factor = 4
    elif input_layer == "embed":
        embed = nn.Sequential(
            nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
            pos_enc_class(attention_dim, positional_dropout_rate), )
    elif isinstance(input_layer, nn.Layer):
        # Caller supplied a ready-made layer; only append the encoding.
        embed = nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate), )
    elif input_layer is None:
        embed = nn.Sequential(
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        # Format instead of concatenating: a non-string value reaching this
        # branch would otherwise raise TypeError and mask the real error.
        raise ValueError(f"unknown input_layer: {input_layer}")
    return embed
def
forward
(
self
,
xs
,
masks
):
def
forward
(
self
,
xs
,
masks
):
"""Encode input sequence.
"""Encode input sequence.
...
@@ -174,21 +296,55 @@ class Encoder(nn.Layer):
...
@@ -174,21 +296,55 @@ class Encoder(nn.Layer):
xs : paddle.Tensor
xs : paddle.Tensor
Input tensor (#batch, time, idim).
Input tensor (#batch, time, idim).
masks : paddle.Tensor
masks : paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch,
1,
time).
Returns
Returns
----------
----------
paddle.Tensor
paddle.Tensor
Output tensor (#batch, time, attention_dim).
Output tensor (#batch, time, attention_dim).
paddle.Tensor
paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch,
1,
time).
"""
"""
if
self
.
encoder_type
==
"transformer"
:
xs
=
self
.
embed
(
xs
)
xs
,
masks
=
self
.
encoders
(
xs
,
masks
)
if
self
.
normalize_before
:
xs
=
self
.
after_norm
(
xs
)
return
xs
,
masks
elif
self
.
encoder_type
==
"conformer"
:
if
isinstance
(
self
.
embed
,
(
Conv2dSubsampling
)):
xs
,
masks
=
self
.
embed
(
xs
,
masks
)
else
:
xs
=
self
.
embed
(
xs
)
xs
=
self
.
embed
(
xs
)
if
self
.
intermediate_layers
is
None
:
xs
,
masks
=
self
.
encoders
(
xs
,
masks
)
xs
,
masks
=
self
.
encoders
(
xs
,
masks
)
if
self
.
normalize_before
:
else
:
xs
=
self
.
after_norm
(
xs
)
intermediate_outputs
=
[]
return
xs
,
masks
for
layer_idx
,
encoder_layer
in
enumerate
(
self
.
encoders
):
xs
,
masks
=
encoder_layer
(
xs
,
masks
)
if
(
self
.
intermediate_layers
is
not
None
and
layer_idx
+
1
in
self
.
intermediate_layers
):
# intermediate branches also require normalization.
encoder_output
=
xs
if
isinstance
(
encoder_output
,
tuple
):
encoder_output
=
encoder_output
[
0
]
if
self
.
normalize_before
:
encoder_output
=
self
.
after_norm
(
encoder_output
)
intermediate_outputs
.
append
(
encoder_output
)
if
isinstance
(
xs
,
tuple
):
xs
=
xs
[
0
]
if
self
.
normalize_before
:
xs
=
self
.
after_norm
(
xs
)
if
self
.
intermediate_layers
is
not
None
:
return
xs
,
masks
,
intermediate_outputs
return
xs
,
masks
else
:
raise
ValueError
(
f
"
{
self
.
encoder_type
}
is not supported."
)
def
forward_one_step
(
self
,
xs
,
masks
,
cache
=
None
):
def
forward_one_step
(
self
,
xs
,
masks
,
cache
=
None
):
"""Encode input frame.
"""Encode input frame.
...
...
paddlespeech/t2s/modules/transformer/subsampling.py
浏览文件 @
bc0dd511
...
@@ -18,38 +18,6 @@ import paddle
...
@@ -18,38 +18,6 @@ import paddle
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
from
paddlespeech.t2s.modules.transformer.embedding
import
PositionalEncoding
class TooShortUttError(Exception):
    """Error signalling an utterance that is too short to be subsampled.

    Parameters
    ----------
    message : str
        Text passed on to the base ``Exception``.
    actual_size : int
        The size that could not pass the subsampling.
    limit : int
        The size limit required by the subsampling.
    """

    def __init__(self, message, actual_size, limit):
        """Keep the offending size and the limit next to the message."""
        super().__init__(message)
        self.actual_size, self.limit = actual_size, limit
def check_short_utt(ins, size):
    """Tell whether ``size`` is below the threshold of subsampling layer ``ins``.

    Returns ``(True, threshold)`` when ``ins`` is one of the known
    subsampling layers and ``size`` is smaller than that layer's threshold,
    otherwise ``(False, -1)``.
    """
    # (layer class, threshold) pairs, tested in this order.
    thresholds = (
        (Conv2dSubsampling2, 3),
        (Conv2dSubsampling, 7),
        (Conv2dSubsampling6, 11),
        (Conv2dSubsampling8, 15), )
    for layer_cls, threshold in thresholds:
        if isinstance(ins, layer_cls) and size < threshold:
            return True, threshold
    return False, -1
class
Conv2dSubsampling
(
paddle
.
nn
.
Layer
):
class
Conv2dSubsampling
(
paddle
.
nn
.
Layer
):
"""Convolutional 2D subsampling (to 1/4 length).
"""Convolutional 2D subsampling (to 1/4 length).
Parameters
Parameters
...
@@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
...
@@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
raise
NotImplementedError
(
raise
NotImplementedError
(
"Support only `-1` (for `reset_parameters`)."
)
"Support only `-1` (for `reset_parameters`)."
)
return
self
.
out
[
key
]
return
self
.
out
[
key
]
class Conv2dSubsampling2(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/2 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate of the default positional encoding.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer; a ``PositionalEncoding`` is created
        when omitted.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Build the convolution stack and the output projection."""
        super().__init__()
        # Two 3x3 convolutions: stride 2 first, then stride 1.
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 1),
            paddle.nn.ReLU(), )
        # Feature-axis size left after both convolutions.
        reduced_dim = (idim - 1) // 2 - 2
        projection = paddle.nn.Linear(odim * reduced_dim, odim)
        if pos_enc is None:
            pos_enc = PositionalEncoding(odim, dropout_rate)
        self.out = paddle.nn.Sequential(projection, pos_enc)

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time), or None.

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 2.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 2; None when no mask was given.
        """
        # Add a channel axis so the layout is (b, c, t, f).
        hidden = self.conv(x.unsqueeze(1))
        batch, channels, frames, feats = paddle.shape(hidden)
        # Fold channels into the feature axis before the projection.
        flattened = hidden.transpose([0, 2, 1, 3]).reshape(
            [batch, frames, channels * feats])
        output = self.out(flattened)
        if x_mask is None:
            return output, None
        # One slice per convolution, applied to the time axis of the mask.
        return output, x_mask[:, :, :-2:2][:, :, :-2:1]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positioning encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]
class Conv2dSubsampling6(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/6 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate of the default positional encoding.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer; a ``PositionalEncoding`` is created
        when omitted.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling6 object."""
        super(Conv2dSubsampling6, self).__init__()
        # A 3x3 stride-2 convolution followed by a 5x5 stride-3 one.
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 5, 3),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time), or None.

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 6.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 6; None when no mask was given.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
        # Fold channels into the feature axis before the projection.
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        # One slice per convolution, applied to the time axis of the mask.
        return x, x_mask[:, :, :-2:2][:, :, :-4:3]

    def __getitem__(self, key):
        """Get item.

        Provided for parity with Conv2dSubsampling2: only ``-1`` is
        accepted and yields the last layer of ``self.out`` (the positional
        encoding), which is what `reset_parameters` looks up.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]
class Conv2dSubsampling8(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/8 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate of the default positional encoding.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer; a ``PositionalEncoding`` is created
        when omitted.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct an Conv2dSubsampling8 object."""
        super(Conv2dSubsampling8, self).__init__()
        # Three 3x3 convolutions, each with stride 2.
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
                             odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time), or None.

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 8.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 8; None when no mask was given.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
        # Fold channels into the feature axis before the projection.
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        # One slice per convolution, applied to the time axis of the mask.
        return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]

    def __getitem__(self, key):
        """Get item.

        Provided for parity with Conv2dSubsampling2: only ``-1`` is
        accepted and yields the last layer of ``self.out`` (the positional
        encoding), which is what `reset_parameters` looks up.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录