PaddlePaddle / Parakeet · commit ab0fe8f3

TransformerTTS precision alignment

Authored by lifuchen on Jan 13, 2020; committed by chenfeiyu on Jan 13, 2020.
Parent: ae88be34
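Taken together, the diffs below suggest what "precision alignment" means in practice: dg.Linear is replaced throughout by a new Linear wrapper with explicit initialization (Xavier weights, uniform bias), the fixed ParamAttr names ('weight', 'bias', 'alpha') that Paddle fluid can resolve to a single shared parameter are removed, a stop-token loss is wired into TransformerTTS training, and FastSpeech's config keys gain an fs_ prefix so they no longer collide with the transformerTTS teacher settings. (This summary is inferred from the diffs; the commit message itself says only "TransformerTTS precision alignment".)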
Showing 14 changed files with 93 additions and 104 deletions (+93, -104).
parakeet/g2p/text/cleaners.py                         +1   -1
parakeet/models/fastspeech/config/fastspeech.yaml     +6   -5
parakeet/models/fastspeech/modules.py                 +0   -2
parakeet/models/fastspeech/network.py                 +9   -9
parakeet/models/fastspeech/parse.py                   +9   -8
parakeet/models/transformerTTS/module.py              +10  -36
parakeet/models/transformerTTS/network.py             +11  -14
parakeet/models/transformerTTS/synthesis.py           +2   -2
parakeet/models/transformerTTS/train_postnet.py       +0   -2
parakeet/models/transformerTTS/train_transformer.py   +5   -6
parakeet/modules/layers.py                            +19  -0
parakeet/modules/multihead_attention.py               +8   -6
parakeet/modules/post_convnet.py                      +10  -11
parakeet/modules/prenet.py                            +3   -2
--- a/parakeet/g2p/text/cleaners.py
+++ b/parakeet/g2p/text/cleaners.py
@@ -89,7 +89,7 @@ def transliteration_cleaners(text):
 def english_cleaners(text):
     '''Pipeline for English text, including number and abbreviation expansion.'''
     text = convert_to_ascii(text)
-    text = add_punctuation(text)
+    # text = add_punctuation(text)
     text = lowercase(text)
     text = expand_numbers(text)
     text = expand_abbreviations(text)
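The only change here is that english_cleaners no longer injects punctuation into the input text; the add_punctuation call is kept as a comment rather than deleted.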
--- a/parakeet/models/fastspeech/config/fastspeech.yaml
+++ b/parakeet/models/fastspeech/config/fastspeech.yaml
@@ -14,13 +14,11 @@ encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
 max_sep_len: 2048
-encoder_output_size: 384
-embedding_size: 384
+fs_embedding_size: 384
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
-decoder_output_size: 384
-hidden_size: 384
+fs_hidden_size: 384
 duration_predictor_output_size: 256
 duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
@@ -28,6 +26,9 @@ fft_conv1d_padding: 1
 dropout: 0.1
+transformer_head: 4
+embedding_size: 512
+hidden_size: 256
 warm_up_step: 4000
 grad_clip_thresh: 0.1
 batch_size: 32
@@ -39,5 +40,5 @@ use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
 transtts_path: ../transformerTTS/checkpoint
-transformer_step: 20
+transformer_step: 1
 log_dir: ./log
\ No newline at end of file
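The renames (embedding_size to fs_embedding_size, hidden_size to fs_hidden_size) free the unprefixed names for the transformerTTS teacher settings added in the middle hunk (transformer_head, embedding_size: 512, hidden_size: 256), so one YAML file can configure both the FastSpeech student and its transformer teacher without key collisions. encoder_output_size and decoder_output_size disappear because, as the network.py diff below shows, every projection width is now read from fs_hidden_size.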
--- a/parakeet/models/fastspeech/modules.py
+++ b/parakeet/models/fastspeech/modules.py
@@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward
-
-
 class FFTBlock(dg.Layer):
     def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
         super(FFTBlock, self).__init__()
--- a/parakeet/models/fastspeech/network.py
+++ b/parakeet/models/fastspeech/network.py
 from utils import *
-from modules import *
+from modules import FFTBlock, LengthRegulator
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
         self.encoder = Encoder(n_src_vocab=len(symbols) + 1,
                                len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.embedding_size,
+                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.encoder_n_layer,
                                n_head=cfg.encoder_head,
                                d_k=64,
                                d_v=64,
-                               d_model=cfg.hidden_size,
+                               d_model=cfg.fs_hidden_size,
                                d_inner=cfg.encoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.length_regulator = LengthRegulator(input_size=cfg.hidden_size,
+        self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
                                                 out_channels=cfg.duration_predictor_output_size,
                                                 filter_size=cfg.duration_predictor_filter_size,
                                                 dropout=cfg.dropout)
         self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.embedding_size,
+                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.decoder_n_layer,
                                n_head=cfg.decoder_head,
                                d_k=64,
                                d_v=64,
-                               d_model=cfg.hidden_size,
+                               d_model=cfg.fs_hidden_size,
                                d_inner=cfg.decoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels)
-        self.postnet = PostConvNet(n_mels=80,
+        self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
+        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
                                    num_hidden=512,
                                    filter_size=5,
                                    padding=int(5 / 2),
                                    num_conv=5,
-                                   outputs_per_step=1,
+                                   outputs_per_step=cfg.audio.outputs_per_step,
                                    use_cudnn=True,
                                    dropout=0.1)
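Besides the config renames, mel_linear now projects to num_mels * outputs_per_step instead of plain num_mels, and the postnet reads n_mels and outputs_per_step from the config instead of hard-coding 80 and 1. A minimal NumPy sketch (hypothetical shapes) of why the projection width carries the Tacotron-style reduction factor r = outputs_per_step:

```python
import numpy as np

batch, dec_len, num_mels, r = 2, 50, 80, 4                       # r = outputs_per_step, hypothetical
decoder_out = np.random.randn(batch, dec_len, num_mels * r)      # what mel_linear now emits per step
mel_frames = decoder_out.reshape(batch, dec_len * r, num_mels)   # r mel frames unfolded per decoder step
assert mel_frames.shape == (2, 200, 80)
```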
--- a/parakeet/models/fastspeech/parse.py
+++ b/parakeet/models/fastspeech/parse.py
@@ -22,8 +22,8 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--audio.outputs_per_step', type=int, default=1,
         help="the outputs per step.")
-    parser.add_argument('--embedding_size', type=int, default=256,
-        help="the dim size of embedding.")
+    parser.add_argument('--fs_embedding_size', type=int, default=256,
+        help="the dim size of embedding of fastspeech.")
     parser.add_argument('--encoder_n_layer', type=int, default=6,
         help="the number of FFT Block in encoder.")
     parser.add_argument('--encoder_head', type=int, default=2,
@@ -32,18 +32,14 @@ def add_config_options_to_parser(parser):
         help="the filter size of conv1d in encoder.")
     parser.add_argument('--max_sep_len', type=int, default=2048,
         help="the max length of sequence.")
-    parser.add_argument('--encoder_output_size', type=int, default=256,
-        help="the output channel size of encoder.")
     parser.add_argument('--decoder_n_layer', type=int, default=6,
         help="the number of FFT Block in decoder.")
     parser.add_argument('--decoder_head', type=int, default=2,
         help="the attention head number in decoder.")
     parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
         help="the filter size of conv1d in decoder.")
-    parser.add_argument('--decoder_output_size', type=int, default=256,
-        help="the output channel size of decoder.")
-    parser.add_argument('--hidden_size', type=int, default=256,
-        help="the hidden size in model.")
+    parser.add_argument('--fs_hidden_size', type=int, default=256,
+        help="the hidden size in model of fastspeech.")
     parser.add_argument('--duration_predictor_output_size', type=int, default=256,
         help="the output size of duration predictior.")
     parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
@@ -57,6 +53,11 @@ def add_config_options_to_parser(parser):
+    parser.add_argument('--transformer_head', type=int, default=4,
+        help="the attention head num of transformerTTS.")
+    parser.add_argument('--hidden_size', type=int, default=256,
+        help="the hidden size in model of transformerTTS.")
+    parser.add_argument('--embedding_size', type=int, default=256,
+        help="the dim size of embedding of transformerTTS.")
     parser.add_argument('--warm_up_step', type=int, default=4000,
         help="the warm up step of learning rate.")
     parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
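The CLI mirrors the YAML: --embedding_size and --hidden_size now describe the transformerTTS teacher, while the FastSpeech model reads --fs_embedding_size and --fs_hidden_size, and the unused --encoder_output_size and --decoder_output_size flags are dropped.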
--- a/parakeet/models/transformerTTS/module.py
+++ b/parakeet/models/transformerTTS/module.py
@@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from parakeet.modules.layers import Conv, Pool1D
+from parakeet.modules.layers import Conv, Pool1D, Linear
 from parakeet.modules.dynamicGRU import DynamicGRU
 import numpy as np

 class EncoderPrenet(dg.Layer):
     def __init__(self, embedding_size, num_hidden, use_cudnn=True):
         super(EncoderPrenet, self).__init__()
         self.embedding_size = embedding_size
         self.num_hidden = num_hidden
         self.use_cudnn = use_cudnn
-        self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
-                                      param_attr=fluid.ParamAttr(name='weight'),
-                                      padding_idx=None)
+        self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
+                                      padding_idx=None)
         self.conv_list = []
         self.conv_list.append(Conv(in_channels=embedding_size,
@@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

-        self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
-                                             data_layout='NCHW') for _ in range(3)]
+        self.batch_norm_list = [dg.BatchNorm(num_hidden,
+                                             data_layout='NCHW',
+                                             epsilon=1e-30) for _ in range(3)]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)

-        self.projection = dg.Linear(num_hidden, num_hidden)
+        self.projection = Linear(num_hidden, num_hidden)

     def forward(self, x):
         x = self.embedding(x)  #(batch_size, seq_len, embending_size)
@@ -90,10 +84,6 @@ class CBHG(dg.Layer):
         self.batchnorm_list = []
         for i in range(K):
-            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
-                                                    param_attr=fluid.ParamAttr(name='weight'),
-                                                    bias_attr=fluid.ParamAttr(name='bias'),
-                                                    moving_mean_name='moving_mean',
-                                                    moving_variance_name='moving_var',
-                                                    data_layout='NCHW'))
+            self.batchnorm_list.append(dg.BatchNorm(hidden_size,
+                                                    data_layout='NCHW'))
         for i, layer in enumerate(self.batchnorm_list):
@@ -114,16 +104,8 @@ class CBHG(dg.Layer):
                              data_format="NCT")
-        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
-                                             data_layout='NCHW')
-        self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
-                                             data_layout='NCHW')
+        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
+        self.batchnorm_proj_2 = dg.BatchNorm(projection_size, data_layout='NCHW')
         self.max_pool = Pool1D(pool_size=max_pool_kernel_size, pool_type='max',
@@ -134,32 +116,24 @@ class CBHG(dg.Layer):
         h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
         h_0 = dg.to_variable(h_0)
-        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward1 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=False,
                                        origin_mode=True,
                                        h_0=h_0)
         self.gru_reverse1 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0=h_0)
-        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward2 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=False,
                                        origin_mode=True,
                                        h_0=h_0)
         self.gru_reverse2 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0=h_0)
@@ -216,8 +190,8 @@ class Highwaynet(dg.Layer):
         self.linears = []
         for i in range(num_layers):
-            self.linears.append(dg.Linear(num_units, num_units))
-            self.gates.append(dg.Linear(num_units, num_units))
+            self.linears.append(Linear(num_units, num_units))
+            self.gates.append(Linear(num_units, num_units))
         for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
             self.add_sublayer("linears_{}".format(i), linear)
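Two themes recur in this file. First, every dg.Linear becomes the new parakeet.modules.layers.Linear (imported at the top), picking up its explicit initialization. Second, the param_attr=fluid.ParamAttr(name='weight') and bias_attr=fluid.ParamAttr(name='bias') arguments are stripped from the BatchNorm and DynamicGRU constructors; in Paddle fluid, parameters created with the same explicit name can resolve to one shared parameter, so these fixed names risked unintentionally tying weights across layers (my reading of fluid's naming semantics; the diff itself only shows the removal). The EncoderPrenet BatchNorms also gain epsilon=1e-30, presumably to match the reference implementation's normalization constant.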
--- a/parakeet/models/transformerTTS/network.py
+++ b/parakeet/models/transformerTTS/network.py
 from parakeet.models.transformerTTS.module import *
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
-from parakeet.modules.layers import Conv1D
+from parakeet.modules.layers import Conv1D, Linear
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward
@@ -13,8 +13,7 @@ class Encoder(dg.Layer):
     def __init__(self, embedding_size, num_hidden, config):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
-        param = fluid.ParamAttr(name='alpha',
-                                initializer=fluid.initializer.Constant(value=1.0))
+        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
         self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
         self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
         self.pos_emb = dg.Embedding(size=[1024, num_hidden],
@@ -39,13 +38,13 @@ class Encoder(dg.Layer):
         else:
             query_mask, mask = None, None

         # Encoder pre_network
         x = self.encoder_prenet(x)  #(N,T,C)

         # Get positional encoding
         positional = self.pos_emb(positional)
         x = positional * self.alpha + x  #(N, T, C)
@@ -65,21 +64,20 @@ class Decoder(dg.Layer):
     def __init__(self, num_hidden, config):
         super(Decoder, self).__init__()
         self.num_hidden = num_hidden
-        param = fluid.ParamAttr(name='alpha')
+        param = fluid.ParamAttr()
         self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
                                            default_initializer=fluid.initializer.ConstantInitializer(value=1.0))
         self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
         self.pos_emb = dg.Embedding(size=[1024, num_hidden],
                                     padding_idx=0,
                                     param_attr=fluid.ParamAttr(
                                         name='weight',
                                         initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                         trainable=False))
         self.decoder_prenet = PreNet(input_size=config.audio.num_mels,
                                      hidden_size=num_hidden * 2,
                                      output_size=num_hidden,
                                      dropout_rate=0.2)
-        self.linear = dg.Linear(num_hidden, num_hidden)
+        self.linear = Linear(num_hidden, num_hidden)
         self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden // 4, num_hidden // 4) for _ in range(3)]
         for i, layer in enumerate(self.selfattn_layers):
@@ -90,8 +88,8 @@ class Decoder(dg.Layer):
         self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden * 4, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
-        self.stop_linear = dg.Linear(num_hidden, 1)
+        self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
+        self.stop_linear = Linear(num_hidden, 1)
         self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
                                        filter_size=5, padding=4, num_conv=5,
@@ -115,10 +113,10 @@ class Decoder(dg.Layer):
             mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
             mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
             m_mask, zero_mask = None, None

         # Decoder pre-network
         query = self.decoder_prenet(query)

         # Centered position
         query = self.linear(query)
@@ -132,14 +130,13 @@ class Decoder(dg.Layer):
         # Attention decoder-decoder, encoder-decoder
         selfattn_list = list()
         attn_list = list()
         for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
             query, attn_dec = selfattn(query, query, query, mask=mask, query_mask=m_mask)
             query, attn_dot = attn(key, value, query, mask=zero_mask, query_mask=m_mask)
             query = ffn(query)
             selfattn_list.append(attn_dec)
             attn_list.append(attn_dot)

         # Mel linear projection
         mel_out = self.mel_linear(query)
         # Post Mel Network
@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
         # key (batch_size, seq_len, channel)
         # c_mask (batch_size, seq_len)
         # attns_enc (channel / 2, seq_len, seq_len)
         key, c_mask, attns_enc = self.encoder(characters, pos_text)
         # mel_output/postnet_output (batch_size, mel_len, n_mel)
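The same de-naming applies to the position-encoding scale: both Encoder and Decoder used fluid.ParamAttr(name='alpha') for their learnable alpha, which under fluid's name-based parameter lookup could make the two modules share a single scalar; after the change each create_parameter call gets an anonymous ParamAttr and its own constant-1.0 initializer. The decoder's centering linear, mel_linear, and stop_linear also switch to the custom Linear, and the postconvnet now reads config.hidden_size, which the YAML diff above reassigned to the transformerTTS value of 256.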
--- a/parakeet/models/transformerTTS/synthesis.py
+++ b/parakeet/models/transformerTTS/synthesis.py
@@ -2,7 +2,7 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import Model, ModelPostNet
+from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
 import paddle.fluid as fluid
@@ -28,7 +28,7 @@ def synthesis(text_input, cfg):
     writer = SummaryWriter(path)
     with dg.guard(place):
-        model = Model(cfg)
+        model = TransformerTTS(cfg)
         model_postnet = ModelPostNet(cfg)
         model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
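Synthesis only tracks a rename: the checkpointed acoustic model is now constructed as TransformerTTS(cfg) rather than Model(cfg).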
--- a/parakeet/models/transformerTTS/train_postnet.py
+++ b/parakeet/models/transformerTTS/train_postnet.py
@@ -89,8 +89,6 @@ def main(cfg):
             else:
                 loss.backward()
             optimizer.minimize(loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
-            print("===============", model.pre_proj.conv.weight.numpy())
-            print("===============", model.pre_proj.conv.weight.gradient())
             model.clear_gradients()

             if local_rank == 0:
--- a/parakeet/models/transformerTTS/train_transformer.py
+++ b/parakeet/models/transformerTTS/train_transformer.py
@@ -63,7 +63,7 @@ def main(cfg):
         optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step), parameter_list=model.parameters())

-        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
+        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()

         if cfg.checkpoint_path is not None:
             model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
@@ -78,26 +78,25 @@ def main(cfg):
         for epoch in range(cfg.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d' % epoch)
                 character, mel, mel_input, pos_text, pos_mel, text_length = data

                 global_step += 1

                 mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)

+                label = np.zeros(stop_preds.shape).astype(np.float32)
+                text_length = text_length.numpy()
+                for i in range(label.shape[0]):
+                    label[i][text_length[i] - 1] = 1

                 mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
                 post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
+                stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
-                loss = mel_loss + post_mel_loss
+                loss = mel_loss + post_mel_loss + stop_loss

                 if local_rank == 0:
                     writer.add_scalars('training_loss', {
                         'mel_loss': mel_loss.numpy(),
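Training now supervises the stop token: a target tensor shaped like stop_preds is zeroed everywhere except the final valid frame of each sample, and its cross-entropy joins the two mel losses. A minimal NumPy sketch of the label construction (hypothetical shapes; note the loop indexes by text_length, exactly as the diff above does):

```python
import numpy as np

stop_preds_shape = (2, 6, 1)            # (batch, mel_len, 1), hypothetical
text_length = np.array([4, 6])          # per-sample lengths from the data loader
label = np.zeros(stop_preds_shape).astype(np.float32)
for i in range(label.shape[0]):
    label[i][text_length[i] - 1] = 1    # 1 at the last valid frame, 0 elsewhere
print(label[:, :, 0])
# [[0. 0. 0. 1. 0. 0.]
#  [0. 0. 0. 0. 0. 1.]]
```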
--- a/parakeet/modules/layers.py
+++ b/parakeet/modules/layers.py
@@ -5,6 +5,25 @@ import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg

+class Linear(dg.Layer):
+    def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
+        super(Linear, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.dtype = dtype
+        self.weight = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())
+        self.bias = is_bias
+
+        if is_bias is not False:
+            k = math.sqrt(1 / in_features)
+            self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))
+
+        self.linear = dg.Linear(in_features, out_features, param_attr=self.weight,
+                                bias_attr=self.bias,)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x
+
 class Conv(dg.Layer):
     def __init__(self, in_channels, out_channels, filter_size=1,
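This new wrapper is the heart of the commit: everywhere dg.Linear is replaced, layers now get a Xavier-initialized weight and, unless is_bias=False, a bias drawn from U(-k, k) with k = sqrt(1/in_features). That bias recipe matches what PyTorch's nn.Linear uses by default, which fits a precision-alignment goal (the PyTorch comparison is my inference; the commit does not say so). A minimal usage sketch in dygraph mode, assuming this commit's Parakeet is on the path:

```python
import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.modules.layers import Linear  # the wrapper added by this commit

with dg.guard():
    proj = Linear(256, 80)                            # Xavier weight, bias ~ U(-k, k), k = sqrt(1/256)
    x = dg.to_variable(np.random.randn(4, 256).astype("float32"))
    y = proj(x)                                       # shape (4, 80)
    head = Linear(256, 256, is_bias=False)            # the bias-free form used for attention projections
```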
--- a/parakeet/modules/multihead_attention.py
+++ b/parakeet/modules/multihead_attention.py
@@ -2,6 +2,7 @@ import math
 import numpy as np
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class ScaledDotProductAttention(dg.Layer):
     def __init__(self, d_key):
@@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer):
             attention = attention * mask
             mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
             attention = attention + mask

         attention = layers.softmax(attention)
         attention = layers.dropout(attention, dropout)

         # Mask query to ignore padding
         if query_mask is not None:
             attention = attention * query_mask
@@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer):
         self.d_q = d_q
         self.dropout = dropout

-        self.key = dg.Linear(num_hidden, num_head * d_k)
-        self.value = dg.Linear(num_hidden, num_head * d_k)
-        self.query = dg.Linear(num_hidden, num_head * d_q)
+        self.key = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.value = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.query = Linear(num_hidden, num_head * d_q, is_bias=False)

         self.scal_attn = ScaledDotProductAttention(d_k)
-        self.fc = dg.Linear(num_head * d_q, num_hidden)
+        self.fc = Linear(num_head * d_q * 2, num_hidden)
         self.layer_norm = dg.LayerNorm(num_hidden)
@@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer):
         result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
         result = layers.reshape(layers.transpose(result, [1, 2, 0, 3]), [batch_size, seq_len_query, -1])
+        result = layers.concat([query_input, result], axis=-1)

         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
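Two coupled changes here: the key/value/query projections lose their biases (is_bias=False), and the output projection fc widens from num_head * d_q to num_head * d_q * 2 because the flattened attention result is now concatenated with the raw query_input before being projected back. A shape-only sketch (hypothetical sizes) of the new path:

```python
import numpy as np

batch, seq_len, num_head, d_q, num_hidden = 2, 10, 4, 64, 256   # here num_hidden == num_head * d_q
result = np.random.randn(batch, seq_len, num_head * d_q)        # flattened multi-head output
query_input = np.random.randn(batch, seq_len, num_hidden)       # pre-attention input
concat = np.concatenate([query_input, result], axis=-1)         # what layers.concat produces
assert concat.shape[-1] == num_head * d_q * 2                   # matches the new fc in_features
```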
--- a/parakeet/modules/post_convnet.py
+++ b/parakeet/modules/post_convnet.py
@@ -16,6 +16,7 @@ class PostConvNet(dg.Layer):
         super(PostConvNet, self).__init__()
         self.dropout = dropout
+        self.num_conv = num_conv
         self.conv_list = []
         self.conv_list.append(Conv(in_channels=n_mels * outputs_per_step,
                                    out_channels=num_hidden,
@@ -43,17 +44,9 @@ class PostConvNet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
                                              data_layout='NCHW') for _ in range(num_conv - 1)]
-        self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
-                                                 param_attr=fluid.ParamAttr(name='weight'),
-                                                 bias_attr=fluid.ParamAttr(name='bias'),
-                                                 moving_mean_name='moving_mean',
-                                                 moving_variance_name='moving_var',
-                                                 data_layout='NCHW'))
+        #self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
+        #                                         data_layout='NCHW'))
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
@@ -67,9 +60,15 @@ class PostConvNet(dg.Layer):
         Returns:
             output (Variable), Shape(B, T, C), the result after postconvnet.
         """
         input = layers.transpose(input, [0, 2, 1])
+        len = input.shape[-1]
-        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
+        for i in range(self.num_conv - 1):
+            batch_norm = self.batch_norm_list[i]
+            conv = self.conv_list[i]
+            input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
+        conv = self.conv_list[self.num_conv - 1]
+        input = conv(input)[:, :, :len]
         output = layers.transpose(input, [0, 2, 1])
         return output
\ No newline at end of file
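forward() now remembers the pre-convolution length and slices every convolution's output back to it (conv(input)[:, :, :len]), and the loop indexes batch_norm_list and conv_list explicitly so the final convolution runs without a tanh/batch-norm pair; the last BatchNorm is correspondingly commented out of __init__. A small arithmetic sketch of the trimming, assuming the Conv pads symmetrically as configured in network.py (padding=4 for filter_size=5):

```python
T, filter_size, padding = 12, 5, 4
conv_out_len = T + 2 * padding - (filter_size - 1)   # standard 1-D conv output length
assert conv_out_len == T + 4                         # 4 extra frames past the input length
trimmed_len = min(conv_out_len, T)                   # the [:, :, :T] slice keeps the first T frames
assert trimmed_len == T
```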
--- a/parakeet/modules/prenet.py
+++ b/parakeet/modules/prenet.py
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class PreNet(dg.Layer):
     def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
@@ -14,8 +15,8 @@ class PreNet(dg.Layer):
         self.output_size = output_size
         self.dropout_rate = dropout_rate

-        self.linear1 = dg.Linear(input_size, hidden_size)
-        self.linear2 = dg.Linear(hidden_size, output_size)
+        self.linear1 = Linear(input_size, hidden_size)
+        self.linear2 = Linear(hidden_size, output_size)

     def forward(self, x):
         """