PaddlePaddle / Parakeet · Commit f009411b
Commit f009411b, authored Jan 15, 2020 by lifuchen; committed by chenfeiyu, Jan 15, 2020

update fastspeech

Parent: ab0fe8f3
Showing 13 changed files with 58 additions and 66 deletions (+58 / -66)
parakeet/models/fastspeech/config/fastspeech.yaml            +2  -3
parakeet/models/fastspeech/modules.py                        +4  -4
parakeet/models/fastspeech/network.py                        +11 -14
parakeet/models/fastspeech/parse.py                          +0  -2
parakeet/models/fastspeech/train.py                          +8  -11
parakeet/models/transformerTTS/config/synthesis.yaml         +2  -2
parakeet/models/transformerTTS/config/train_postnet.yaml     +2  -2
parakeet/models/transformerTTS/module.py                     +2  -1
parakeet/models/transformerTTS/network.py                    +7  -7
parakeet/models/transformerTTS/synthesis.py                  +9  -9
parakeet/models/transformerTTS/train_transformer.py          +6  -6
parakeet/modules/multihead_attention.py                      +0  -1
parakeet/modules/utils.py                                    +5  -4
parakeet/models/fastspeech/config/fastspeech.yaml

@@ -14,7 +14,6 @@ encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
 max_sep_len: 2048
-fs_embedding_size: 384
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
@@ -39,6 +38,6 @@ use_gpu: True
 use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
-transtts_path: ../transformerTTS/checkpoint
-transformer_step: 1
+transtts_path: ../transformerTTS/checkpoint/
+transformer_step: 10
 log_dir: ./log
\ No newline at end of file
parakeet/models/fastspeech/modules.py

@@ -4,7 +4,7 @@ import utils
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
-from parakeet.modules.layers import Conv1D
+from parakeet.modules.layers import Conv, Linear
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward
@@ -113,12 +113,12 @@ class DurationPredictor(dg.Layer):
         self.filter_size = filter_size
         self.dropout = dropout
-        self.conv1 = Conv1D(in_channels=self.input_size,
+        self.conv1 = Conv(in_channels=self.input_size,
                           out_channels=self.out_channels,
                           filter_size=self.filter_size,
                           padding=1,
                           data_format='NTC')
-        self.conv2 = Conv1D(in_channels=self.out_channels,
+        self.conv2 = Conv(in_channels=self.out_channels,
                           out_channels=self.out_channels,
                           filter_size=self.filter_size,
                           padding=1,
@@ -126,7 +126,7 @@ class DurationPredictor(dg.Layer):
         self.layer_norm1 = dg.LayerNorm(self.out_channels)
         self.layer_norm2 = dg.LayerNorm(self.out_channels)
-        self.linear = dg.Linear(self.out_channels, 1)
+        self.linear = Linear(self.out_channels, 1)

     def forward(self, encoder_output):
         """
parakeet/models/fastspeech/network.py

@@ -5,12 +5,12 @@ import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
 from parakeet.modules.utils import *
 from parakeet.modules.post_convnet import PostConvNet
+from parakeet.modules.layers import Linear

 class Encoder(dg.Layer):
     def __init__(self,
                  n_src_vocab,
                  len_max_seq,
-                 d_word_vec,
                  n_layers,
                  n_head,
                  d_k,
@@ -23,9 +23,9 @@ class Encoder(dg.Layer):
         super(Encoder, self).__init__()
         n_position = len_max_seq + 1
-        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_word_vec], padding_idx=0)
-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
-        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
+        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
+        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+        self.position_enc = dg.Embedding(size=[n_position, d_model],
                                          padding_idx=0,
                                          param_attr=fluid.ParamAttr(
                                              initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
@@ -70,7 +70,6 @@ class Encoder(dg.Layer):
 class Decoder(dg.Layer):
     def __init__(self,
                  len_max_seq,
-                 d_word_vec,
                  n_layers,
                  n_head,
                  d_k,
@@ -83,8 +82,8 @@ class Decoder(dg.Layer):
         super(Decoder, self).__init__()
         n_position = len_max_seq + 1
-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0)
-        self.position_enc = dg.Embedding(size=[n_position, d_word_vec],
+        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+        self.position_enc = dg.Embedding(size=[n_position, d_model],
                                          padding_idx=0,
                                          param_attr=fluid.ParamAttr(
                                              initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
@@ -131,11 +130,10 @@ class FastSpeech(dg.Layer):
         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
                                len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.encoder_n_layer,
                                n_head=cfg.encoder_head,
-                               d_k=64,
-                               d_v=64,
+                               d_k=cfg.fs_hidden_size // cfg.encoder_head,
+                               d_v=cfg.fs_hidden_size // cfg.encoder_head,
                                d_model=cfg.fs_hidden_size,
                                d_inner=cfg.encoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
@@ -146,17 +144,16 @@ class FastSpeech(dg.Layer):
                                          filter_size=cfg.duration_predictor_filter_size,
                                          dropout=cfg.dropout)
         self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               d_word_vec=cfg.fs_embedding_size,
                                n_layers=cfg.decoder_n_layer,
                                n_head=cfg.decoder_head,
-                               d_k=64,
-                               d_v=64,
+                               d_k=cfg.fs_hidden_size // cfg.decoder_head,
+                               d_v=cfg.fs_hidden_size // cfg.decoder_head,
                                d_model=cfg.fs_hidden_size,
                                d_inner=cfg.decoder_conv1d_filter_size,
                                fft_conv1d_kernel=cfg.fft_conv1d_filter,
                                fft_conv1d_padding=cfg.fft_conv1d_padding,
                                dropout=0.1)
-        self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
+        self.mel_linear = Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
         self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
                                    num_hidden=512,
                                    filter_size=5,
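The d_k/d_v change above ties the per-head key and value widths to the hidden size and head count instead of a hard-coded 64. A minimal sketch of the arithmetic; the concrete numbers below are illustrative assumptions, not values taken from this commit's config:

# Illustrative only: fs_hidden_size=384 and encoder_head=2 are assumed values.
fs_hidden_size = 384
encoder_head = 2

d_k = fs_hidden_size // encoder_head   # 192 per head instead of the fixed 64
d_v = fs_hidden_size // encoder_head   # 192
assert d_k * encoder_head == fs_hidden_size  # heads tile the model dimension exactly
print(d_k, d_v)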
parakeet/models/fastspeech/parse.py

@@ -22,8 +22,6 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--audio.outputs_per_step', type=int, default=1,
         help="the outputs per step.")
-    parser.add_argument('--fs_embedding_size', type=int, default=256,
-        help="the dim size of embedding of fastspeech.")
     parser.add_argument('--encoder_n_layer', type=int, default=6,
         help="the number of FFT Block in encoder.")
     parser.add_argument('--encoder_head', type=int, default=2,
parakeet/models/fastspeech/train.py

@@ -55,14 +55,13 @@ def main(cfg):
         writer = SummaryWriter(path) if local_rank == 0 else None

     with dg.guard(place):
-        transformerTTS = TransformerTTS(cfg)
-        model_path = os.path.join(cfg.transtts_path, "transformer")
-        model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
-        #for param in transformerTTS.state_dict():
-        #    print(param)
-        transformerTTS.set_dict(model_dict)
-        transformerTTS.eval()
+        with fluid.unique_name.guard():
+            transformerTTS = TransformerTTS(cfg)
+            model_path = os.path.join(cfg.transtts_path, "transformer")
+            model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, str(cfg.transformer_step)))
+            transformerTTS.set_dict(model_dict)
+            transformerTTS.eval()

         model = FastSpeech(cfg)
         model.train()
@@ -89,7 +88,6 @@ def main(cfg):
             _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
             alignment = dg.to_variable(get_alignment(attn_probs, cfg.transformer_head)).astype(np.float32)
             global_step += 1
             #Forward
@@ -104,8 +102,7 @@ def main(cfg):
             total_loss = mel_loss + mel_postnet_loss + duration_loss

             if local_rank == 0:
-                print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
+                #print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
                 writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                 writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
                 writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
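The train.py change rebuilds the frozen TransformerTTS inside fluid.unique_name.guard() before restoring its checkpoint, so the parameter names generated for the rebuilt layers line up with the names stored in the checkpoint even when other models are created in the same program. A minimal sketch of that load pattern on a toy layer, assuming a Paddle Fluid 1.7-era dygraph environment; TinyNet and the "tiny_ckpt" path are made-up stand-ins, not part of this repository:

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

class TinyNet(dg.Layer):
    # Stand-in for TransformerTTS: a single Linear layer.
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = dg.Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

with dg.guard(fluid.CPUPlace()):
    # Build one instance and save a checkpoint.
    with fluid.unique_name.guard():
        net = TinyNet()
    dg.save_dygraph(net.state_dict(), "tiny_ckpt")

    # Rebuild under a fresh name scope so generated parameter names match
    # the checkpoint, then restore and switch to inference mode.
    with fluid.unique_name.guard():
        net2 = TinyNet()
        param_dict, _ = dg.load_dygraph("tiny_ckpt")
        net2.set_dict(param_dict)
        net2.eval()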
parakeet/models/transformerTTS/config/synthesis.yaml

@@ -11,8 +11,8 @@ audio:
 outputs_per_step: 1
 max_len: 50
-transformer_step: 1
-postnet_step: 1
+transformer_step: 10
+postnet_step: 10
 use_gpu: True
 checkpoint_path: ./checkpoint
parakeet/models/transformerTTS/config/train_postnet.yaml

@@ -18,9 +18,9 @@ grad_clip_thresh: 1.0
 batch_size: 32
 epochs: 10000
 lr: 0.001
-save_step: 500
+save_step: 10
 use_gpu: True
-use_data_parallel: True
+use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
 save_path: ./checkpoint
parakeet/models/transformerTTS/module.py

@@ -35,7 +35,7 @@ class EncoderPrenet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
+                                             data_layout='NCHW') for _ in range(3)]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
@@ -57,6 +57,7 @@ class CBHG(dg.Layer):
         super(CBHG, self).__init__()
         """
         :param hidden_size: dimension of hidden unit
+        :param batch_size: batch size
         :param K: # of convolution banks
         :param projection_size: dimension of projection unit
         :param num_gru_layers: # of layers of GRUcell
parakeet/models/transformerTTS/network.py

@@ -10,7 +10,7 @@ from parakeet.modules.post_convnet import PostConvNet
 class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, config):
+    def __init__(self, embedding_size, num_hidden, config, num_head=4):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@@ -24,10 +24,10 @@ class Encoder(dg.Layer):
         self.encoder_prenet = EncoderPrenet(embedding_size=embedding_size,
                                             num_hidden=num_hidden,
                                             use_cudnn=config.use_gpu)
-        self.layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1, use_cudnn=config.use_gpu) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn=config.use_gpu) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
@@ -61,7 +61,7 @@ class Encoder(dg.Layer):
         return x, query_mask, attentions

 class Decoder(dg.Layer):
-    def __init__(self, num_hidden, config):
+    def __init__(self, num_hidden, config, num_head=4):
         super(Decoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr()
@@ -79,13 +79,13 @@ class Decoder(dg.Layer):
                              dropout_rate=0.2)
         self.linear = Linear(num_hidden, num_hidden)

-        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.selfattn_layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
+        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.attn_layers):
             self.add_sublayer("attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
         self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
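The new num_head=4 default keeps the behaviour of the old hard-coded //4 and *4 factors while letting callers pick a different head count. A small illustration of the derived sizes; the num_hidden values are assumptions for illustration only:

def attention_dims(num_hidden, num_head=4):
    # Per-head key/value width and feed-forward inner width,
    # mirroring the expressions used in Encoder/Decoder above.
    return num_hidden // num_head, num_hidden * num_head

print(attention_dims(256))     # (64, 1024)  -- same as the old hard-coded //4 and *4
print(attention_dims(256, 8))  # (32, 2048)  -- now configurable via num_head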
parakeet/models/transformerTTS/synthesis.py

@@ -28,12 +28,15 @@ def synthesis(text_input, cfg):
         writer = SummaryWriter(path)

     with dg.guard(place):
-        model = TransformerTTS(cfg)
-        model_postnet = ModelPostNet(cfg)
-        model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
+        with fluid.unique_name.guard():
+            model = TransformerTTS(cfg)
+            model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
+            model.eval()

-        model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+        with fluid.unique_name.guard():
+            model_postnet = ModelPostNet(cfg)
+            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+            model_postnet.eval()

         # init input
         text = np.asarray(text_to_sequence(text_input))
         text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
@@ -42,9 +45,6 @@ def synthesis(text_input, cfg):
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])

-        model.eval()
-        model_postnet.eval()
-
         pbar = tqdm(range(cfg.max_len))
         for i in pbar:
parakeet/models/transformerTTS/train_transformer.py

@@ -86,17 +86,17 @@ def main(cfg):
             mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)

-            label = np.zeros(stop_preds.shape).astype(np.float32)
-            text_length = text_length.numpy()
-            for i in range(label.shape[0]):
-                label[i][text_length[i] - 1] = 1
+            label = (pos_mel == 0).astype(np.float32)
+            #label = np.zeros(stop_preds.shape).astype(np.float32)
+            #text_length = text_length.numpy()
+            #for i in range(label.shape[0]):
+            #    label[i][text_length[i] - 1] = 1

             mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
             post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
-            stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
+            stop_loss = cross_entropy(stop_preds, label)
             loss = mel_loss + post_mel_loss + stop_loss

             if local_rank == 0:
                 writer.add_scalars('training_loss', {
                     'mel_loss': mel_loss.numpy(),
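The stop-token labeling changes from a single 1 at the last valid frame to a 1 at every padded position, read directly off pos_mel (padding frames carry position 0). A small numpy sketch of the two labelings on a toy batch; shapes and values are illustrative assumptions, not data from this repository:

import numpy as np

# Toy positional indices for 2 mel sequences padded to length 6;
# zeros mark padding frames.
pos_mel = np.array([[1, 2, 3, 4, 0, 0],
                    [1, 2, 3, 4, 5, 6]])

# New labeling from this commit: stop targets are 1 at every padded frame.
label_new = (pos_mel == 0).astype(np.float32)

# Old labeling (roughly what the removed lines did): a single 1 at the
# last frame of each sequence.
lengths = (pos_mel != 0).sum(axis=1)
label_old = np.zeros_like(label_new)
for i, n in enumerate(lengths):
    label_old[i, n - 1] = 1

print(label_new)
print(label_old)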
parakeet/modules/multihead_attention.py

@@ -105,7 +105,6 @@ class MultiheadAttention(dg.Layer):
         # concat all multihead result
         result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
         result = layers.reshape(layers.transpose(result, [1, 2, 0, 3]),[batch_size, seq_len_query, -1])
         result = layers.concat([query_input, result], axis=-1)
         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
parakeet/modules/utils.py

@@ -65,9 +65,10 @@ def guided_attention(N, T, g=0.2):
     return W

-def cross_entropy(input, label, position_weight=5.0, epsilon=0.0001):
-    input = -1 * label * layers.log(input + epsilon) - (1 - label) * layers.log(1 - input + epsilon)
-    label = input * (label * (position_weight - 1) + 1)
-    return layers.reduce_sum(label, dim=[0, 1])
+def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
+    output = -1 * label * layers.log(input + epsilon) - (1 - label) * layers.log(1 - input + epsilon)
+    output = output * (label * (position_weight - 1) + 1)
+    return layers.reduce_sum(output, dim=[0, 1])
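The old cross_entropy overwrote input and label mid-computation, so the position weighting multiplied the loss by the loss itself; the new version accumulates into output and only scales the positive-label positions. A numpy sketch of the same weighted binary cross-entropy, assuming the first argument holds probabilities in (0, 1); the toy arrays are illustrative only:

import numpy as np

def weighted_bce(prob, label, position_weight=1.0, epsilon=1e-30):
    # Element-wise binary cross-entropy on probabilities.
    out = -label * np.log(prob + epsilon) - (1 - label) * np.log(1 - prob + epsilon)
    # Scale the loss at positive-label positions by position_weight.
    out = out * (label * (position_weight - 1) + 1)
    # Sum over batch and time, roughly matching layers.reduce_sum(output, dim=[0, 1]).
    return out.sum(axis=(0, 1))

prob = np.array([[0.9, 0.2, 0.1]])
label = np.array([[1.0, 0.0, 0.0]])
print(weighted_bce(prob, label))        # plain BCE sum
print(weighted_bce(prob, label, 5.0))   # positive frame counted 5x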