Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
8a9bbc26
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
11
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8a9bbc26
编写于
12月 16, 2019
作者:
L
lifuchen
提交者:
chenfeiyu
12月 16, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add_TransformerTTS
上级
fd9e198a
变更
14
展开全部
隐藏空白更改
内联
并排
Showing
14 changed file
with
1593 addition
and
2 deletion
+1593
-2
parakeet/data/batch.py
parakeet/data/batch.py
+1
-1
parakeet/models/transformerTTS/config/synthesis.yaml
parakeet/models/transformerTTS/config/synthesis.yaml
+20
-0
parakeet/models/transformerTTS/config/train_postnet.yaml
parakeet/models/transformerTTS/config/train_postnet.yaml
+27
-0
parakeet/models/transformerTTS/config/train_transformer.yaml
parakeet/models/transformerTTS/config/train_transformer.yaml
+32
-0
parakeet/models/transformerTTS/layers.py
parakeet/models/transformerTTS/layers.py
+170
-0
parakeet/models/transformerTTS/module.py
parakeet/models/transformerTTS/module.py
+525
-0
parakeet/models/transformerTTS/network.py
parakeet/models/transformerTTS/network.py
+207
-0
parakeet/models/transformerTTS/parse.py
parakeet/models/transformerTTS/parse.py
+63
-0
parakeet/models/transformerTTS/preprocess.py
parakeet/models/transformerTTS/preprocess.py
+137
-0
parakeet/models/transformerTTS/synthesis.py
parakeet/models/transformerTTS/synthesis.py
+67
-0
parakeet/models/transformerTTS/train_postnet.py
parakeet/models/transformerTTS/train_postnet.py
+135
-0
parakeet/models/transformerTTS/train_transformer.py
parakeet/models/transformerTTS/train_transformer.py
+166
-0
parakeet/models/transformerTTS/utils.py
parakeet/models/transformerTTS/utils.py
+42
-0
tests/test_ljspeech.py
tests/test_ljspeech.py
+1
-1
未找到文件。
parakeet/data/batch.py
浏览文件 @
8a9bbc26
...
...
@@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel
=
False
lengths
=
[
example
.
shape
[
-
1
]
for
example
in
minibatch
]
# assume (channel, F, n_frame) or (F, n_frame)
max_len
=
np
.
max
(
lengths
)
max_len
=
np
.
max
(
lengths
)
batch
=
[]
for
example
in
minibatch
:
...
...
parakeet/models/transformerTTS/config/synthesis.yaml
0 → 100644
浏览文件 @
8a9bbc26
# Configuration used by synthesis.py.
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275    # equals int(22050 * 0.0125) samples
  win_length: 1102   # equals int(22050 * 0.05) samples
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1

# Number of decoding iterations run by synthesis.py.
max_len: 50

# Global steps of the checkpoints to restore.
# NOTE(review): 1 looks like a placeholder value - confirm before use.
transformer_step: 1
postnet_step: 1

use_gpu: True
checkpoint_path: ./checkpoint
log_dir: ./log
sample_path: ./sample
\ No newline at end of file
parakeet/models/transformerTTS/config/train_postnet.yaml
0 → 100644
浏览文件 @
8a9bbc26
# Configuration used by train_postnet.py.
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275    # equals int(22050 * 0.0125) samples
  win_length: 1102   # equals int(22050 * 0.05) samples
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1

network:
  hidden_size: 256
  embedding_size: 512

batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500     # a checkpoint is written every save_step global steps
use_gpu: True
use_data_parallel: False

data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
\ No newline at end of file
parakeet/models/transformerTTS/config/train_transformer.yaml
0 → 100644
浏览文件 @
8a9bbc26
# Configuration used by train_transformer.py.
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275    # equals int(22050 * 0.0125) samples
  win_length: 1102   # equals int(22050 * 0.05) samples
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1

network:
  hidden_size: 256
  embedding_size: 512

batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500     # checkpointing interval (global steps)
image_step: 2000   # attention image interval (global steps)
use_gpu: True
use_data_parallel: False

data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
\ No newline at end of file
parakeet/models/transformerTTS/layers.py
0 → 100644
浏览文件 @
8a9bbc26
import
math
import
numpy
as
np
import
paddle
from
paddle
import
fluid
import
paddle.fluid.dygraph
as
dg
class Conv1D(dg.Layer):
    """
    A 1D convolution block implemented with Conv2D.

    The time axis is convolved by inserting a dummy height axis of size 1 and
    running a ``(1, filter_size)`` 2D convolution. For simplicity, and to keep
    the output the same length as the input, the block is meant to be used
    with ``stride == 1`` (this is not enforced).
    """

    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 padding=0,
                 dilation=1,
                 stride=1,
                 groups=None,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 data_format='NCT',
                 dtype="float32"):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            in_channels (int): number of input channels. Only stored on the
                instance; it is not forwarded to the wrapped ``Conv2D``.
            num_filters (int): number of output channels.
            filter_size (int): kernel width along the time axis.
            padding (int): padding applied along the time axis.
            dilation (int): dilation along the time axis.
            stride (int): stride along the time axis.
            groups, param_attr, bias_attr, use_cudnn, act, dtype: forwarded
                unchanged to ``dg.Conv2D``.
            data_format (str): ``'NCT'`` or ``'NTC'``; with ``'NTC'`` the input
                is transposed before and after the convolution.
        """
        super(Conv1D, self).__init__(name_scope, dtype=dtype)

        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.act = act
        self.data_format = data_format

        # Every 1D hyper-parameter is lifted to 2D with a neutral value on the
        # dummy height axis.
        self.conv = dg.Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, stride),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input, laid out as (B, C_in, T) when
                ``data_format`` is ``'NCT'`` or (B, T, C_in) when it is
                ``'NTC'``.
        Returns:
            x (Variable): 3-D output in the same layout as the input, with
                C_out (``num_filters``) channels.
        """
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        # (B, C, T) -> (B, C, 1, T) so that Conv2D can be applied.
        x = fluid.layers.unsqueeze(x, [2])
        x = self.conv(x)
        x = fluid.layers.squeeze(x, [2])
        if self.data_format == 'NTC':
            x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class Pool1D(dg.Layer):
    """
    A 1D pooling block implemented with Pool2D.

    Pooling runs along the time axis by inserting a dummy height axis of
    size 1 and applying a ``[1, pool_size]`` 2D pooling window.
    """

    def __init__(self,
                 name_scope,
                 pool_size=-1,
                 pool_type='max',
                 pool_stride=1,
                 pool_padding=0,
                 global_pooling=False,
                 use_cudnn=True,
                 ceil_mode=False,
                 exclusive=True,
                 data_format='NCT',
                 dtype='float32'):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            pool_size (int): window size along the time axis.
            pool_type (str): pooling type forwarded to ``dg.Pool2D``.
            pool_stride (int): stride along the time axis.
            pool_padding (int): padding along the time axis.
            global_pooling, use_cudnn, ceil_mode, exclusive, dtype: forwarded
                unchanged to ``dg.Pool2D``.
            data_format (str): ``'NCT'`` or ``'NTC'``.
        """
        super(Pool1D, self).__init__(name_scope, dtype=dtype)

        # Keep the 1D hyper-parameters on the instance.
        self.pool_size = pool_size
        self.pool_type = pool_type
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.global_pooling = global_pooling
        self.use_cudnn = use_cudnn
        self.ceil_mode = ceil_mode
        self.exclusive = exclusive
        self.data_format = data_format
        self.dtype = dtype

        # Lift window / stride / padding to 2D with neutral height values.
        window_2d = [1, pool_size]
        stride_2d = [1, pool_stride]
        padding_2d = [0, pool_padding]
        self.pool2d = dg.Pool2D(
            self.full_name(),
            window_2d,
            pool_type=pool_type,
            pool_stride=stride_2d,
            pool_padding=padding_2d,
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): 3-D input, laid out as (B, C, T) when
                ``data_format`` is ``'NCT'`` or (B, T, C) when it is ``'NTC'``.
        Returns:
            x (Variable): 3-D pooled output in the same layout as the input.
        """
        time_major = self.data_format == 'NTC'
        swap_time_and_channel = [0, 2, 1]

        if time_major:
            x = fluid.layers.transpose(x, swap_time_and_channel)
        # (B, C, T) -> (B, C, 1, T) -> pool -> (B, C, T')
        pooled = self.pool2d(fluid.layers.unsqueeze(x, [2]))
        x = fluid.layers.squeeze(pooled, [2])
        if time_major:
            x = fluid.layers.transpose(x, swap_time_and_channel)
        return x
class DynamicGRU(dg.Layer):
    """
    A GRU over a whole sequence, unrolled step by step with ``dg.GRUUnit``.
    """

    def __init__(self,
                 scope_name,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        """
        Args:
            scope_name (str): name scope passed to ``dg.Layer``.
            size (int): hidden size; the wrapped ``GRUUnit`` is created with
                ``size * 3``.
            param_attr, bias_attr, gate_activation, origin_mode: forwarded to
                ``dg.GRUUnit``.
            candidate_activation (str): forwarded as ``activation``.
            is_reverse (bool): process the sequence from last step to first.
            h_0 (Variable): initial hidden state handed to the first step.
            init_size: accepted but not used by this class.
        """
        super(DynamicGRU, self).__init__(scope_name)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse
        self.gru_unit = dg.GRUUnit(
            self.full_name(),
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)

    def forward(self, inputs):
        """
        Args:
            inputs (Variable): 3-D input indexed as (batch, time, feature).
        Returns:
            Variable: the hidden state of every step, concatenated along
                axis 1 in the original (forward) time order.
        """
        num_steps = inputs.shape[1]
        if self.is_reverse:
            step_order = range(num_steps - 1, -1, -1)
        else:
            step_order = range(num_steps)

        state = self.h_0
        outputs = []
        for t in step_order:
            frame = inputs[:, t:t + 1, :]
            # Drop the singleton time axis: (B, 1, F) -> (B, F).
            frame = fluid.layers.reshape(
                frame, [-1, frame.shape[2]], inplace=False)
            state, _, _ = self.gru_unit(frame, state)
            # Restore a time axis so the steps can be concatenated later.
            outputs.append(
                fluid.layers.reshape(
                    state, [-1, 1, state.shape[1]], inplace=False))

        if self.is_reverse:
            # Steps were produced last-to-first; put them back in time order.
            outputs.reverse()
        return fluid.layers.concat(outputs, axis=1)
parakeet/models/transformerTTS/module.py
0 → 100644
浏览文件 @
8a9bbc26
此差异已折叠。
点击以展开。
parakeet/models/transformerTTS/network.py
0 → 100644
浏览文件 @
8a9bbc26
from
module
import
*
from
utils
import
get_positional_table
,
get_sinusoid_encoding_table
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
class Encoder(dg.Layer):
    """
    Text encoder: a pre-net, scaled positional embedding, and three blocks of
    self-attention followed by a feed-forward network.
    """

    def __init__(self, name_scope, embedding_size, num_hidden, config):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            embedding_size (int): forwarded to ``EncoderPrenet``.
            num_hidden (int): hidden size used by every sublayer.
            config: parsed configuration; only ``config.use_gpu`` is read here.
        """
        super(Encoder, self).__init__(name_scope)
        self.num_hidden = num_hidden
        # Learnable scalar, initialised to 1.0, that scales the positional
        # embedding before it is added to the pre-net output.
        param = fluid.ParamAttr(name='alpha')
        self.alpha = self.create_parameter(
            param,
            shape=(1, ),
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        # Fixed (non-trainable) positional table with 1024 rows; index 0 is
        # the padding index.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            name_scope=self.full_name(),
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                name='weight',
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.encoder_prenet = EncoderPrenet(
            name_scope=self.full_name(),
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=config.use_gpu)
        # Layers kept in plain Python lists are registered explicitly with
        # add_sublayer.
        self.layers = [
            MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)
        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [
            FFN(self.full_name(), num_hidden, use_cudnn=config.use_gpu)
            for _ in range(3)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)

    def forward(self, x, positional):
        """
        Args:
            x (Variable): encoder input, passed to the pre-net.
            positional (Variable): position indices; 0 marks padding.
        Returns:
            tuple: ``(x, query_mask, attentions)`` - the encoded sequence, the
                non-padding mask (``None`` outside training mode) and the list
                of attention outputs, one per self-attention layer.
        """
        # Masks are only built in training mode; in eval mode both are None.
        if fluid.framework._dygraph_tracer()._train_mode:
            # 1.0 at real positions, 0.0 at padding.
            query_mask = (positional != 0).astype(float)
            mask = (positional != 0).astype(float)
            # Repeat the mask x.shape[1] times along a new axis 1.
            mask = fluid.layers.expand(
                fluid.layers.unsqueeze(mask, [1]), [1, x.shape[1], 1])
        else:
            query_mask, mask = None, None

        # Encoder pre-network, output is (N, T, C)
        x = self.encoder_prenet(x)

        # Look up the positional encoding and add it, scaled by alpha.
        positional = self.pos_emb(
            fluid.layers.unsqueeze(
                positional, axes=[-1]))
        x = positional * self.alpha + x  # (N, T, C)

        # Positional dropout.
        # NOTE(review): ``layers`` is not imported in this file's visible
        # imports; presumably it arrives via ``from module import *`` - confirm.
        x = layers.dropout(x, 0.1)

        # Self-attention encoder stack.
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
class Decoder(dg.Layer):
    """
    Mel decoder: a pre-net, scaled positional embedding, three blocks of
    (masked self-attention, encoder-decoder attention, feed-forward), then
    linear projections to mel frames and stop tokens and a post conv-net.
    """

    def __init__(self, name_scope, num_hidden, config):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            num_hidden (int): hidden size used by every sublayer.
            config: parsed configuration; ``config.audio.num_mels`` and
                ``config.audio.outputs_per_step`` are read here and the whole
                object is handed to ``PostConvNet``.
        """
        super(Decoder, self).__init__(name_scope)
        self.num_hidden = num_hidden
        # Learnable scalar, initialised to 1.0, that scales the positional
        # embedding.
        param = fluid.ParamAttr(name='alpha')
        self.alpha = self.create_parameter(
            param,
            shape=(1, ),
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        # Fixed (non-trainable) positional table with 1024 rows; index 0 is
        # the padding index.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            name_scope=self.full_name(),
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                name='weight',
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.decoder_prenet = DecoderPrenet(
            self.full_name(),
            input_size=config.audio.num_mels,
            hidden_size=num_hidden * 2,
            output_size=num_hidden,
            dropout_rate=0.2)
        self.linear = FC(self.full_name(), num_hidden, num_hidden)

        # Layers kept in plain Python lists are registered explicitly with
        # add_sublayer.
        self.selfattn_layers = [
            MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)
        ]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [
            MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)
        ]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        # NOTE(review): unlike the Encoder, these FFNs are not given
        # ``use_cudnn=config.use_gpu`` - confirm this is intended.
        self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        self.mel_linear = FC(
            self.full_name(), num_hidden,
            config.audio.num_mels * config.audio.outputs_per_step)
        self.stop_linear = FC(self.full_name(), num_hidden, 1, gain=1)

        self.postconvnet = PostConvNet(self.full_name(), config)

    def forward(self, key, value, query, c_mask, positional):
        """
        Args:
            key (Variable): encoder output used as attention key.
            value (Variable): encoder output used as attention value.
            query (Variable): decoder input, passed to the pre-net.
            c_mask (Variable): encoder non-padding mask; only read in
                training mode.
            positional (Variable): decoder position indices; 0 marks padding.
        Returns:
            tuple: ``(mel_out, out, attn_list, stop_tokens, selfattn_list)`` -
                the mel projection, the mel projection plus the post-net
                residual, the encoder-decoder attention outputs, the stop
                token projection and the self-attention outputs.
        """
        batch_size = key.shape[0]
        decoder_len = query.shape[1]

        # Build the decoder masks. Both branches use an upper-triangular
        # matrix (diagonal offset 1) so a step cannot attend to later steps.
        if fluid.framework._dygraph_tracer()._train_mode:
            # 1.0 at real decoder positions, 0.0 at padding.
            m_mask = (positional != 0).astype(float)
            # Padding flags repeated for every query step ...
            mask = np.repeat(
                np.expand_dims(
                    m_mask.numpy() == 0, axis=1),
                decoder_len,
                axis=1)
            # ... plus the future-step triangle; any non-zero entry is blocked.
            mask = mask + np.repeat(
                np.expand_dims(
                    np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0),
                batch_size,
                axis=0)
            # 1.0 where attention is allowed, 0.0 elsewhere.
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)

            # Encoder mask expanded to (batch_size, seq_len, decoder_len) ...
            zero_mask = fluid.layers.expand(
                fluid.layers.unsqueeze(
                    (c_mask != 0).astype(float), axes=2),
                [1, 1, decoder_len])
            # ... then transposed to (batch_size, decoder_len, seq_len).
            zero_mask = fluid.layers.transpose(zero_mask, [0, 2, 1])
        else:
            # Outside training mode only the future-step mask is applied.
            mask = np.repeat(
                np.expand_dims(
                    np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0),
                batch_size,
                axis=0)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)

        # Look up the positional embedding and add it, scaled by alpha.
        positional = self.pos_emb(
            fluid.layers.unsqueeze(
                positional, axes=[-1]))
        query = positional * self.alpha + query

        # Positional dropout
        query = fluid.layers.dropout(query, 0.1)

        # Attention stack: decoder self-attention, then encoder-decoder
        # attention, then the feed-forward network.
        selfattn_list = list()
        attn_list = list()
        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
                                       self.ffns):
            query, attn_dec = selfattn(
                query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = attn(
                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)

        # Mel linear projection
        mel_out = self.mel_linear(query)

        # Post mel network: it runs on the transposed projection and its
        # output is added back as a residual.
        postnet_input = layers.transpose(mel_out, [0, 2, 1])
        out = self.postconvnet(postnet_input)
        out = postnet_input + out
        out = layers.transpose(out, [0, 2, 1])

        # Stop tokens
        stop_tokens = self.stop_linear(query)

        return mel_out, out, attn_list, stop_tokens, selfattn_list
class Model(dg.Layer):
    """
    The full text-to-mel network: an ``Encoder`` followed by a ``Decoder``.
    """

    def __init__(self, name_scope, config):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            config: parsed configuration; ``config.network.embedding_size``
                and ``config.network.hidden_size`` size the sublayers.
        """
        super(Model, self).__init__(name_scope)
        self.encoder = Encoder(self.full_name(), config.network.embedding_size,
                               config.network.hidden_size, config)
        self.decoder = Decoder(self.full_name(), config.network.hidden_size,
                               config)
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        """
        Args:
            characters (Variable): encoder input.
            mel_input (Variable): decoder input.
            pos_text (Variable): position indices for the encoder input.
            pos_mel (Variable): position indices for the decoder input.
        Returns:
            tuple: ``(mel_output, postnet_output, attn_probs, stop_preds,
                attns_enc, attns_dec)``.
        """
        # key (batch_size, seq_len, channel)
        # c_mask (batch_size, seq_len)
        # attns_enc (channel / 2, seq_len, seq_len)
        key, c_mask, attns_enc = self.encoder(characters, pos_text)

        # mel_output/postnet_output (batch_size, mel_len, n_mel)
        # attn_probs (128, mel_len, seq_len)
        # stop_preds (batch_size, mel_len, 1)
        # attns_dec (128, mel_len, mel_len)
        # The encoder output serves as both attention key and value.
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
            key, key, mel_input, c_mask, pos_mel)

        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
class ModelPostNet(dg.Layer):
    """
    CBHG Network (mel -> linear)

    A convolutional pre-projection, a CBHG module and a convolutional
    post-projection that outputs ``n_fft // 2 + 1`` channels.
    """

    def __init__(self, name_scope, config):
        """
        Args:
            name_scope (str): name scope passed to ``dg.Layer``.
            config: parsed configuration; ``config.audio.num_mels``,
                ``config.audio.n_fft`` and ``config.network.hidden_size`` are
                read here and the whole object is handed to ``CBHG``.
        """
        super(ModelPostNet, self).__init__(name_scope)
        self.pre_proj = Conv(
            self.full_name(),
            in_channels=config.audio.num_mels,
            out_channels=config.network.hidden_size,
            data_format="NCT")
        self.cbhg = CBHG(self.full_name(), config)
        # NOTE(review): in_channels is num_mels here although this layer
        # consumes the CBHG output; presumably it should match the CBHG output
        # width - verify against Conv and CBHG in module.py.
        self.post_proj = Conv(
            self.full_name(),
            in_channels=config.audio.num_mels,
            out_channels=(config.audio.n_fft // 2) + 1,
            data_format="NCT")

    def forward(self, mel):
        """
        Args:
            mel (Variable): 3-D mel input; axes 1 and 2 are swapped before the
                projections, which are built with ``data_format="NCT"``.
        Returns:
            mag_pred (Variable): predicted magnitudes, transposed back to the
                layout of the input.
        """
        mel = layers.transpose(mel, [0, 2, 1])
        mel = self.pre_proj(mel)
        mel = self.cbhg(mel)
        mag_pred = self.post_proj(mel)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
        return mag_pred
parakeet/models/transformerTTS/parse.py
0 → 100644
浏览文件 @
8a9bbc26
import
jsonargparse
def add_config_options_to_parser(parser):
    """Register every TransformerTTS option on ``parser``.

    Dotted names (``--audio.*``, ``--network.*``) become nested namespaces in
    the parsed configuration. A ``-c/--config`` option is added last so that
    values can be loaded from a configuration file.

    Args:
        parser: the ``jsonargparse.ArgumentParser`` to extend in place.
    """
    # Audio processing options.
    parser.add_argument('--audio.num_mels', type=int, default=80,
        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
        help="the preemphasis coefficient.")
    # hop_length and win_length are sample counts, so they are parsed as
    # integers (they used to be parsed as floats).
    parser.add_argument('--audio.hop_length', type=int, default=128,
        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
        help="the outputs per step.")

    # Network size options.
    parser.add_argument('--network.hidden_size', type=int, default=256,
        help="the hidden size in network.")
    parser.add_argument('--network.embedding_size', type=int, default=512,
        help="the embedding vector size.")

    # Training options.
    parser.add_argument('--batch_size', type=int, default=32,
        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
        help="checkpointing interval during training.")
    parser.add_argument('--image_step', type=int, default=2000,
        help="attention image interval during training.")

    # Synthesis options.
    parser.add_argument('--max_len', type=int, default=400,
        help="The max length of audio when synthsis.")
    parser.add_argument('--transformer_step', type=int, default=160000,
        help="Global step to restore checkpoint of transformer in synthesis.")
    parser.add_argument('--postnet_step', type=int, default=100000,
        help="Global step to restore checkpoint of postnet in synthesis.")

    # Device options.
    parser.add_argument('--use_gpu', type=bool, default=True,
        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
        help="use data parallel or not during training.")

    # Paths.
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
        help="the directory to save tensorboard log.")
    # NOTE(review): this default is the same directory as --log_dir; confirm
    # whether a dedicated sample directory was intended.
    parser.add_argument('--sample_path', type=str, default='./log',
        help="the directory to save audio sample in synthesis.")

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
parakeet/models/transformerTTS/preprocess.py
0 → 100644
浏览文件 @
8a9bbc26
from
pathlib
import
Path
import
numpy
as
np
import
pandas
as
pd
import
librosa
from
parakeet
import
g2p
from
parakeet
import
audio
from
parakeet.data.sampler
import
SequentialSampler
,
RandomSampler
,
BatchSampler
from
parakeet.data.dataset
import
Dataset
from
parakeet.data.datacargo
import
DataCargo
from
parakeet.data.batch
import
TextIDBatcher
,
SpecBatcher
# Module-level audio processor shared by the dataset below and by
# synthesis.py. Its settings are hard-coded here rather than read from the
# parsed configuration.
_ljspeech_processor = audio.AudioProcessor(
    sample_rate=22050,
    num_mels=80,
    min_level_db=-100,
    ref_level_db=20,
    n_fft=2048,
    win_length=int(22050 * 0.05),  # 1102 samples
    hop_length=int(22050 * 0.0125),  # 275 samples
    power=1.2,
    preemphasis=0.97,
    signal_norm=True,
    symmetric_norm=False,
    max_norm=1.,
    mel_fmin=0,
    mel_fmax=None,
    clip_norm=True,
    griffin_lim_iters=60,
    do_trim_silence=False,
    sound_norm=False)
class LJSpeech(Dataset):
    """Dataset that reads ``metadata.csv`` and the ``wavs`` directory under a
    root folder and yields ``(mag, mel, phonemes)`` examples."""

    def __init__(self, root):
        """
        Args:
            root (str or Path): dataset root directory.
        """
        super(LJSpeech, self).__init__()
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        if not isinstance(root, Path):
            root = Path(root)
        self.root = root
        self.metadata = self._prepare_metadata()

    def _prepare_metadata(self):
        """Load ``metadata.csv`` into a three-column DataFrame."""
        # The file is pipe-separated, has no header row, and is read with
        # quoting mode 3.
        return pd.read_csv(
            self.root / "metadata.csv",
            sep="|",
            header=None,
            quoting=3,
            names=["fname", "raw_text", "normalized_text"])

    def _get_example(self, metadatum):
        """Build one example from a metadata row.

        Override this method to use a different preprocessing pipeline. If
        that pipeline needs several processors with many options, consider
        composing them into one transform handed to the constructor.

        Returns:
            tuple: ``(mag, mel, phonemes)``; this may become a mapping in the
                future.
        """
        fname, raw_text, normalized_text = metadatum
        wav_file = self.root / "wavs" / (fname + ".wav")

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        waveform = _ljspeech_processor.load_wav(str(wav_file))
        magnitude = _ljspeech_processor.spectrogram(waveform).astype(np.float32)
        mel_spec = _ljspeech_processor.melspectrogram(waveform).astype(np.float32)
        phoneme_ids = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        return (magnitude, mel_spec, phoneme_ids)

    def _batch_examples(self, minibatch):
        """Collate examples into padded ``(mag, mel, phoneme)`` batches."""
        mags, mels, phoneme_seqs = [], [], []
        for mag, mel, phoneme in minibatch:
            mags.append(mag)
            mels.append(mel)
            phoneme_seqs.append(phoneme)

        # Spectrograms are padded with 0.0 and phoneme ids with id 0.
        padded_mags = SpecBatcher(pad_value=0.)(mags)
        padded_mels = SpecBatcher(pad_value=0.)(mels)
        padded_phonemes = TextIDBatcher(pad_id=0)(phoneme_seqs)
        return (padded_mags, padded_mels, padded_phonemes)

    def __getitem__(self, index):
        """Return the example built from metadata row ``index``."""
        return self._get_example(self.metadata.iloc[index])

    def __iter__(self):
        """Yield every example in metadata order."""
        yield from (self[idx] for idx in range(len(self)))

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)
def batch_examples(batch):
    """Collate ``(mag, mel, text)`` examples for training the transformer.

    The magnitude spectrogram of each example is ignored. Examples are
    reordered by text length, longest first, and every sequence is padded to
    the longest one in the batch.

    Args:
        batch: iterable of ``(mag, mel, text)`` triples; ``mel`` is indexed as
            (n_mels, n_frames).
    Returns:
        tuple: ``(texts, mels, mel_inputs, pos_texts, pos_mels, text_lens)``;
            ``mels`` and ``mel_inputs`` are transposed with axes (0, 2, 1)
            after padding.
    """
    texts, mels, mel_inputs = [], [], []
    text_lens, pos_texts, pos_mels = [], [], []

    for _, mel, text in batch:
        # Decoder input: the mel shifted right by one frame, starting with an
        # all-zero frame.
        start_frame = np.zeros([mel.shape[0], 1], np.float32)
        mel_inputs.append(
            np.concatenate([start_frame, mel[:, :-1]], axis=-1))
        text_lens.append(len(text))
        # Positions start at 1; 0 is reserved for padding.
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)

    # Sort by text_len in descending order. The sort is stable, so one
    # permutation reorders every list consistently.
    order = sorted(
        range(len(text_lens)), key=lambda idx: text_lens[idx], reverse=True)

    def reorder(items):
        return [items[idx] for idx in order]

    texts = reorder(texts)
    mels = reorder(mels)
    mel_inputs = reorder(mel_inputs)
    pos_texts = reorder(pos_texts)
    pos_mels = reorder(pos_mels)
    text_lens = reorder(text_lens)

    # Pad every sequence to the largest length in the batch.
    pad_ids = TextIDBatcher(pad_id=0)
    texts = pad_ids(texts)
    pos_texts = pad_ids(pos_texts)
    pos_mels = pad_ids(pos_mels)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))

    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_postnet(batch):
    """Collate ``(mag, mel, text)`` examples for training the post-net.

    The text of each example is ignored.

    Args:
        batch: iterable of ``(mag, mel, text)`` triples.
    Returns:
        tuple: ``(mels, mags)``, each padded with 0.0 and transposed with
            axes (0, 2, 1).
    """
    mel_list, mag_list = [], []
    for mag, mel, _ in batch:
        mel_list.append(mel)
        mag_list.append(mag)

    def pad_and_transpose(specs):
        return np.transpose(SpecBatcher(pad_value=0.)(specs), axes=(0, 2, 1))

    mels = pad_and_transpose(mel_list)
    mags = pad_and_transpose(mag_list)
    return (mels, mags)
parakeet/models/transformerTTS/synthesis.py
0 → 100644
浏览文件 @
8a9bbc26
import
os
from
scipy.io.wavfile
import
write
from
parakeet.g2p.en
import
text_to_sequence
import
numpy
as
np
from
network
import
Model
,
ModelPostNet
from
tqdm
import
tqdm
from
tensorboardX
import
SummaryWriter
import
paddle.fluid
as
fluid
import
paddle.fluid.dygraph
as
dg
from
preprocess
import
_ljspeech_processor
from
pathlib
import
Path
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
def load_checkpoint(step, model_path):
    """Load the model parameters saved under ``model_path`` for ``step``.

    Args:
        step (str): checkpoint name, joined onto ``model_path``.
        model_path (str): directory that holds the checkpoints.
    Returns:
        The model state dict; the optimizer state that ``load_dygraph`` also
        returns is discarded.
    """
    checkpoint_prefix = os.path.join(model_path, step)
    model_state, _optimizer_state = fluid.dygraph.load_dygraph(
        checkpoint_prefix)
    return model_state
def synthesis(text_input, cfg):
    """Synthesize speech for ``text_input`` and save it.

    Restores the transformer and post-net checkpoints, decodes
    ``cfg.max_len`` mel frames autoregressively, converts them to a waveform,
    logs the audio to tensorboard and writes ``test.wav`` under
    ``cfg.sample_path``.

    Args:
        text_input (str): the text to synthesize.
        cfg: parsed configuration (see ``add_config_options_to_parser``).
    """
    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())

    # tensorboard
    # makedirs with exist_ok copes with nested paths and avoids the
    # check-then-create race.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'synthesis')
    writer = SummaryWriter(path)

    try:
        with dg.guard(place):
            model = Model('transtts', cfg)
            model_postnet = ModelPostNet('postnet', cfg)
            model.set_dict(
                load_checkpoint(
                    str(cfg.transformer_step),
                    os.path.join(cfg.checkpoint_path, "transformer")))
            model_postnet.set_dict(
                load_checkpoint(
                    str(cfg.postnet_step),
                    os.path.join(cfg.checkpoint_path, "postnet")))

            # init input
            text = np.asarray(text_to_sequence(text_input))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            # One all-zero start frame. Its width follows the configured
            # number of mel bands rather than a hard-coded 80.
            mel_input = dg.to_variable(
                np.zeros([1, 1, cfg.audio.num_mels])).astype(np.float32)
            # Positions start at 1; 0 is the padding index.
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            model.eval()
            model_postnet.eval()

            # Decode a fixed number of steps; the stop predictions are not
            # used to end early.
            pbar = tqdm(range(cfg.max_len))
            for i in pbar:
                pos_mel = np.arange(1, mel_input.shape[1] + 1)
                pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    text, mel_input, pos_text, pos_mel)
                # Feed the newest predicted frame back as the next input.
                mel_input = fluid.layers.concat(
                    [mel_input, postnet_pred[:, -1:, :]], axis=1)

            mag_pred = model_postnet(postnet_pred)
            # Drop the batch axis and swap the two remaining axes before
            # inverting the spectrogram.
            wav = _ljspeech_processor.inv_spectrogram(
                fluid.layers.transpose(
                    fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
            writer.add_audio(text_input, wav, 0, cfg.audio.sr)
            os.makedirs(cfg.sample_path, exist_ok=True)
            write(
                os.path.join(cfg.sample_path, 'test.wav'), cfg.audio.sr, wav)
    finally:
        # Flush and release the event file even if synthesis fails.
        writer.close()
if __name__ == '__main__':
    # Script entry point: build the parser, load the synthesis configuration
    # and synthesize a fixed demo sentence.
    parser = jsonargparse.ArgumentParser(
        description="Synthesis model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    # NOTE(review): the argument list is hard-coded, so options given on the
    # command line are not read - confirm this is intended.
    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
    synthesis("Transformer model is so fast!", cfg)
\ No newline at end of file
parakeet/models/transformerTTS/train_postnet.py
0 → 100644
浏览文件 @
8a9bbc26
from
network
import
*
from
preprocess
import
batch_examples_postnet
,
LJSpeech
from
tensorboardX
import
SummaryWriter
import
os
from
tqdm
import
tqdm
from
parakeet.data.datacargo
import
DataCargo
from
pathlib
import
Path
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attributes that are not found on the proxy itself are looked up on the
    wrapped model, which is registered as the ``"_layers"`` sublayer.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        """Resolve ``key`` on the proxy first, then on the wrapped model."""
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        wrapped = object.__getattribute__(self, "_sub_layers")["_layers"]
        # Compare by value: identity comparison against a string literal
        # depends on interning and is not reliable.
        if key == "_layers":
            return wrapped
        return getattr(wrapped, key)
def main():
    """Train the post-net (mel -> linear magnitude) model.

    Loads ``./config/train_postnet.yaml``, trains with an L1 loss between the
    predicted and target magnitudes, logs the loss to tensorboard and saves a
    checkpoint every ``cfg.save_step`` global steps.
    """
    parser = jsonargparse.ArgumentParser(
        description="Train postnet model", formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())

    local_rank = dg.parallel.Env().local_rank

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    LJSPEECH_ROOT = Path(cfg.data_path)
    dataset = LJSpeech(LJSPEECH_ROOT)
    dataloader = DataCargo(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=batch_examples_postnet,
        drop_last=True)

    global_step = 0
    # Data parallel runs use this process's device; otherwise GPU 0 or CPU.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # makedirs with exist_ok copes with nested paths and with several
    # data-parallel workers creating the directory at the same time.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'postnet')
    writer = SummaryWriter(path)

    try:
        with dg.guard(place):
            # dataloader
            # Declared feature widths follow the configuration: num_mels mel
            # bands, and n_fft // 2 + 1 magnitude bins as emitted by
            # ModelPostNet.
            num_mels = cfg.audio.num_mels
            num_mag_bins = cfg.audio.n_fft // 2 + 1
            input_fields = {
                'names': ['mel', 'mag'],
                'shapes': [[cfg.batch_size, None, num_mels],
                           [cfg.batch_size, None, num_mag_bins]],
                'dtypes': ['float32', 'float32'],
                'lod_levels': [0, 0]
            }

            inputs = [
                fluid.data(
                    name=name, shape=shape, dtype=dtype, lod_level=lod_level)
                for name, shape, dtype, lod_level in zip(
                    input_fields['names'], input_fields['shapes'],
                    input_fields['dtypes'], input_fields['lod_levels'])
            ]

            reader = fluid.io.DataLoader.from_generator(
                feed_list=inputs,
                capacity=32,
                iterable=True,
                use_double_buffer=True,
                return_list=True)

            model = ModelPostNet('postnet', cfg)

            model.train()
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr**2)), 4000))

            # Optionally resume model and optimizer state.
            if cfg.checkpoint_path is not None:
                model_dict, opti_dict = fluid.dygraph.load_dygraph(
                    cfg.checkpoint_path)
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                print("load checkpoint!!!")

            if cfg.use_data_parallel:
                strategy = dg.parallel.prepare_context()
                model = MyDataParallel(model, strategy)

            for epoch in range(cfg.epochs):
                reader.set_batch_generator(dataloader, place)
                pbar = tqdm(reader())
                for i, data in enumerate(pbar):
                    pbar.set_description('Processing at epoch %d' % epoch)
                    mel, mag = data
                    mag = dg.to_variable(mag.numpy())
                    mel = dg.to_variable(mel.numpy())
                    global_step += 1

                    mag_pred = model(mel)
                    # Mean absolute error between prediction and target.
                    loss = layers.mean(
                        layers.abs(layers.elementwise_sub(mag_pred, mag)))
                    if cfg.use_data_parallel:
                        loss = model.scale_loss(loss)

                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    loss.backward()
                    if cfg.use_data_parallel:
                        model.apply_collective_grads()
                    optimizer.minimize(
                        loss,
                        grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                            1))
                    model.clear_gradients()

                    if global_step % cfg.save_step == 0:
                        os.makedirs(cfg.save_path, exist_ok=True)
                        save_path = os.path.join(cfg.save_path,
                                                 'postnet/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()
# Script entry point: run the training routine only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
\ No newline at end of file
parakeet/models/transformerTTS/train_transformer.py
0 → 100644
浏览文件 @
8a9bbc26
from
preprocess
import
batch_examples
,
LJSpeech
import
os
from
tqdm
import
tqdm
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
from
network
import
*
from
tensorboardX
import
SummaryWriter
from
parakeet.data.datacargo
import
DataCargo
from
pathlib
import
Path
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
matplotlib
import
cm
class MyDataParallel(dg.parallel.DataParallel):
    """
    A data parallel proxy for model.

    Attribute lookups that the wrapper itself cannot satisfy are forwarded
    to the wrapped model (stored under the ``"_layers"`` key of
    ``_sub_layers``), so code such as ``model.encoder`` keeps working after
    the model has been wrapped.
    """

    def __init__(self, layers, strategy):
        super(MyDataParallel, self).__init__(layers, strategy)

    def __getattr__(self, key):
        # __getattr__ is only invoked after normal lookup has failed.
        if key in self.__dict__:
            return object.__getattribute__(self, key)
        wrapped = object.__getattribute__(self, "_sub_layers")["_layers"]
        # Compare by value: ``key is "_layers"`` only worked when both
        # strings happened to be interned, and is a SyntaxWarning on 3.8+.
        if key == "_layers":
            return wrapped
        return getattr(wrapped, key)
def _log_attention_images(writer, tag, attn_list):
    """Log four attention maps from every entry of ``attn_list`` as images.

    For each tensor in ``attn_list`` the slices at indices 0, 16, 32 and 48
    along its first axis are colour-mapped with viridis and written under
    ``tag``; the image step is ``4 * entry_index + slice_index``.
    """
    # NOTE(review): the stride of 16 presumably selects one head per sample
    # from a (batch * heads, ...) tensor -- confirm against the model code.
    for entry_idx, prob in enumerate(attn_list):
        prob_np = prob.numpy()  # convert once per entry, not once per image
        for j in range(4):
            image = np.uint8(cm.viridis(prob_np[j * 16]) * 255)
            writer.add_image(tag, image, entry_idx * 4 + j, dataformats="HWC")


def main():
    """Train the TransformerTTS model on LJSpeech.

    Reads the configuration from ``./config/train_transformer.yaml``, builds
    the data pipeline, model and Adam/Noam optimizer, optionally restores a
    checkpoint, and runs ``cfg.epochs`` epochs of training. The loss is the
    sum of the L1 errors of the decoder output and of the postnet output
    against the target mel spectrogram. Scalars and attention images are
    written to TensorBoard and checkpoints are saved by the rank-0 process
    only.
    """
    parser = jsonargparse.ArgumentParser(
        description="Train TransformerTTS model",
        formatter_class='default_argparse')
    add_config_options_to_parser(parser)
    # NOTE: the config path is hard-coded; real command-line arguments are
    # not consulted here.
    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())

    local_rank = dg.parallel.Env().local_rank

    if local_rank == 0:
        # Print the whole config setting.
        pprint(jsonargparse.namespace_to_dict(cfg))

    ljspeech_root = Path(cfg.data_path)
    dataset = LJSpeech(ljspeech_root)
    dataloader = DataCargo(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=batch_examples,
        drop_last=True)
    global_step = 0

    # Data-parallel runs use the device assigned to this process; otherwise
    # GPU 0 when GPUs are enabled, else the CPU.
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if cfg.use_data_parallel else fluid.CUDAPlace(0)
             if cfg.use_gpu else fluid.CPUPlace())

    # exist_ok avoids the check-then-create race between parallel workers
    # and also creates missing parent directories.
    os.makedirs(cfg.log_dir, exist_ok=True)
    path = os.path.join(cfg.log_dir, 'transformer')

    # Only rank 0 owns a TensorBoard writer; on every other rank it is None,
    # so each use below must be guarded.
    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        if cfg.use_data_parallel:
            strategy = dg.parallel.prepare_context()

        # dataloader
        input_fields = {
            'names': ['character', 'mel', 'mel_input',
                      'pos_text', 'pos_mel', 'text_len'],
            'shapes': [[cfg.batch_size, None],
                       [cfg.batch_size, None, 80],
                       [cfg.batch_size, None, 80],
                       [cfg.batch_size, 1],
                       [cfg.batch_size, 1],
                       [cfg.batch_size, 1]],
            'dtypes': ['float32', 'float32', 'float32',
                       'int64', 'int64', 'int64'],
            'lod_levels': [0, 0, 0, 0, 0, 0]
        }

        inputs = [
            fluid.data(
                name=input_fields['names'][i],
                shape=input_fields['shapes'][i],
                dtype=input_fields['dtypes'][i],
                lod_level=input_fields['lod_levels'][i])
            for i in range(len(input_fields['names']))
        ]

        reader = fluid.io.DataLoader.from_generator(
            feed_list=inputs,
            capacity=32,
            iterable=True,
            use_double_buffer=True,
            return_list=True)

        model = Model('transtts', cfg)
        model.train()

        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr ** 2)), 4000))

        if cfg.checkpoint_path is not None:
            model_dict, opti_dict = fluid.dygraph.load_dygraph(
                cfg.checkpoint_path)
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            print("load checkpoint!!!")

        if cfg.use_data_parallel:
            model = MyDataParallel(model, strategy)

        for epoch in range(cfg.epochs):
            reader.set_batch_generator(dataloader, place)
            pbar = tqdm(reader())
            for data in pbar:
                pbar.set_description('Processing at epoch %d' % epoch)
                (character, mel, mel_input,
                 pos_text, pos_mel, _text_length) = data

                global_step += 1

                (mel_pred, postnet_pred, attn_probs,
                 _stop_preds, attn_enc, attn_dec) = model(
                     character, mel_input, pos_text, pos_mel)

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss

                if cfg.use_data_parallel:
                    loss = model.scale_loss(loss)

                if writer is not None:
                    writer.add_scalars('training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy(),
                    }, global_step)

                    writer.add_scalars('alphas', {
                        'encoder_alpha': model.encoder.alpha.numpy(),
                        'decoder_alpha': model.decoder.alpha.numpy(),
                    }, global_step)

                    writer.add_scalar(
                        'learning_rate',
                        optimizer._learning_rate.step().numpy(),
                        global_step)

                    # "== 1" makes the very first step (global_step == 1)
                    # produce images as well.
                    if global_step % cfg.image_step == 1:
                        # The encoder-decoder attention used to share the
                        # 'Attention_enc_...' tag (and steps) with the
                        # encoder self-attention below; it now has its own.
                        _log_attention_images(
                            writer, 'Attention_%d_0' % global_step,
                            attn_probs)
                        _log_attention_images(
                            writer, 'Attention_enc_%d_0' % global_step,
                            attn_enc)
                        _log_attention_images(
                            writer, 'Attention_dec_%d_0' % global_step,
                            attn_dec)

                loss.backward()
                if cfg.use_data_parallel:
                    model.apply_collective_grads()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % cfg.save_step == 0:
                    os.makedirs(cfg.save_path, exist_ok=True)
                    save_path = os.path.join(
                        cfg.save_path, 'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if writer is not None:
            writer.close()
# Script entry point: run the training routine only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
\ No newline at end of file
parakeet/models/transformerTTS/utils.py
0 → 100644
浏览文件 @
8a9bbc26
import
numpy
as
np
import
librosa
import
os
,
copy
from
scipy
import
signal
def get_positional_table(d_pos_vec, n_position=1024):
    """Build a sinusoidal positional table of shape (n_position, d_pos_vec).

    Row 0 is all zeros. For every position ``pos >= 1`` and column ``i`` the
    angle is ``pos / 10000 ** (2 * i / d_pos_vec)``; even columns hold the
    sine of that angle and odd columns the cosine. Note that the exponent
    uses the column index ``i`` itself (not ``i // 2``), which differs from
    ``get_sinusoid_encoding_table``.
    """
    rows = []
    for pos in range(n_position):
        if pos == 0:
            # Position 0 gets an all-zero row.
            rows.append(np.zeros(d_pos_vec))
            continue
        rows.append([
            pos / np.power(10000, 2 * i / d_pos_vec)
            for i in range(d_pos_vec)
        ])
    table = np.array(rows)

    # Row 0 is skipped so that it stays zero.
    even_cols = table[1:, 0::2]
    odd_cols = table[1:, 1::2]
    table[1:, 0::2] = np.sin(even_cols)  # dim 2i
    table[1:, 1::2] = np.cos(odd_cols)  # dim 2i+1
    return table
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    """Build a sinusoid position encoding table of shape (n_position, d_hid).

    The angle for position ``pos`` and channel ``j`` is
    ``pos / 10000 ** (2 * (j // 2) / d_hid)``; even channels hold its sine
    and odd channels its cosine. When ``padding_idx`` is given, that row is
    overwritten with zeros.
    """
    # Channels 2k and 2k+1 share a divisor; compute each one once.
    divisors = [
        np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)
    ]
    table = np.array(
        [[pos_i / div for div in divisors] for pos_i in range(n_position)])

    even_cols = table[:, 0::2]
    odd_cols = table[:, 1::2]
    table[:, 0::2] = np.sin(even_cols)  # dim 2i
    table[:, 1::2] = np.cos(odd_cols)  # dim 2i+1

    if padding_idx is None:
        return table

    # zero vector for padding dimension
    table[padding_idx] = 0.
    return table
def guided_attention(N, T, g=0.2):
    """Guided attention. Refer to page 3 on the paper.

    Returns a float32 array ``W`` of shape ``(N, T)`` with
    ``W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))``, i.e. zero on
    the diagonal ``t / T == n / N`` and growing towards 1 away from it.

    Args:
        N: number of rows.
        T: number of columns.
        g: width of the near-diagonal band; must be non-zero when the
            result is non-empty.

    Raises:
        ZeroDivisionError: if ``2 * g * g`` is zero and the result is
            non-empty.
    """
    W = np.zeros((N, T), dtype=np.float32)
    if W.size == 0:
        # Nothing to fill; also avoids dividing by N == 0 or T == 0.
        return W

    denom = 2 * g * g
    if denom == 0:
        # Keep the exception the scalar implementation raised instead of
        # letting NumPy silently produce NaN/inf.
        raise ZeroDivisionError("guided_attention: g must be non-zero")

    # Broadcast a column of row fractions against a row of column fractions
    # instead of looping over all N * T cells in Python.
    n_frac = np.arange(N, dtype=np.float64)[:, None] / float(N)
    t_frac = np.arange(T, dtype=np.float64)[None, :] / float(T)
    W[...] = 1 - np.exp(-(t_frac - n_frac) ** 2 / denom)
    return W
tests/test_ljspeech.py
浏览文件 @
8a9bbc26
...
...
@@ -7,4 +7,4 @@ LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1")
ljspeech
=
LJSpeech
(
LJSPEECH_ROOT
)
ljspeech_cargo
=
DataCargo
(
ljspeech
,
batch_size
=
16
,
shuffle
=
True
)
for
i
,
batch
in
enumerate
(
ljspeech_cargo
):
print
(
i
)
\ No newline at end of file
print
(
i
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录