PaddlePaddle / Parakeet, commit ab0fe8f3

TransformerTTS precision alignment

Authored by lifuchen on Jan 13, 2020; committed by chenfeiyu on Jan 13, 2020.
Parent commit: ae88be34

Showing 14 changed files with 93 additions and 104 deletions.
parakeet/g2p/text/cleaners.py                        +1   -1
parakeet/models/fastspeech/config/fastspeech.yaml    +6   -5
parakeet/models/fastspeech/modules.py                +0   -2
parakeet/models/fastspeech/network.py                +9   -9
parakeet/models/fastspeech/parse.py                  +9   -8
parakeet/models/transformerTTS/module.py             +10  -36
parakeet/models/transformerTTS/network.py            +11  -14
parakeet/models/transformerTTS/synthesis.py          +2   -2
parakeet/models/transformerTTS/train_postnet.py      +0   -2
parakeet/models/transformerTTS/train_transformer.py  +5   -6
parakeet/modules/layers.py                           +19  -0
parakeet/modules/multihead_attention.py              +8   -6
parakeet/modules/post_convnet.py                     +10  -11
parakeet/modules/prenet.py                           +3   -2
parakeet/g2p/text/cleaners.py
@@ -89,7 +89,7 @@ def transliteration_cleaners(text):
 def english_cleaners(text):
     '''Pipeline for English text, including number and abbreviation expansion.'''
     text = convert_to_ascii(text)
-    text = add_punctuation(text)
+    # text = add_punctuation(text)
     text = lowercase(text)
     text = expand_numbers(text)
     text = expand_abbreviations(text)
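The only functional change here is that english_cleaners no longer inserts punctuation. A minimal sketch of the resulting pipeline order, with hypothetical stand-ins for the helpers this module imports (the real convert_to_ascii, expand_numbers, and expand_abbreviations are more thorough):

import re

# Hypothetical stand-ins for the cleaner helpers, only to show the order of operations.
def lowercase(text):
    return text.lower()

def expand_numbers(text):
    return re.sub(r"\d+", "<number>", text)  # the real helper spells numbers out

def expand_abbreviations(text):
    return text.replace("mrs.", "misess")

def english_cleaners(text):
    # add_punctuation(text) is commented out by this commit
    text = lowercase(text)
    text = expand_numbers(text)
    return expand_abbreviations(text)

print(english_cleaners("Mrs. Smith bought 3 apples"))  # misess smith bought <number> apples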
parakeet/models/fastspeech/config/fastspeech.yaml
@@ -14,13 +14,11 @@ encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
 max_sep_len: 2048
-encoder_output_size: 384
-embedding_size: 384
+fs_embedding_size: 384
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
-decoder_output_size: 384
-hidden_size: 384
+fs_hidden_size: 384
 duration_predictor_output_size: 256
 duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
@@ -28,6 +26,9 @@ fft_conv1d_padding: 1
 dropout: 0.1
 transformer_head: 4
+embedding_size: 512
+hidden_size: 256
 warm_up_step: 4000
 grad_clip_thresh: 0.1
 batch_size: 32
@@ -39,5 +40,5 @@ use_data_parallel: False
 data_path: ../../../dataset/LJSpeech-1.1
 transtts_path: ../transformerTTS/checkpoint
-transformer_step: 20
+transformer_step: 1
 log_dir: ./log
\ No newline at end of file
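The renames give the FastSpeech dimensions an fs_ prefix so they can live alongside the teacher TransformerTTS sizes (embedding_size: 512, hidden_size: 256) in the same file. A small sketch, assuming PyYAML and the keys above, of reading both groups:

import yaml

with open("parakeet/models/fastspeech/config/fastspeech.yaml") as f:
    cfg = yaml.safe_load(f)

# Student (FastSpeech) sizes, renamed in this commit:
print(cfg["fs_embedding_size"], cfg["fs_hidden_size"])  # 384 384
# Teacher (TransformerTTS) sizes, now sharing the config without a name clash:
print(cfg["embedding_size"], cfg["hidden_size"])        # 512 256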
parakeet/models/fastspeech/modules.py
@@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.feed_forward import PositionwiseFeedForward

 class FFTBlock(dg.Layer):
     def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
         super(FFTBlock, self).__init__()
parakeet/models/fastspeech/network.py
浏览文件 @
ab0fe8f3
from
utils
import
*
from
utils
import
*
from
modules
import
*
from
modules
import
FFTBlock
,
LengthRegulator
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
parakeet.g2p.text.symbols
import
symbols
from
parakeet.g2p.text.symbols
import
symbols
...
@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
...
@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
self
.
encoder
=
Encoder
(
n_src_vocab
=
len
(
symbols
)
+
1
,
self
.
encoder
=
Encoder
(
n_src_vocab
=
len
(
symbols
)
+
1
,
len_max_seq
=
cfg
.
max_sep_len
,
len_max_seq
=
cfg
.
max_sep_len
,
d_word_vec
=
cfg
.
embedding_size
,
d_word_vec
=
cfg
.
fs_
embedding_size
,
n_layers
=
cfg
.
encoder_n_layer
,
n_layers
=
cfg
.
encoder_n_layer
,
n_head
=
cfg
.
encoder_head
,
n_head
=
cfg
.
encoder_head
,
d_k
=
64
,
d_k
=
64
,
d_v
=
64
,
d_v
=
64
,
d_model
=
cfg
.
hidden_size
,
d_model
=
cfg
.
fs_
hidden_size
,
d_inner
=
cfg
.
encoder_conv1d_filter_size
,
d_inner
=
cfg
.
encoder_conv1d_filter_size
,
fft_conv1d_kernel
=
cfg
.
fft_conv1d_filter
,
fft_conv1d_kernel
=
cfg
.
fft_conv1d_filter
,
fft_conv1d_padding
=
cfg
.
fft_conv1d_padding
,
fft_conv1d_padding
=
cfg
.
fft_conv1d_padding
,
dropout
=
0.1
)
dropout
=
0.1
)
self
.
length_regulator
=
LengthRegulator
(
input_size
=
cfg
.
hidden_size
,
self
.
length_regulator
=
LengthRegulator
(
input_size
=
cfg
.
fs_
hidden_size
,
out_channels
=
cfg
.
duration_predictor_output_size
,
out_channels
=
cfg
.
duration_predictor_output_size
,
filter_size
=
cfg
.
duration_predictor_filter_size
,
filter_size
=
cfg
.
duration_predictor_filter_size
,
dropout
=
cfg
.
dropout
)
dropout
=
cfg
.
dropout
)
self
.
decoder
=
Decoder
(
len_max_seq
=
cfg
.
max_sep_len
,
self
.
decoder
=
Decoder
(
len_max_seq
=
cfg
.
max_sep_len
,
d_word_vec
=
cfg
.
embedding_size
,
d_word_vec
=
cfg
.
fs_
embedding_size
,
n_layers
=
cfg
.
decoder_n_layer
,
n_layers
=
cfg
.
decoder_n_layer
,
n_head
=
cfg
.
decoder_head
,
n_head
=
cfg
.
decoder_head
,
d_k
=
64
,
d_k
=
64
,
d_v
=
64
,
d_v
=
64
,
d_model
=
cfg
.
hidden_size
,
d_model
=
cfg
.
fs_
hidden_size
,
d_inner
=
cfg
.
decoder_conv1d_filter_size
,
d_inner
=
cfg
.
decoder_conv1d_filter_size
,
fft_conv1d_kernel
=
cfg
.
fft_conv1d_filter
,
fft_conv1d_kernel
=
cfg
.
fft_conv1d_filter
,
fft_conv1d_padding
=
cfg
.
fft_conv1d_padding
,
fft_conv1d_padding
=
cfg
.
fft_conv1d_padding
,
dropout
=
0.1
)
dropout
=
0.1
)
self
.
mel_linear
=
dg
.
Linear
(
cfg
.
decoder_output_size
,
cfg
.
audio
.
num_mels
)
self
.
mel_linear
=
dg
.
Linear
(
cfg
.
fs_hidden_size
,
cfg
.
audio
.
num_mels
*
cfg
.
audio
.
outputs_per_step
)
self
.
postnet
=
PostConvNet
(
n_mels
=
80
,
self
.
postnet
=
PostConvNet
(
n_mels
=
cfg
.
audio
.
num_mels
,
num_hidden
=
512
,
num_hidden
=
512
,
filter_size
=
5
,
filter_size
=
5
,
padding
=
int
(
5
/
2
),
padding
=
int
(
5
/
2
),
num_conv
=
5
,
num_conv
=
5
,
outputs_per_step
=
1
,
outputs_per_step
=
cfg
.
audio
.
outputs_per_step
,
use_cudnn
=
True
,
use_cudnn
=
True
,
dropout
=
0.1
)
dropout
=
0.1
)
...
...
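With d_model now cfg.fs_hidden_size, the mel projection changes accordingly: it maps decoder features to num_mels * outputs_per_step values per step, mirroring the teacher decoder. A numpy shape sketch, with sizes taken from the config defaults above:

import numpy as np

batch, time = 2, 50
fs_hidden_size, num_mels, outputs_per_step = 384, 80, 1

decoder_out = np.zeros((batch, time, fs_hidden_size), dtype="float32")
w = np.zeros((fs_hidden_size, num_mels * outputs_per_step), dtype="float32")
mel_out = decoder_out @ w  # what mel_linear computes, bias aside
print(mel_out.shape)  # (2, 50, 80) == (batch, time, num_mels * outputs_per_step)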
parakeet/models/fastspeech/parse.py
@@ -22,8 +22,8 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--audio.outputs_per_step', type=int, default=1,
         help="the outputs per step.")
-    parser.add_argument('--embedding_size', type=int, default=256,
-        help="the dim size of embedding.")
+    parser.add_argument('--fs_embedding_size', type=int, default=256,
+        help="the dim size of embedding of fastspeech.")
     parser.add_argument('--encoder_n_layer', type=int, default=6,
         help="the number of FFT Block in encoder.")
     parser.add_argument('--encoder_head', type=int, default=2,
@@ -32,18 +32,14 @@ def add_config_options_to_parser(parser):
         help="the filter size of conv1d in encoder.")
     parser.add_argument('--max_sep_len', type=int, default=2048,
         help="the max length of sequence.")
-    parser.add_argument('--encoder_output_size', type=int, default=256,
-        help="the output channel size of encoder.")
     parser.add_argument('--decoder_n_layer', type=int, default=6,
         help="the number of FFT Block in decoder.")
     parser.add_argument('--decoder_head', type=int, default=2,
         help="the attention head number in decoder.")
     parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
         help="the filter size of conv1d in decoder.")
-    parser.add_argument('--decoder_output_size', type=int, default=256,
-        help="the output channel size of decoder.")
-    parser.add_argument('--hidden_size', type=int, default=256,
-        help="the hidden size in model.")
+    parser.add_argument('--fs_hidden_size', type=int, default=256,
+        help="the hidden size in model of fastspeech.")
     parser.add_argument('--duration_predictor_output_size', type=int, default=256,
         help="the output size of duration predictior.")
     parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
@@ -57,6 +53,11 @@ def add_config_options_to_parser(parser):
     parser.add_argument('--transformer_head', type=int, default=4,
         help="the attention head num of transformerTTS.")
+    parser.add_argument('--hidden_size', type=int, default=256,
+        help="the hidden size in model of transformerTTS.")
+    parser.add_argument('--embedding_size', type=int, default=256,
+        help="the dim size of embedding of transformerTTS.")
     parser.add_argument('--warm_up_step', type=int, default=4000,
         help="the warm up step of learning rate.")
     parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
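Since these options are plain argparse flags, the renames change the CLI surface: --embedding_size and --hidden_size now configure the TransformerTTS teacher while the fs_-prefixed flags configure FastSpeech. A self-contained sketch with two of the flags (defaults abridged from the diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--fs_embedding_size', type=int, default=256,
                    help="the dim size of embedding of fastspeech.")
parser.add_argument('--hidden_size', type=int, default=256,
                    help="the hidden size in model of transformerTTS.")

args = parser.parse_args(['--fs_embedding_size', '384'])
print(args.fs_embedding_size, args.hidden_size)  # 384 256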
parakeet/models/transformerTTS/module.py
@@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from parakeet.modules.layers import Conv, Pool1D
+from parakeet.modules.layers import Conv, Pool1D, Linear
 from parakeet.modules.dynamicGRU import DynamicGRU
 import numpy as np

 class EncoderPrenet(dg.Layer):
     def __init__(self, embedding_size, num_hidden, use_cudnn=True):
         super(EncoderPrenet, self).__init__()
         self.embedding_size = embedding_size
         self.num_hidden = num_hidden
         self.use_cudnn = use_cudnn
         self.embedding = dg.Embedding(size=[len(symbols), embedding_size],
-                                      param_attr=fluid.ParamAttr(name='weight'),
                                       padding_idx=None)
         self.conv_list = []
         self.conv_list.append(Conv(in_channels=embedding_size,
@@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             data_layout='NCHW', epsilon=1e-30)
-                                for _ in range(3)]
+                                             bias_attr=fluid.ParamAttr(name='bias'),
+                                             moving_mean_name='moving_mean', moving_variance_name='moving_var',
+                                             data_layout='NCHW')
+                                for _ in range(3)]
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
-        self.projection = dg.Linear(num_hidden, num_hidden)
+        self.projection = Linear(num_hidden, num_hidden)

     def forward(self, x):
         x = self.embedding(x)  #(batch_size, seq_len, embending_size)
@@ -90,10 +84,6 @@ class CBHG(dg.Layer):
         self.batchnorm_list = []
         for i in range(K):
             self.batchnorm_list.append(dg.BatchNorm(hidden_size,
-                                                    param_attr=fluid.ParamAttr(name='weight'),
-                                                    bias_attr=fluid.ParamAttr(name='bias'),
-                                                    moving_mean_name='moving_mean',
-                                                    moving_variance_name='moving_var',
                                                     data_layout='NCHW'))
         for i, layer in enumerate(self.batchnorm_list):
@@ -114,16 +104,8 @@ class CBHG(dg.Layer):
                                  data_format="NCT")
         self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
                                              data_layout='NCHW')
         self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
                                              data_layout='NCHW')
         self.max_pool = Pool1D(pool_size=max_pool_kernel_size,
                                pool_type='max',
@@ -134,32 +116,24 @@ class CBHG(dg.Layer):
         h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
         h_0 = dg.to_variable(h_0)
-        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward1 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=False,
                                        origin_mode=True,
                                        h_0=h_0)
         self.gru_reverse1 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0=h_0)
-        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
-        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
+        self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
         self.gru_forward2 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=False,
                                        origin_mode=True,
                                        h_0=h_0)
         self.gru_reverse2 = DynamicGRU(size=self.hidden_size // 2,
-                                       param_attr=fluid.ParamAttr(name='weight'),
-                                       bias_attr=fluid.ParamAttr(name='bias'),
                                        is_reverse=True,
                                        origin_mode=True,
                                        h_0=h_0)
@@ -216,8 +190,8 @@ class Highwaynet(dg.Layer):
         self.linears = []
         for i in range(num_layers):
-            self.linears.append(dg.Linear(num_units, num_units))
-            self.gates.append(dg.Linear(num_units, num_units))
+            self.linears.append(Linear(num_units, num_units))
+            self.gates.append(Linear(num_units, num_units))
         for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
             self.add_sublayer("linears_{}".format(i), linear)
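Highwaynet's paired linears/gates lists implement the standard highway formula y = g * H(x) + (1 - g) * x. A numpy sketch of one step (the relu/sigmoid choice follows the usual highway network; this hunk only shows layer construction, not the activations):

import numpy as np

def highway_step(x, w_h, b_h, w_g, b_g):
    h = np.maximum(0.0, x @ w_h + b_h)          # relu(linear(x))
    g = 1.0 / (1.0 + np.exp(-(x @ w_g + b_g)))  # sigmoid(gate(x))
    return g * h + (1.0 - g) * x                # gated blend of transform and input

rng = np.random.default_rng(0)
num_units = 4
x = rng.standard_normal((2, num_units))
out = highway_step(x,
                   rng.standard_normal((num_units, num_units)), np.zeros(num_units),
                   rng.standard_normal((num_units, num_units)), np.zeros(num_units))
print(out.shape)  # (2, 4)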
parakeet/models/transformerTTS/network.py
浏览文件 @
ab0fe8f3
from
parakeet.models.transformerTTS.module
import
*
from
parakeet.models.transformerTTS.module
import
*
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
parakeet.modules.layers
import
Conv1D
from
parakeet.modules.layers
import
Conv1D
,
Linear
from
parakeet.modules.utils
import
*
from
parakeet.modules.utils
import
*
from
parakeet.modules.multihead_attention
import
MultiheadAttention
from
parakeet.modules.multihead_attention
import
MultiheadAttention
from
parakeet.modules.feed_forward
import
PositionwiseFeedForward
from
parakeet.modules.feed_forward
import
PositionwiseFeedForward
...
@@ -13,8 +13,7 @@ class Encoder(dg.Layer):
...
@@ -13,8 +13,7 @@ class Encoder(dg.Layer):
def
__init__
(
self
,
embedding_size
,
num_hidden
,
config
):
def
__init__
(
self
,
embedding_size
,
num_hidden
,
config
):
super
(
Encoder
,
self
).
__init__
()
super
(
Encoder
,
self
).
__init__
()
self
.
num_hidden
=
num_hidden
self
.
num_hidden
=
num_hidden
param
=
fluid
.
ParamAttr
(
name
=
'alpha'
,
param
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
1.0
))
initializer
=
fluid
.
initializer
.
Constant
(
value
=
1.0
))
self
.
alpha
=
self
.
create_parameter
(
shape
=
(
1
,
),
attr
=
param
,
dtype
=
'float32'
)
self
.
alpha
=
self
.
create_parameter
(
shape
=
(
1
,
),
attr
=
param
,
dtype
=
'float32'
)
self
.
pos_inp
=
get_sinusoid_encoding_table
(
1024
,
self
.
num_hidden
,
padding_idx
=
0
)
self
.
pos_inp
=
get_sinusoid_encoding_table
(
1024
,
self
.
num_hidden
,
padding_idx
=
0
)
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
...
@@ -39,13 +38,13 @@ class Encoder(dg.Layer):
...
@@ -39,13 +38,13 @@ class Encoder(dg.Layer):
else
:
else
:
query_mask
,
mask
=
None
,
None
query_mask
,
mask
=
None
,
None
# Encoder pre_network
# Encoder pre_network
x
=
self
.
encoder_prenet
(
x
)
#(N,T,C)
x
=
self
.
encoder_prenet
(
x
)
#(N,T,C)
# Get positional encoding
# Get positional encoding
positional
=
self
.
pos_emb
(
positional
)
positional
=
self
.
pos_emb
(
positional
)
x
=
positional
*
self
.
alpha
+
x
#(N, T, C)
x
=
positional
*
self
.
alpha
+
x
#(N, T, C)
...
@@ -65,21 +64,20 @@ class Decoder(dg.Layer):
...
@@ -65,21 +64,20 @@ class Decoder(dg.Layer):
def
__init__
(
self
,
num_hidden
,
config
):
def
__init__
(
self
,
num_hidden
,
config
):
super
(
Decoder
,
self
).
__init__
()
super
(
Decoder
,
self
).
__init__
()
self
.
num_hidden
=
num_hidden
self
.
num_hidden
=
num_hidden
param
=
fluid
.
ParamAttr
(
name
=
'alpha'
)
param
=
fluid
.
ParamAttr
()
self
.
alpha
=
self
.
create_parameter
(
shape
=
(
1
,),
attr
=
param
,
dtype
=
'float32'
,
self
.
alpha
=
self
.
create_parameter
(
shape
=
(
1
,),
attr
=
param
,
dtype
=
'float32'
,
default_initializer
=
fluid
.
initializer
.
ConstantInitializer
(
value
=
1.0
))
default_initializer
=
fluid
.
initializer
.
ConstantInitializer
(
value
=
1.0
))
self
.
pos_inp
=
get_sinusoid_encoding_table
(
1024
,
self
.
num_hidden
,
padding_idx
=
0
)
self
.
pos_inp
=
get_sinusoid_encoding_table
(
1024
,
self
.
num_hidden
,
padding_idx
=
0
)
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
padding_idx
=
0
,
padding_idx
=
0
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'weight'
,
initializer
=
fluid
.
initializer
.
NumpyArrayInitializer
(
self
.
pos_inp
),
initializer
=
fluid
.
initializer
.
NumpyArrayInitializer
(
self
.
pos_inp
),
trainable
=
False
))
trainable
=
False
))
self
.
decoder_prenet
=
PreNet
(
input_size
=
config
.
audio
.
num_mels
,
self
.
decoder_prenet
=
PreNet
(
input_size
=
config
.
audio
.
num_mels
,
hidden_size
=
num_hidden
*
2
,
hidden_size
=
num_hidden
*
2
,
output_size
=
num_hidden
,
output_size
=
num_hidden
,
dropout_rate
=
0.2
)
dropout_rate
=
0.2
)
self
.
linear
=
dg
.
Linear
(
num_hidden
,
num_hidden
)
self
.
linear
=
Linear
(
num_hidden
,
num_hidden
)
self
.
selfattn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
//
4
,
num_hidden
//
4
)
for
_
in
range
(
3
)]
self
.
selfattn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
//
4
,
num_hidden
//
4
)
for
_
in
range
(
3
)]
for
i
,
layer
in
enumerate
(
self
.
selfattn_layers
):
for
i
,
layer
in
enumerate
(
self
.
selfattn_layers
):
...
@@ -90,8 +88,8 @@ class Decoder(dg.Layer):
...
@@ -90,8 +88,8 @@ class Decoder(dg.Layer):
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
)
for
_
in
range
(
3
)]
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
)
for
_
in
range
(
3
)]
for
i
,
layer
in
enumerate
(
self
.
ffns
):
for
i
,
layer
in
enumerate
(
self
.
ffns
):
self
.
add_sublayer
(
"ffns_{}"
.
format
(
i
),
layer
)
self
.
add_sublayer
(
"ffns_{}"
.
format
(
i
),
layer
)
self
.
mel_linear
=
dg
.
Linear
(
num_hidden
,
config
.
audio
.
num_mels
*
config
.
audio
.
outputs_per_step
)
self
.
mel_linear
=
Linear
(
num_hidden
,
config
.
audio
.
num_mels
*
config
.
audio
.
outputs_per_step
)
self
.
stop_linear
=
dg
.
Linear
(
num_hidden
,
1
)
self
.
stop_linear
=
Linear
(
num_hidden
,
1
)
self
.
postconvnet
=
PostConvNet
(
config
.
audio
.
num_mels
,
config
.
hidden_size
,
self
.
postconvnet
=
PostConvNet
(
config
.
audio
.
num_mels
,
config
.
hidden_size
,
filter_size
=
5
,
padding
=
4
,
num_conv
=
5
,
filter_size
=
5
,
padding
=
4
,
num_conv
=
5
,
...
@@ -115,10 +113,10 @@ class Decoder(dg.Layer):
...
@@ -115,10 +113,10 @@ class Decoder(dg.Layer):
mask
=
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
()).
astype
(
np
.
float32
)
mask
=
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
()).
astype
(
np
.
float32
)
mask
=
fluid
.
layers
.
cast
(
dg
.
to_variable
(
mask
==
0
),
np
.
float32
)
mask
=
fluid
.
layers
.
cast
(
dg
.
to_variable
(
mask
==
0
),
np
.
float32
)
m_mask
,
zero_mask
=
None
,
None
m_mask
,
zero_mask
=
None
,
None
# Decoder pre-network
# Decoder pre-network
query
=
self
.
decoder_prenet
(
query
)
query
=
self
.
decoder_prenet
(
query
)
# Centered position
# Centered position
query
=
self
.
linear
(
query
)
query
=
self
.
linear
(
query
)
...
@@ -132,14 +130,13 @@ class Decoder(dg.Layer):
...
@@ -132,14 +130,13 @@ class Decoder(dg.Layer):
# Attention decoder-decoder, encoder-decoder
# Attention decoder-decoder, encoder-decoder
selfattn_list
=
list
()
selfattn_list
=
list
()
attn_list
=
list
()
attn_list
=
list
()
for
selfattn
,
attn
,
ffn
in
zip
(
self
.
selfattn_layers
,
self
.
attn_layers
,
self
.
ffns
):
for
selfattn
,
attn
,
ffn
in
zip
(
self
.
selfattn_layers
,
self
.
attn_layers
,
self
.
ffns
):
query
,
attn_dec
=
selfattn
(
query
,
query
,
query
,
mask
=
mask
,
query_mask
=
m_mask
)
query
,
attn_dec
=
selfattn
(
query
,
query
,
query
,
mask
=
mask
,
query_mask
=
m_mask
)
query
,
attn_dot
=
attn
(
key
,
value
,
query
,
mask
=
zero_mask
,
query_mask
=
m_mask
)
query
,
attn_dot
=
attn
(
key
,
value
,
query
,
mask
=
zero_mask
,
query_mask
=
m_mask
)
query
=
ffn
(
query
)
query
=
ffn
(
query
)
selfattn_list
.
append
(
attn_dec
)
selfattn_list
.
append
(
attn_dec
)
attn_list
.
append
(
attn_dot
)
attn_list
.
append
(
attn_dot
)
# Mel linear projection
# Mel linear projection
mel_out
=
self
.
mel_linear
(
query
)
mel_out
=
self
.
mel_linear
(
query
)
# Post Mel Network
# Post Mel Network
...
@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
...
@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
# key (batch_size, seq_len, channel)
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key
,
c_mask
,
attns_enc
=
self
.
encoder
(
characters
,
pos_text
)
key
,
c_mask
,
attns_enc
=
self
.
encoder
(
characters
,
pos_text
)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
...
...
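Both Encoder and Decoder add a sinusoid table scaled by the trainable scalar alpha, created above with a constant 1.0 initializer and now without a shared 'alpha' parameter name. A numpy sketch of the x = positional * self.alpha + x step:

import numpy as np

def sinusoid_table(n_pos, d_hid):
    # standard transformer sinusoid table: sin on even channels, cos on odd
    pos = np.arange(n_pos)[:, None]
    i = np.arange(d_hid)[None, :]
    angle = pos / np.power(10000, 2 * (i // 2) / d_hid)
    return np.where(i % 2 == 0, np.sin(angle), np.cos(angle)).astype("float32")

alpha = 1.0                                 # the trainable scalar, at its init value
x = np.zeros((1, 16, 64), dtype="float32")  # prenet output (N, T, C)
positional = sinusoid_table(16, 64)[None]   # (1, T, C)
x = positional * alpha + x                  # matches the forward pass above
print(x.shape)  # (1, 16, 64)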
parakeet/models/transformerTTS/synthesis.py
@@ -2,7 +2,7 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import Model, ModelPostNet
+from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
 import paddle.fluid as fluid
@@ -28,7 +28,7 @@ def synthesis(text_input, cfg):
     writer = SummaryWriter(path)
     with dg.guard(place):
-        model = Model(cfg)
+        model = TransformerTTS(cfg)
         model_postnet = ModelPostNet(cfg)
         model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
parakeet/models/transformerTTS/train_postnet.py
@@ -89,8 +89,6 @@ def main(cfg):
         else:
             loss.backward()
             optimizer.minimize(loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
-            print("===============", model.pre_proj.conv.weight.numpy())
-            print("===============", model.pre_proj.conv.weight.gradient())
             model.clear_gradients()

         if local_rank == 0:
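The deleted lines were debug prints of a projection weight and its gradient; the surviving optimizer.minimize call still clips by global norm. A numpy sketch of what GradClipByGlobalNorm computes:

import numpy as np

def clip_by_global_norm(grads, clip_norm):
    # all gradients are rescaled together when their joint L2 norm exceeds clip_norm
    global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    scale = min(1.0, clip_norm / (global_norm + 1e-12))
    return [g * scale for g in grads]

grads = [np.ones((2, 2)), np.ones(3)]
clipped = clip_by_global_norm(grads, clip_norm=0.1)
print(np.sqrt(sum(float((g ** 2).sum()) for g in clipped)))  # ~0.1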
parakeet/models/transformerTTS/train_transformer.py
@@ -63,7 +63,7 @@ def main(cfg):
         optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step),
                                                   parameter_list=model.parameters())
-        reader = LJSpeechLoader(cfg, nranks, local_rank).reader()
+        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
         if cfg.checkpoint_path is not None:
             model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
@@ -78,26 +78,25 @@ def main(cfg):
         for epoch in range(cfg.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d' % epoch)
                 character, mel, mel_input, pos_text, pos_mel, text_length = data
                 global_step += 1
                 mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
                 label = np.zeros(stop_preds.shape).astype(np.float32)
                 text_length = text_length.numpy()
                 for i in range(label.shape[0]):
                     label[i][text_length[i] - 1] = 1
                 mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
                 post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                 stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
                 loss = mel_loss + post_mel_loss + stop_loss
                 if local_rank == 0:
                     writer.add_scalars('training_loss', {
                         'mel_loss': mel_loss.numpy(),
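The NoamDecay arguments are chosen so the schedule peaks exactly at cfg.lr: the first argument plays the role of d_model, and 1 / (warm_up_step * lr**2) makes d_model**-0.5 * warm_up_step**-0.5 equal lr. A pure-Python check:

def noam_lr(step, warm_up_step, lr):
    # the classic Noam schedule with d_model chosen as in the call above
    d_model = 1 / (warm_up_step * lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warm_up_step ** -1.5)

print(round(noam_lr(4000, warm_up_step=4000, lr=0.001), 6))  # 0.001, the peak at step == warm_up_step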
parakeet/modules/layers.py
@@ -5,6 +5,25 @@ import paddle
 from paddle import fluid
 import paddle.fluid.dygraph as dg

+class Linear(dg.Layer):
+    def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
+        super(Linear, self).__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.dtype = dtype
+        self.weight = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())
+        self.bias = is_bias
+
+        if is_bias is not False:
+            k = math.sqrt(1 / in_features)
+            self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))
+
+        self.linear = dg.Linear(in_features, out_features, param_attr=self.weight, bias_attr=self.bias,)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x

 class Conv(dg.Layer):
     def __init__(self, in_channels, out_channels, filter_size=1,
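This new wrapper is the core of the commit: every dg.Linear in the model code is swapped for it, so weights get Xavier initialization and biases get a U(-sqrt(1/in_features), sqrt(1/in_features)) initialization. Reading the commit title, the point is to match the initialization defaults of the reference implementation's linear layers, though the diff itself does not say so. A hedged usage sketch under paddlepaddle 1.x dygraph:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from parakeet.modules.layers import Linear

with dg.guard(fluid.CPUPlace()):
    proj = Linear(256, 80)                      # bias drawn from U(-1/16, 1/16)
    attn_key = Linear(256, 256, is_bias=False)  # bias disabled, as in MultiheadAttention
    x = dg.to_variable(np.zeros((4, 256), dtype="float32"))
    print(proj(x).shape, attn_key(x).shape)     # [4, 80] [4, 256]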
parakeet/modules/multihead_attention.py
@@ -2,6 +2,7 @@ import math
 import numpy as np
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class ScaledDotProductAttention(dg.Layer):
     def __init__(self, d_key):
@@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer):
             attention = attention * mask
             mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
             attention = attention + mask

         attention = layers.softmax(attention)
         attention = layers.dropout(attention, dropout)

         # Mask query to ignore padding
         if query_mask is not None:
             attention = attention * query_mask
@@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer):
         self.d_q = d_q
         self.dropout = dropout

-        self.key = dg.Linear(num_hidden, num_head * d_k)
-        self.value = dg.Linear(num_hidden, num_head * d_k)
-        self.query = dg.Linear(num_hidden, num_head * d_q)
+        self.key = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.value = Linear(num_hidden, num_head * d_k, is_bias=False)
+        self.query = Linear(num_hidden, num_head * d_q, is_bias=False)

         self.scal_attn = ScaledDotProductAttention(d_k)
-        self.fc = dg.Linear(num_head * d_q, num_hidden)
+        self.fc = Linear(num_head * d_q * 2, num_hidden)
         self.layer_norm = dg.LayerNorm(num_hidden)
@@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer):
         result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
         result = layers.reshape(layers.transpose(result, [1, 2, 0, 3]), [batch_size, seq_len_query, -1])
+        result = layers.concat([query_input, result], axis=-1)

         result = layers.dropout(self.fc(result), self.dropout)
         result = result + query_input
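Two coupled changes here: the q/k/v projections lose their biases (is_bias=False), and the merged attention output is concatenated with the original query before the output projection, which is why self.fc now takes num_head * d_q * 2 inputs. A numpy shape check:

import numpy as np

batch, time, num_hidden = 2, 10, 256
num_head, d_q = 4, 64

heads = np.zeros((num_head, batch, time, d_q), dtype="float32")
merged = heads.transpose(1, 2, 0, 3).reshape(batch, time, num_head * d_q)
query_input = np.zeros((batch, time, num_hidden), dtype="float32")
fc_in = np.concatenate([query_input, merged], axis=-1)  # the new layers.concat step
print(fc_in.shape)  # (2, 10, 512) == num_head * d_q * 2 channels into fc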
parakeet/modules/post_convnet.py
@@ -16,6 +16,7 @@ class PostConvNet(dg.Layer):
         super(PostConvNet, self).__init__()

         self.dropout = dropout
+        self.num_conv = num_conv
         self.conv_list = []
         self.conv_list.append(Conv(in_channels=n_mels * outputs_per_step,
                                    out_channels=num_hidden,
@@ -43,17 +44,9 @@ class PostConvNet(dg.Layer):
             self.add_sublayer("conv_list_{}".format(i), layer)

         self.batch_norm_list = [dg.BatchNorm(num_hidden,
-                                             param_attr=fluid.ParamAttr(name='weight'),
-                                             bias_attr=fluid.ParamAttr(name='bias'),
-                                             moving_mean_name='moving_mean',
-                                             moving_variance_name='moving_var',
                                              data_layout='NCHW') for _ in range(num_conv - 1)]
-        self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
-                                                 param_attr=fluid.ParamAttr(name='weight'),
-                                                 bias_attr=fluid.ParamAttr(name='bias'),
-                                                 moving_mean_name='moving_mean',
-                                                 moving_variance_name='moving_var',
-                                                 data_layout='NCHW'))
+        #self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
+        #                                         data_layout='NCHW'))
         for i, layer in enumerate(self.batch_norm_list):
             self.add_sublayer("batch_norm_list_{}".format(i), layer)
@@ -67,9 +60,15 @@ class PostConvNet(dg.Layer):
         Returns:
             output (Variable), Shape(B, T, C), the result after postconvnet.
         """
         input = layers.transpose(input, [0, 2, 1])
         len = input.shape[-1]
-        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
-            input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
+        for i in range(self.num_conv - 1):
+            batch_norm = self.batch_norm_list[i]
+            conv = self.conv_list[i]
+            input = layers.dropout(layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
+        conv = self.conv_list[self.num_conv - 1]
+        input = conv(input)[:, :, :len]
         output = layers.transpose(input, [0, 2, 1])
         return output
\ No newline at end of file
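The rewritten loop makes the last convolution purely linear: only the first num_conv - 1 convs get batch norm, tanh, and dropout, and the commented-out BatchNorm over n_mels * outputs_per_step channels is what previously normalized that final conv. A minimal sketch of the new control flow, with scalar stub layers standing in for the real ones:

import math

def postnet_forward(x, convs, batch_norms, dropout=lambda v: v):
    # batch norm + tanh + dropout on all convs except the last, which stays linear
    num_conv = len(convs)
    for i in range(num_conv - 1):
        x = dropout(math.tanh(batch_norms[i](convs[i](x))))
    return convs[num_conv - 1](x)

identity = lambda v: v
print(postnet_forward(0.5, [lambda v: v + 1] * 3, [identity] * 2))  # tanh applied twice, last conv linear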
parakeet/modules/prenet.py
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
+from parakeet.modules.layers import Linear

 class PreNet(dg.Layer):
     def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
@@ -14,8 +15,8 @@ class PreNet(dg.Layer):
         self.output_size = output_size
         self.dropout_rate = dropout_rate

-        self.linear1 = dg.Linear(input_size, hidden_size)
-        self.linear2 = dg.Linear(hidden_size, output_size)
+        self.linear1 = Linear(input_size, hidden_size)
+        self.linear2 = Linear(hidden_size, output_size)

     def forward(self, x):
         """
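PreNet's two layers now also use the custom Linear. A numpy sketch of the forward path (the relu + always-on dropout placement follows the standard Tacotron-style prenet; this hunk only shows construction):

import numpy as np

def prenet_forward(x, w1, w2, dropout_rate=0.2, rng=np.random.default_rng(0)):
    # linear -> relu -> dropout, twice; inverted dropout keeps the expectation unchanged
    h = np.maximum(0.0, x @ w1)
    h = h * (rng.random(h.shape) >= dropout_rate) / (1 - dropout_rate)
    h = np.maximum(0.0, h @ w2)
    return h * (rng.random(h.shape) >= dropout_rate) / (1 - dropout_rate)

x = np.ones((2, 80), dtype="float32")  # e.g. a mel frame (num_mels=80)
out = prenet_forward(x, np.ones((80, 512)), np.ones((512, 256)))
print(out.shape)  # (2, 256)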