Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
e0aef2e0
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
14
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e0aef2e0
编写于
1月 08, 2020
作者:
L
lifuchen
提交者:
chenfeiyu
1月 08, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rebuild code and TransformerTTS is right. FastSpeech will later.
上级
5b632d18
变更
21
隐藏空白更改
内联
并排
Showing
21 changed file
with
297 addition
and
250 deletion
+297
-250
parakeet/audio/audio.py
parakeet/audio/audio.py
+2
-2
parakeet/models/dataloader/jlspeech.py
parakeet/models/dataloader/jlspeech.py
+31
-31
parakeet/models/fastspeech/dataset.py
parakeet/models/fastspeech/dataset.py
+0
-124
parakeet/models/fastspeech/modules.py
parakeet/models/fastspeech/modules.py
+40
-7
parakeet/models/fastspeech/network.py
parakeet/models/fastspeech/network.py
+51
-0
parakeet/models/fastspeech/parse.py
parakeet/models/fastspeech/parse.py
+2
-2
parakeet/models/fastspeech/train.py
parakeet/models/fastspeech/train.py
+2
-2
parakeet/models/transformerTTS/config/train_postnet.yaml
parakeet/models/transformerTTS/config/train_postnet.yaml
+2
-1
parakeet/models/transformerTTS/config/train_transformer.yaml
parakeet/models/transformerTTS/config/train_transformer.yaml
+7
-6
parakeet/models/transformerTTS/data.py
parakeet/models/transformerTTS/data.py
+0
-29
parakeet/models/transformerTTS/module.py
parakeet/models/transformerTTS/module.py
+9
-8
parakeet/models/transformerTTS/network.py
parakeet/models/transformerTTS/network.py
+9
-8
parakeet/models/transformerTTS/parse.py
parakeet/models/transformerTTS/parse.py
+6
-2
parakeet/models/transformerTTS/train_postnet.py
parakeet/models/transformerTTS/train_postnet.py
+8
-3
parakeet/models/transformerTTS/train_transformer.py
parakeet/models/transformerTTS/train_transformer.py
+11
-5
parakeet/modules/dynamicGRU.py
parakeet/modules/dynamicGRU.py
+8
-0
parakeet/modules/feed_forward.py
parakeet/modules/feed_forward.py
+18
-6
parakeet/modules/layers.py
parakeet/modules/layers.py
+36
-0
parakeet/modules/multihead_attention.py
parakeet/modules/multihead_attention.py
+33
-5
parakeet/modules/post_convnet.py
parakeet/modules/post_convnet.py
+14
-6
parakeet/modules/prenet.py
parakeet/modules/prenet.py
+8
-3
未找到文件。
parakeet/audio/audio.py
浏览文件 @
e0aef2e0
...
@@ -9,7 +9,7 @@ class AudioProcessor(object):
...
@@ -9,7 +9,7 @@ class AudioProcessor(object):
sample_rate
=
None
,
# int, sampling rate
sample_rate
=
None
,
# int, sampling rate
num_mels
=
None
,
# int, bands of mel spectrogram
num_mels
=
None
,
# int, bands of mel spectrogram
min_level_db
=
None
,
# float, minimum level db
min_level_db
=
None
,
# float, minimum level db
ref_level_db
=
None
,
# float, reference level db
n
ref_level_db
=
None
,
# float, reference level db
n_fft
=
None
,
# int: number of samples in a frame for stft
n_fft
=
None
,
# int: number of samples in a frame for stft
win_length
=
None
,
# int: the same meaning with n_fft
win_length
=
None
,
# int: the same meaning with n_fft
hop_length
=
None
,
# int: number of samples between neighboring frame
hop_length
=
None
,
# int: number of samples between neighboring frame
...
@@ -22,7 +22,7 @@ class AudioProcessor(object):
...
@@ -22,7 +22,7 @@ class AudioProcessor(object):
mel_fmax
=
None
,
# int: mel spectrogram's maximum frequency
mel_fmax
=
None
,
# int: mel spectrogram's maximum frequency
clip_norm
=
True
,
# bool: clip spectrogram's norm
clip_norm
=
True
,
# bool: clip spectrogram's norm
griffin_lim_iters
=
None
,
# int:
griffin_lim_iters
=
None
,
# int:
do_trim_silence
=
False
,
# bool: trim sil
i
ence
do_trim_silence
=
False
,
# bool: trim silence
sound_norm
=
False
,
sound_norm
=
False
,
**
kwargs
):
**
kwargs
):
self
.
sample_rate
=
sample_rate
self
.
sample_rate
=
sample_rate
...
...
parakeet/models/dataloader/jlspeech.py
浏览文件 @
e0aef2e0
...
@@ -12,19 +12,19 @@ from parakeet.data.dataset import Dataset
...
@@ -12,19 +12,19 @@ from parakeet.data.dataset import Dataset
from
parakeet.data.batch
import
TextIDBatcher
,
SpecBatcher
from
parakeet.data.batch
import
TextIDBatcher
,
SpecBatcher
class
LJSpeechLoader
:
class
LJSpeechLoader
:
def
__init__
(
self
,
config
,
nranks
,
rank
,
is_vocoder
=
False
):
def
__init__
(
self
,
config
,
nranks
,
rank
,
is_vocoder
=
False
,
shuffle
=
True
):
place
=
fluid
.
CUDAPlace
(
rank
)
if
config
.
use_gpu
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
rank
)
if
config
.
use_gpu
else
fluid
.
CPUPlace
()
LJSPEECH_ROOT
=
Path
(
config
.
data_path
)
LJSPEECH_ROOT
=
Path
(
config
.
data_path
)
dataset
=
LJSpeech
(
LJSPEECH_ROOT
,
config
)
dataset
=
LJSpeech
(
LJSPEECH_ROOT
,
config
)
sampler
=
DistributedSampler
(
len
(
dataset
),
nranks
,
rank
)
sampler
=
DistributedSampler
(
len
(
dataset
),
nranks
,
rank
,
shuffle
=
shuffle
)
assert
config
.
batch_size
%
nranks
==
0
assert
config
.
batch_size
%
nranks
==
0
each_bs
=
config
.
batch_size
//
nranks
each_bs
=
config
.
batch_size
//
nranks
if
is_vocoder
:
if
is_vocoder
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
Tru
e
,
collate_fn
=
batch_examples_vocoder
,
drop_last
=
True
)
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
shuffl
e
,
collate_fn
=
batch_examples_vocoder
,
drop_last
=
True
)
else
:
else
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
Tru
e
,
collate_fn
=
batch_examples
,
drop_last
=
True
)
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
shuffl
e
,
collate_fn
=
batch_examples
,
drop_last
=
True
)
self
.
reader
=
fluid
.
io
.
DataLoader
.
from_generator
(
self
.
reader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
capacity
=
32
,
...
@@ -41,6 +41,25 @@ class LJSpeech(Dataset):
...
@@ -41,6 +41,25 @@ class LJSpeech(Dataset):
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
metadata
=
self
.
_prepare_metadata
()
self
.
metadata
=
self
.
_prepare_metadata
()
self
.
config
=
config
self
.
config
=
config
self
.
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
config
.
audio
.
sr
,
num_mels
=
config
.
audio
.
num_mels
,
min_level_db
=
config
.
audio
.
min_level_db
,
ref_level_db
=
config
.
audio
.
ref_level_db
,
n_fft
=
config
.
audio
.
n_fft
,
win_length
=
config
.
audio
.
win_length
,
hop_length
=
config
.
audio
.
hop_length
,
power
=
config
.
audio
.
power
,
preemphasis
=
config
.
audio
.
preemphasis
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
def
_prepare_metadata
(
self
):
def
_prepare_metadata
(
self
):
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
...
@@ -59,29 +78,10 @@ class LJSpeech(Dataset):
...
@@ -59,29 +78,10 @@ class LJSpeech(Dataset):
fname
,
raw_text
,
normalized_text
=
metadatum
fname
,
raw_text
,
normalized_text
=
metadatum
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
22050
,
num_mels
=
80
,
min_level_db
=-
100
,
ref_level_db
=
20
,
n_fft
=
2048
,
win_length
=
int
(
22050
*
0.05
),
hop_length
=
int
(
22050
*
0.0125
),
power
=
1.2
,
preemphasis
=
0.97
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav
=
_ljspeech_processor
.
load_wav
(
str
(
wav_path
))
wav
=
self
.
_ljspeech_processor
.
load_wav
(
str
(
wav_path
))
mag
=
_ljspeech_processor
.
spectrogram
(
wav
).
astype
(
np
.
float32
)
mag
=
self
.
_ljspeech_processor
.
spectrogram
(
wav
).
astype
(
np
.
float32
)
mel
=
_ljspeech_processor
.
melspectrogram
(
wav
).
astype
(
np
.
float32
)
mel
=
self
.
_ljspeech_processor
.
melspectrogram
(
wav
).
astype
(
np
.
float32
)
phonemes
=
np
.
array
(
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
phonemes
=
np
.
array
(
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
...
@@ -123,11 +123,11 @@ def batch_examples(batch):
...
@@ -123,11 +123,11 @@ def batch_examples(batch):
text_lens
=
sorted
(
text_lens
,
reverse
=
True
)
text_lens
=
sorted
(
text_lens
,
reverse
=
True
)
# Pad sequence with largest len of the batch
# Pad sequence with largest len of the batch
texts
=
TextIDBatcher
(
pad_id
=
0
)(
texts
)
texts
=
TextIDBatcher
(
pad_id
=
0
)(
texts
)
#(B, T)
pos_texts
=
TextIDBatcher
(
pad_id
=
0
)(
pos_texts
)
pos_texts
=
TextIDBatcher
(
pad_id
=
0
)(
pos_texts
)
#(B,T)
pos_mels
=
TextIDBatcher
(
pad_id
=
0
)(
pos_mels
)
pos_mels
=
TextIDBatcher
(
pad_id
=
0
)(
pos_mels
)
#(B,T)
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
#(B,T,num_mels)
mel_inputs
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mel_inputs
),
axes
=
(
0
,
2
,
1
))
mel_inputs
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mel_inputs
),
axes
=
(
0
,
2
,
1
))
#(B,T,num_mels)
return
(
texts
,
mels
,
mel_inputs
,
pos_texts
,
pos_mels
,
np
.
array
(
text_lens
))
return
(
texts
,
mels
,
mel_inputs
,
pos_texts
,
pos_mels
,
np
.
array
(
text_lens
))
def
batch_examples_vocoder
(
batch
):
def
batch_examples_vocoder
(
batch
):
...
...
parakeet/models/fastspeech/dataset.py
已删除
100644 → 0
浏览文件 @
5b632d18
import
torch
from
torch.nn
import
functional
as
F
from
torch.utils.data
import
Dataset
,
DataLoader
import
numpy
as
np
import
math
import
os
import
hparams
import
Audio
from
text
import
text_to_sequence
from
utils
import
process_text
,
pad_1D
,
pad_2D
device
=
torch
.
device
(
'cuda'
if
torch
.
cuda
.
is_available
()
else
'cpu'
)
class
FastSpeechDataset
(
Dataset
):
""" LJSpeech """
def
__init__
(
self
):
self
.
text
=
process_text
(
os
.
path
.
join
(
"data"
,
"train.txt"
))
def
__len__
(
self
):
return
len
(
self
.
text
)
def
__getitem__
(
self
,
idx
):
mel_gt_name
=
os
.
path
.
join
(
hparams
.
mel_ground_truth
,
"ljspeech-mel-%05d.npy"
%
(
idx
+
1
))
mel_gt_target
=
np
.
load
(
mel_gt_name
)
D
=
np
.
load
(
os
.
path
.
join
(
hparams
.
alignment_path
,
str
(
idx
)
+
".npy"
))
character
=
self
.
text
[
idx
][
0
:
len
(
self
.
text
[
idx
])
-
1
]
character
=
np
.
array
(
text_to_sequence
(
character
,
hparams
.
text_cleaners
))
sample
=
{
"text"
:
character
,
"mel_target"
:
mel_gt_target
,
"D"
:
D
}
return
sample
def
reprocess
(
batch
,
cut_list
):
texts
=
[
batch
[
ind
][
"text"
]
for
ind
in
cut_list
]
mel_targets
=
[
batch
[
ind
][
"mel_target"
]
for
ind
in
cut_list
]
Ds
=
[
batch
[
ind
][
"D"
]
for
ind
in
cut_list
]
length_text
=
np
.
array
([])
for
text
in
texts
:
length_text
=
np
.
append
(
length_text
,
text
.
shape
[
0
])
src_pos
=
list
()
max_len
=
int
(
max
(
length_text
))
for
length_src_row
in
length_text
:
src_pos
.
append
(
np
.
pad
([
i
+
1
for
i
in
range
(
int
(
length_src_row
))],
(
0
,
max_len
-
int
(
length_src_row
)),
'constant'
))
src_pos
=
np
.
array
(
src_pos
)
length_mel
=
np
.
array
(
list
())
for
mel
in
mel_targets
:
length_mel
=
np
.
append
(
length_mel
,
mel
.
shape
[
0
])
mel_pos
=
list
()
max_mel_len
=
int
(
max
(
length_mel
))
for
length_mel_row
in
length_mel
:
mel_pos
.
append
(
np
.
pad
([
i
+
1
for
i
in
range
(
int
(
length_mel_row
))],
(
0
,
max_mel_len
-
int
(
length_mel_row
)),
'constant'
))
mel_pos
=
np
.
array
(
mel_pos
)
texts
=
pad_1D
(
texts
)
Ds
=
pad_1D
(
Ds
)
mel_targets
=
pad_2D
(
mel_targets
)
out
=
{
"text"
:
texts
,
"mel_target"
:
mel_targets
,
"D"
:
Ds
,
"mel_pos"
:
mel_pos
,
"src_pos"
:
src_pos
,
"mel_max_len"
:
max_mel_len
}
return
out
def
collate_fn
(
batch
):
len_arr
=
np
.
array
([
d
[
"text"
].
shape
[
0
]
for
d
in
batch
])
index_arr
=
np
.
argsort
(
-
len_arr
)
batchsize
=
len
(
batch
)
real_batchsize
=
int
(
math
.
sqrt
(
batchsize
))
cut_list
=
list
()
for
i
in
range
(
real_batchsize
):
cut_list
.
append
(
index_arr
[
i
*
real_batchsize
:(
i
+
1
)
*
real_batchsize
])
output
=
list
()
for
i
in
range
(
real_batchsize
):
output
.
append
(
reprocess
(
batch
,
cut_list
[
i
]))
return
output
if
__name__
==
"__main__"
:
# Test
dataset
=
FastSpeechDataset
()
training_loader
=
DataLoader
(
dataset
,
batch_size
=
1
,
shuffle
=
False
,
collate_fn
=
collate_fn
,
drop_last
=
True
,
num_workers
=
0
)
total_step
=
hparams
.
epochs
*
len
(
training_loader
)
*
hparams
.
batch_size
cnt
=
0
for
i
,
batchs
in
enumerate
(
training_loader
):
for
j
,
data_of_batch
in
enumerate
(
batchs
):
mel_target
=
torch
.
from_numpy
(
data_of_batch
[
"mel_target"
]).
float
().
to
(
device
)
D
=
torch
.
from_numpy
(
data_of_batch
[
"D"
]).
int
().
to
(
device
)
# print(mel_target.size())
# print(D.sum())
print
(
cnt
)
if
mel_target
.
size
(
1
)
==
D
.
sum
().
item
():
cnt
+=
1
print
(
cnt
)
parakeet/models/fastspeech/modules.py
浏览文件 @
e0aef2e0
...
@@ -11,20 +11,33 @@ from parakeet.modules.feed_forward import PositionwiseFeedForward
...
@@ -11,20 +11,33 @@ from parakeet.modules.feed_forward import PositionwiseFeedForward
class
FFTBlock
(
dg
.
Layer
):
class
FFTBlock
(
dg
.
Layer
):
"""FFT Block"""
def
__init__
(
self
,
d_model
,
d_inner
,
n_head
,
d_k
,
d_v
,
filter_size
,
padding
,
dropout
=
0.2
):
def
__init__
(
self
,
d_model
,
d_inner
,
n_head
,
d_k
,
d_v
,
filter_size
,
padding
,
dropout
=
0.2
):
super
(
FFTBlock
,
self
).
__init__
()
super
(
FFTBlock
,
self
).
__init__
()
self
.
slf_attn
=
MultiheadAttention
(
d_model
,
d_k
,
d_v
,
num_head
=
n_head
,
dropout
=
dropout
)
self
.
slf_attn
=
MultiheadAttention
(
d_model
,
d_k
,
d_v
,
num_head
=
n_head
,
dropout
=
dropout
)
self
.
pos_ffn
=
PositionwiseFeedForward
(
d_model
,
d_inner
,
filter_size
=
filter_size
,
padding
=
padding
,
dropout
=
dropout
)
self
.
pos_ffn
=
PositionwiseFeedForward
(
d_model
,
d_inner
,
filter_size
=
filter_size
,
padding
=
padding
,
dropout
=
dropout
)
def
forward
(
self
,
enc_input
,
non_pad_mask
=
None
,
slf_attn_mask
=
None
):
def
forward
(
self
,
enc_input
,
non_pad_mask
=
None
,
slf_attn_mask
=
None
):
enc_output
,
enc_slf_attn
=
self
.
slf_attn
(
enc_input
,
enc_input
,
enc_input
,
mask
=
slf_attn_mask
)
"""
enc_output
*=
non_pad_mask
Feed Forward Transformer block in FastSpeech.
Args:
enc_input (Variable): Shape(B, T, C), dtype: float32. The embedding characters input.
T means the timesteps of input.
non_pad_mask (Variable): Shape(B, T, 1), dtype: int64. The mask of sequence.
slf_attn_mask (Variable): Shape(B, len_q, len_k), dtype: int64. The mask of self attention.
len_q means the sequence length of query, len_k means the sequence length of key.
Returns:
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output
,
slf_attn
=
self
.
slf_attn
(
enc_input
,
enc_input
,
enc_input
,
mask
=
slf_attn_mask
)
output
*=
non_pad_mask
enc_output
=
self
.
pos_ffn
(
enc_
output
)
output
=
self
.
pos_ffn
(
output
)
enc_
output
*=
non_pad_mask
output
*=
non_pad_mask
return
enc_output
,
enc_
slf_attn
return
output
,
slf_attn
class
LengthRegulator
(
dg
.
Layer
):
class
LengthRegulator
(
dg
.
Layer
):
...
@@ -70,6 +83,20 @@ class LengthRegulator(dg.Layer):
...
@@ -70,6 +83,20 @@ class LengthRegulator(dg.Layer):
def
forward
(
self
,
x
,
alpha
=
1.0
,
target
=
None
):
def
forward
(
self
,
x
,
alpha
=
1.0
,
target
=
None
):
"""
Length Regulator block in FastSpeech.
Args:
x (Variable): Shape(B, T, C), dtype: float32. The encoder output.
alpha (Constant): dtype: float32. The hyperparameter to determine the length of
the expanded sequence mel, thereby controlling the voice speed.
target (Variable): (Variable, optional): Shape(B, T_text),
dtype: int64. The duration of phoneme compute from pretrained transformerTTS.
Returns:
output (Variable), Shape(B, T, C), the output after exppand.
duration_predictor_output (Variable), Shape(B, T, C), the output of duration predictor.
"""
duration_predictor_output
=
self
.
duration_predictor
(
x
)
duration_predictor_output
=
self
.
duration_predictor
(
x
)
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
output
=
self
.
LR
(
x
,
target
)
output
=
self
.
LR
(
x
,
target
)
...
@@ -81,7 +108,6 @@ class LengthRegulator(dg.Layer):
...
@@ -81,7 +108,6 @@ class LengthRegulator(dg.Layer):
return
output
,
mel_pos
return
output
,
mel_pos
class
DurationPredictor
(
dg
.
Layer
):
class
DurationPredictor
(
dg
.
Layer
):
""" Duration Predictor """
def
__init__
(
self
,
input_size
,
out_channels
,
filter_size
,
dropout
=
0.1
):
def
__init__
(
self
,
input_size
,
out_channels
,
filter_size
,
dropout
=
0.1
):
super
(
DurationPredictor
,
self
).
__init__
()
super
(
DurationPredictor
,
self
).
__init__
()
self
.
input_size
=
input_size
self
.
input_size
=
input_size
...
@@ -105,7 +131,14 @@ class DurationPredictor(dg.Layer):
...
@@ -105,7 +131,14 @@ class DurationPredictor(dg.Layer):
self
.
linear
=
dg
.
Linear
(
self
.
out_channels
,
1
)
self
.
linear
=
dg
.
Linear
(
self
.
out_channels
,
1
)
def
forward
(
self
,
encoder_output
):
def
forward
(
self
,
encoder_output
):
"""
Duration Predictor block in FastSpeech.
Args:
encoder_output (Variable): Shape(B, T, C), dtype: float32. The encoder output.
Returns:
out (Variable), Shape(B, T, C), the output of duration predictor.
"""
# encoder_output.shape(N, T, C)
# encoder_output.shape(N, T, C)
out
=
layers
.
dropout
(
layers
.
relu
(
self
.
layer_norm1
(
self
.
conv1
(
encoder_output
))),
self
.
dropout
)
out
=
layers
.
dropout
(
layers
.
relu
(
self
.
layer_norm1
(
self
.
conv1
(
encoder_output
))),
self
.
dropout
)
out
=
layers
.
dropout
(
layers
.
relu
(
self
.
layer_norm2
(
self
.
conv2
(
out
))),
self
.
dropout
)
out
=
layers
.
dropout
(
layers
.
relu
(
self
.
layer_norm2
(
self
.
conv2
(
out
))),
self
.
dropout
)
...
...
parakeet/models/fastspeech/network.py
浏览文件 @
e0aef2e0
...
@@ -35,6 +35,20 @@ class Encoder(dg.Layer):
...
@@ -35,6 +35,20 @@ class Encoder(dg.Layer):
self
.
add_sublayer
(
'fft_{}'
.
format
(
i
),
layer
)
self
.
add_sublayer
(
'fft_{}'
.
format
(
i
),
layer
)
def
forward
(
self
,
character
,
text_pos
):
def
forward
(
self
,
character
,
text_pos
):
"""
Encoder layer of FastSpeech.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
Returns:
enc_output (Variable), Shape(B, text_T, C), the encoder output.
non_pad_mask (Variable), Shape(B, T_text, 1), the mask with non pad.
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
"""
enc_slf_attn_list
=
[]
enc_slf_attn_list
=
[]
# -- prepare masks
# -- prepare masks
# shape character (N, T)
# shape character (N, T)
...
@@ -80,6 +94,18 @@ class Decoder(dg.Layer):
...
@@ -80,6 +94,18 @@ class Decoder(dg.Layer):
self
.
add_sublayer
(
'fft_{}'
.
format
(
i
),
layer
)
self
.
add_sublayer
(
'fft_{}'
.
format
(
i
),
layer
)
def
forward
(
self
,
enc_seq
,
enc_pos
):
def
forward
(
self
,
enc_seq
,
enc_pos
):
"""
Decoder layer of FastSpeech.
Args:
enc_seq (Variable), Shape(B, text_T, C), dtype: float32.
The output of length regulator.
enc_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
Returns:
dec_output (Variable), Shape(B, mel_T, C), the decoder output.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
dec_slf_attn_list
=
[]
dec_slf_attn_list
=
[]
# -- Prepare masks
# -- Prepare masks
...
@@ -141,6 +167,31 @@ class FastSpeech(dg.Layer):
...
@@ -141,6 +167,31 @@ class FastSpeech(dg.Layer):
dropout
=
0.1
)
dropout
=
0.1
)
def
forward
(
self
,
character
,
text_pos
,
mel_pos
=
None
,
length_target
=
None
,
alpha
=
1.0
):
def
forward
(
self
,
character
,
text_pos
,
mel_pos
=
None
,
length_target
=
None
,
alpha
=
1.0
):
"""
FastSpeech model.
Args:
character (Variable): Shape(B, T_text), dtype: float32. The input text
characters. T_text means the timesteps of input characters.
text_pos (Variable): Shape(B, T_text), dtype: int64. The input text
position. T_text means the timesteps of input characters.
mel_pos (Variable, optional): Shape(B, T_mel),
dtype: int64. The spectrum position. T_mel means the timesteps of input spectrum.
length_target (Variable, optional): Shape(B, T_text),
dtype: int64. The duration of phoneme compute from pretrained transformerTTS.
alpha (Constant):
dtype: float32. The hyperparameter to determine the length of the expanded sequence
mel, thereby controlling the voice speed.
Returns:
mel_output (Variable), Shape(B, mel_T, C), the mel output before postnet.
mel_output_postnet (Variable), Shape(B, mel_T, C), the mel output after postnet.
duration_predictor_output (Variable), Shape(B, text_T), the duration of phoneme compute
with duration predictor.
enc_slf_attn_list (Variable), Shape(B, text_T, text_T), the encoder self attention list.
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
encoder_output
,
non_pad_mask
,
enc_slf_attn_list
=
self
.
encoder
(
character
,
text_pos
)
encoder_output
,
non_pad_mask
,
enc_slf_attn_list
=
self
.
encoder
(
character
,
text_pos
)
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
...
...
parakeet/models/fastspeech/parse.py
浏览文件 @
e0aef2e0
...
@@ -9,9 +9,9 @@ def add_config_options_to_parser(parser):
...
@@ -9,9 +9,9 @@ def add_config_options_to_parser(parser):
help
=
"the sampling rate of audio data file."
)
help
=
"the sampling rate of audio data file."
)
parser
.
add_argument
(
'--audio.preemphasis'
,
type
=
float
,
default
=
0.97
,
parser
.
add_argument
(
'--audio.preemphasis'
,
type
=
float
,
default
=
0.97
,
help
=
"the preemphasis coefficient."
)
help
=
"the preemphasis coefficient."
)
parser
.
add_argument
(
'--audio.hop_length'
,
type
=
floa
t
,
default
=
128
,
parser
.
add_argument
(
'--audio.hop_length'
,
type
=
in
t
,
default
=
128
,
help
=
"the number of samples to advance between frames."
)
help
=
"the number of samples to advance between frames."
)
parser
.
add_argument
(
'--audio.win_length'
,
type
=
floa
t
,
default
=
1024
,
parser
.
add_argument
(
'--audio.win_length'
,
type
=
in
t
,
default
=
1024
,
help
=
"the length (width) of the window function."
)
help
=
"the length (width) of the window function."
)
parser
.
add_argument
(
'--audio.power'
,
type
=
float
,
default
=
1.4
,
parser
.
add_argument
(
'--audio.power'
,
type
=
float
,
default
=
1.4
,
help
=
"the power to raise before griffin-lim."
)
help
=
"the power to raise before griffin-lim."
)
...
...
parakeet/models/fastspeech/train.py
浏览文件 @
e0aef2e0
...
@@ -66,8 +66,8 @@ def main(cfg):
...
@@ -66,8 +66,8 @@ def main(cfg):
model
=
FastSpeech
(
cfg
)
model
=
FastSpeech
(
cfg
)
model
.
train
()
model
.
train
()
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
)
)
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
)
,
parameter_list
=
model
.
parameters
())
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
).
reader
()
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
).
reader
()
if
cfg
.
checkpoint_path
is
not
None
:
if
cfg
.
checkpoint_path
is
not
None
:
...
...
parakeet/models/transformerTTS/config/train_postnet.yaml
浏览文件 @
e0aef2e0
...
@@ -13,7 +13,8 @@ audio:
...
@@ -13,7 +13,8 @@ audio:
hidden_size
:
256
hidden_size
:
256
embedding_size
:
512
embedding_size
:
512
warm_up_step
:
4000
grad_clip_thresh
:
1.0
batch_size
:
32
batch_size
:
32
epochs
:
10000
epochs
:
10000
lr
:
0.001
lr
:
0.001
...
...
parakeet/models/transformerTTS/config/train_transformer.yaml
浏览文件 @
e0aef2e0
...
@@ -11,22 +11,23 @@ audio:
...
@@ -11,22 +11,23 @@ audio:
outputs_per_step
:
1
outputs_per_step
:
1
hidden_size
:
384
#
256
hidden_size
:
256
embedding_size
:
384
#
512
embedding_size
:
512
warm_up_step
:
4000
grad_clip_thresh
:
1.0
batch_size
:
32
batch_size
:
32
epochs
:
10000
epochs
:
10000
lr
:
0.001
lr
:
0.001
save_step
:
10
save_step
:
10
00
image_step
:
2000
image_step
:
2000
use_gpu
:
True
use_gpu
:
True
use_data_parallel
:
Tru
e
use_data_parallel
:
Fals
e
data_path
:
../../../dataset/LJSpeech-1.1
data_path
:
../../../dataset/LJSpeech-1.1
save_path
:
./checkpoint
save_path
:
./checkpoint
log_dir
:
./log
log_dir
:
./log
#checkpoint_path: ./checkpoint/transformer/1
\ No newline at end of file
parakeet/models/transformerTTS/data.py
已删除
100644 → 0
浏览文件 @
5b632d18
from
pathlib
import
Path
import
numpy
as
np
from
paddle
import
fluid
from
parakeet.data.sampler
import
DistributedSampler
from
parakeet.data.datacargo
import
DataCargo
from
preprocess
import
batch_examples
,
LJSpeech
,
batch_examples_vocoder
class
LJSpeechLoader
:
def
__init__
(
self
,
config
,
nranks
,
rank
,
is_vocoder
=
False
):
place
=
fluid
.
CUDAPlace
(
rank
)
if
config
.
use_gpu
else
fluid
.
CPUPlace
()
LJSPEECH_ROOT
=
Path
(
config
.
data_path
)
dataset
=
LJSpeech
(
LJSPEECH_ROOT
)
sampler
=
DistributedSampler
(
len
(
dataset
),
nranks
,
rank
)
assert
config
.
batch_size
%
nranks
==
0
each_bs
=
config
.
batch_size
//
nranks
if
is_vocoder
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
True
,
collate_fn
=
batch_examples_vocoder
,
drop_last
=
True
)
else
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
True
,
collate_fn
=
batch_examples
,
drop_last
=
True
)
self
.
reader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
iterable
=
True
,
use_double_buffer
=
True
,
return_list
=
True
)
self
.
reader
.
set_batch_generator
(
dataloader
,
place
)
parakeet/models/transformerTTS/module.py
浏览文件 @
e0aef2e0
...
@@ -3,11 +3,12 @@ from parakeet.g2p.text.symbols import symbols
...
@@ -3,11 +3,12 @@ from parakeet.g2p.text.symbols import symbols
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
from
parakeet.modules.layers
import
Conv
1D
,
Pool1D
from
parakeet.modules.layers
import
Conv
,
Pool1D
from
parakeet.modules.dynamicGRU
import
DynamicGRU
from
parakeet.modules.dynamicGRU
import
DynamicGRU
import
numpy
as
np
import
numpy
as
np
class
EncoderPrenet
(
dg
.
Layer
):
class
EncoderPrenet
(
dg
.
Layer
):
def
__init__
(
self
,
embedding_size
,
num_hidden
,
use_cudnn
=
True
):
def
__init__
(
self
,
embedding_size
,
num_hidden
,
use_cudnn
=
True
):
super
(
EncoderPrenet
,
self
).
__init__
()
super
(
EncoderPrenet
,
self
).
__init__
()
...
@@ -18,19 +19,19 @@ class EncoderPrenet(dg.Layer):
...
@@ -18,19 +19,19 @@ class EncoderPrenet(dg.Layer):
param_attr
=
fluid
.
ParamAttr
(
name
=
'weight'
),
param_attr
=
fluid
.
ParamAttr
(
name
=
'weight'
),
padding_idx
=
None
)
padding_idx
=
None
)
self
.
conv_list
=
[]
self
.
conv_list
=
[]
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
embedding_size
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
embedding_size
,
out_channels
=
num_hidden
,
out_channels
=
num_hidden
,
filter_size
=
5
,
filter_size
=
5
,
padding
=
int
(
np
.
floor
(
5
/
2
)),
padding
=
int
(
np
.
floor
(
5
/
2
)),
use_cudnn
=
use_cudnn
,
use_cudnn
=
use_cudnn
,
data_format
=
"NCT"
))
data_format
=
"NCT"
))
for
_
in
range
(
2
):
for
_
in
range
(
2
):
self
.
conv_list
=
Conv1D
(
in_channels
=
num_hidden
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
num_hidden
,
out_channels
=
num_hidden
,
out_channels
=
num_hidden
,
filter_size
=
5
,
filter_size
=
5
,
padding
=
int
(
np
.
floor
(
5
/
2
)),
padding
=
int
(
np
.
floor
(
5
/
2
)),
use_cudnn
=
use_cudnn
,
use_cudnn
=
use_cudnn
,
data_format
=
"NCT"
)
data_format
=
"NCT"
)
)
for
i
,
layer
in
enumerate
(
self
.
conv_list
):
for
i
,
layer
in
enumerate
(
self
.
conv_list
):
self
.
add_sublayer
(
"conv_list_{}"
.
format
(
i
),
layer
)
self
.
add_sublayer
(
"conv_list_{}"
.
format
(
i
),
layer
)
...
@@ -71,13 +72,13 @@ class CBHG(dg.Layer):
...
@@ -71,13 +72,13 @@ class CBHG(dg.Layer):
self
.
hidden_size
=
hidden_size
self
.
hidden_size
=
hidden_size
self
.
projection_size
=
projection_size
self
.
projection_size
=
projection_size
self
.
conv_list
=
[]
self
.
conv_list
=
[]
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
projection_size
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
projection_size
,
out_channels
=
hidden_size
,
out_channels
=
hidden_size
,
filter_size
=
1
,
filter_size
=
1
,
padding
=
int
(
np
.
floor
(
1
/
2
)),
padding
=
int
(
np
.
floor
(
1
/
2
)),
data_format
=
"NCT"
))
data_format
=
"NCT"
))
for
i
in
range
(
2
,
K
+
1
):
for
i
in
range
(
2
,
K
+
1
):
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
hidden_size
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
hidden_size
,
out_channels
=
hidden_size
,
out_channels
=
hidden_size
,
filter_size
=
i
,
filter_size
=
i
,
padding
=
int
(
np
.
floor
(
i
/
2
)),
padding
=
int
(
np
.
floor
(
i
/
2
)),
...
@@ -100,13 +101,13 @@ class CBHG(dg.Layer):
...
@@ -100,13 +101,13 @@ class CBHG(dg.Layer):
conv_outdim
=
hidden_size
*
K
conv_outdim
=
hidden_size
*
K
self
.
conv_projection_1
=
Conv
1D
(
in_channels
=
conv_outdim
,
self
.
conv_projection_1
=
Conv
(
in_channels
=
conv_outdim
,
out_channels
=
hidden_size
,
out_channels
=
hidden_size
,
filter_size
=
3
,
filter_size
=
3
,
padding
=
int
(
np
.
floor
(
3
/
2
)),
padding
=
int
(
np
.
floor
(
3
/
2
)),
data_format
=
"NCT"
)
data_format
=
"NCT"
)
self
.
conv_projection_2
=
Conv
1D
(
in_channels
=
hidden_size
,
self
.
conv_projection_2
=
Conv
(
in_channels
=
hidden_size
,
out_channels
=
projection_size
,
out_channels
=
projection_size
,
filter_size
=
3
,
filter_size
=
3
,
padding
=
int
(
np
.
floor
(
3
/
2
)),
padding
=
int
(
np
.
floor
(
3
/
2
)),
...
...
parakeet/models/transformerTTS/network.py
浏览文件 @
e0aef2e0
...
@@ -20,13 +20,12 @@ class Encoder(dg.Layer):
...
@@ -20,13 +20,12 @@ class Encoder(dg.Layer):
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
self
.
pos_emb
=
dg
.
Embedding
(
size
=
[
1024
,
num_hidden
],
padding_idx
=
0
,
padding_idx
=
0
,
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
name
=
'weight'
,
initializer
=
fluid
.
initializer
.
NumpyArrayInitializer
(
self
.
pos_inp
),
initializer
=
fluid
.
initializer
.
NumpyArrayInitializer
(
self
.
pos_inp
),
trainable
=
False
))
trainable
=
False
))
self
.
encoder_prenet
=
EncoderPrenet
(
embedding_size
=
embedding_size
,
self
.
encoder_prenet
=
EncoderPrenet
(
embedding_size
=
embedding_size
,
num_hidden
=
num_hidden
,
num_hidden
=
num_hidden
,
use_cudnn
=
config
.
use_gpu
)
use_cudnn
=
config
.
use_gpu
)
self
.
layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
,
num_hidden
)
for
_
in
range
(
3
)]
self
.
layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
//
4
,
num_hidden
//
4
)
for
_
in
range
(
3
)]
for
i
,
layer
in
enumerate
(
self
.
layers
):
for
i
,
layer
in
enumerate
(
self
.
layers
):
self
.
add_sublayer
(
"self_attn_{}"
.
format
(
i
),
layer
)
self
.
add_sublayer
(
"self_attn_{}"
.
format
(
i
),
layer
)
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
,
use_cudnn
=
config
.
use_gpu
)
for
_
in
range
(
3
)]
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
,
use_cudnn
=
config
.
use_gpu
)
for
_
in
range
(
3
)]
...
@@ -40,6 +39,7 @@ class Encoder(dg.Layer):
...
@@ -40,6 +39,7 @@ class Encoder(dg.Layer):
else
:
else
:
query_mask
,
mask
=
None
,
None
query_mask
,
mask
=
None
,
None
# Encoder pre_network
# Encoder pre_network
x
=
self
.
encoder_prenet
(
x
)
#(N,T,C)
x
=
self
.
encoder_prenet
(
x
)
#(N,T,C)
...
@@ -81,10 +81,10 @@ class Decoder(dg.Layer):
...
@@ -81,10 +81,10 @@ class Decoder(dg.Layer):
dropout_rate
=
0.2
)
dropout_rate
=
0.2
)
self
.
linear
=
dg
.
Linear
(
num_hidden
,
num_hidden
)
self
.
linear
=
dg
.
Linear
(
num_hidden
,
num_hidden
)
self
.
selfattn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
,
num_hidden
)
for
_
in
range
(
3
)]
self
.
selfattn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
//
4
,
num_hidden
//
4
)
for
_
in
range
(
3
)]
for
i
,
layer
in
enumerate
(
self
.
selfattn_layers
):
for
i
,
layer
in
enumerate
(
self
.
selfattn_layers
):
self
.
add_sublayer
(
"self_attn_{}"
.
format
(
i
),
layer
)
self
.
add_sublayer
(
"self_attn_{}"
.
format
(
i
),
layer
)
self
.
attn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
,
num_hidden
)
for
_
in
range
(
3
)]
self
.
attn_layers
=
[
MultiheadAttention
(
num_hidden
,
num_hidden
//
4
,
num_hidden
//
4
)
for
_
in
range
(
3
)]
for
i
,
layer
in
enumerate
(
self
.
attn_layers
):
for
i
,
layer
in
enumerate
(
self
.
attn_layers
):
self
.
add_sublayer
(
"attn_{}"
.
format
(
i
),
layer
)
self
.
add_sublayer
(
"attn_{}"
.
format
(
i
),
layer
)
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
)
for
_
in
range
(
3
)]
self
.
ffns
=
[
PositionwiseFeedForward
(
num_hidden
,
num_hidden
*
4
,
filter_size
=
1
)
for
_
in
range
(
3
)]
...
@@ -104,18 +104,18 @@ class Decoder(dg.Layer):
...
@@ -104,18 +104,18 @@ class Decoder(dg.Layer):
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
if
fluid
.
framework
.
_dygraph_tracer
().
_train_mode
:
m_mask
=
get_non_pad_mask
(
positional
)
m_mask
=
get_non_pad_mask
(
positional
)
mask
=
get_attn_key_pad_mask
(
positional
,
query
)
mask
=
get_attn_key_pad_mask
(
(
positional
==
0
).
astype
(
np
.
float32
)
,
query
)
triu_tensor
=
dg
.
to_variable
(
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
())).
astype
(
np
.
float32
)
triu_tensor
=
dg
.
to_variable
(
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
())).
astype
(
np
.
float32
)
mask
=
mask
+
triu_tensor
mask
=
mask
+
triu_tensor
mask
=
fluid
.
layers
.
cast
(
mask
!
=
0
,
np
.
float32
)
mask
=
fluid
.
layers
.
cast
(
mask
=
=
0
,
np
.
float32
)
# (batch_size, decoder_len, encoder_len)
# (batch_size, decoder_len, encoder_len)
zero_mask
=
get_attn_key_pad_mask
(
layers
.
squeeze
(
c_mask
,[
-
1
]),
query
)
zero_mask
=
get_attn_key_pad_mask
(
layers
.
squeeze
(
c_mask
,[
-
1
]),
query
)
else
:
else
:
mask
=
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
()).
astype
(
np
.
float32
)
mask
=
get_triu_tensor
(
query
.
numpy
(),
query
.
numpy
()).
astype
(
np
.
float32
)
mask
=
fluid
.
layers
.
cast
(
dg
.
to_variable
(
mask
!
=
0
),
np
.
float32
)
mask
=
fluid
.
layers
.
cast
(
dg
.
to_variable
(
mask
=
=
0
),
np
.
float32
)
m_mask
,
zero_mask
=
None
,
None
m_mask
,
zero_mask
=
None
,
None
# Decoder pre-network
# Decoder pre-network
query
=
self
.
decoder_prenet
(
query
)
query
=
self
.
decoder_prenet
(
query
)
...
@@ -164,6 +164,7 @@ class TransformerTTS(dg.Layer):
...
@@ -164,6 +164,7 @@ class TransformerTTS(dg.Layer):
# key (batch_size, seq_len, channel)
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key
,
c_mask
,
attns_enc
=
self
.
encoder
(
characters
,
pos_text
)
key
,
c_mask
,
attns_enc
=
self
.
encoder
(
characters
,
pos_text
)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
...
...
parakeet/models/transformerTTS/parse.py
浏览文件 @
e0aef2e0
...
@@ -9,9 +9,9 @@ def add_config_options_to_parser(parser):
...
@@ -9,9 +9,9 @@ def add_config_options_to_parser(parser):
help
=
"the sampling rate of audio data file."
)
help
=
"the sampling rate of audio data file."
)
parser
.
add_argument
(
'--audio.preemphasis'
,
type
=
float
,
default
=
0.97
,
parser
.
add_argument
(
'--audio.preemphasis'
,
type
=
float
,
default
=
0.97
,
help
=
"the preemphasis coefficient."
)
help
=
"the preemphasis coefficient."
)
parser
.
add_argument
(
'--audio.hop_length'
,
type
=
floa
t
,
default
=
128
,
parser
.
add_argument
(
'--audio.hop_length'
,
type
=
in
t
,
default
=
128
,
help
=
"the number of samples to advance between frames."
)
help
=
"the number of samples to advance between frames."
)
parser
.
add_argument
(
'--audio.win_length'
,
type
=
floa
t
,
default
=
1024
,
parser
.
add_argument
(
'--audio.win_length'
,
type
=
in
t
,
default
=
1024
,
help
=
"the length (width) of the window function."
)
help
=
"the length (width) of the window function."
)
parser
.
add_argument
(
'--audio.power'
,
type
=
float
,
default
=
1.4
,
parser
.
add_argument
(
'--audio.power'
,
type
=
float
,
default
=
1.4
,
help
=
"the power to raise before griffin-lim."
)
help
=
"the power to raise before griffin-lim."
)
...
@@ -27,6 +27,10 @@ def add_config_options_to_parser(parser):
...
@@ -27,6 +27,10 @@ def add_config_options_to_parser(parser):
parser
.
add_argument
(
'--embedding_size'
,
type
=
int
,
default
=
512
,
parser
.
add_argument
(
'--embedding_size'
,
type
=
int
,
default
=
512
,
help
=
"the embedding vector size."
)
help
=
"the embedding vector size."
)
parser
.
add_argument
(
'--warm_up_step'
,
type
=
int
,
default
=
4000
,
help
=
"the warm up step of learning rate."
)
parser
.
add_argument
(
'--grad_clip_thresh'
,
type
=
float
,
default
=
1.0
,
help
=
"the threshold of grad clip."
)
parser
.
add_argument
(
'--batch_size'
,
type
=
int
,
default
=
32
,
parser
.
add_argument
(
'--batch_size'
,
type
=
int
,
default
=
32
,
help
=
"batch size for training."
)
help
=
"batch size for training."
)
parser
.
add_argument
(
'--epochs'
,
type
=
int
,
default
=
10000
,
parser
.
add_argument
(
'--epochs'
,
type
=
int
,
default
=
10000
,
...
...
parakeet/models/transformerTTS/train_postnet.py
浏览文件 @
e0aef2e0
...
@@ -6,7 +6,7 @@ from pathlib import Path
...
@@ -6,7 +6,7 @@ from pathlib import Path
import
jsonargparse
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
pprint
import
pprint
from
data
import
LJSpeechLoader
from
parakeet.models.dataloader.jlspeech
import
LJSpeechLoader
class
MyDataParallel
(
dg
.
parallel
.
DataParallel
):
class
MyDataParallel
(
dg
.
parallel
.
DataParallel
):
"""
"""
...
@@ -50,7 +50,9 @@ def main(cfg):
...
@@ -50,7 +50,9 @@ def main(cfg):
model
=
ModelPostNet
(
cfg
)
model
=
ModelPostNet
(
cfg
)
model
.
train
()
model
.
train
()
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
4000
*
(
cfg
.
lr
**
2
)),
4000
))
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
),
parameter_list
=
model
.
parameters
())
if
cfg
.
checkpoint_path
is
not
None
:
if
cfg
.
checkpoint_path
is
not
None
:
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
cfg
.
checkpoint_path
)
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
cfg
.
checkpoint_path
)
...
@@ -75,13 +77,16 @@ def main(cfg):
...
@@ -75,13 +77,16 @@ def main(cfg):
mag_pred
=
model
(
mel
)
mag_pred
=
model
(
mel
)
loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
mag_pred
,
mag
)))
loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
mag_pred
,
mag
)))
if
cfg
.
use_data_parallel
:
if
cfg
.
use_data_parallel
:
loss
=
model
.
scale_loss
(
loss
)
loss
=
model
.
scale_loss
(
loss
)
loss
.
backward
()
loss
.
backward
()
model
.
apply_collective_grads
()
model
.
apply_collective_grads
()
else
:
else
:
loss
.
backward
()
loss
.
backward
()
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
1
))
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
cfg
.
grad_clip_thresh
))
print
(
"==============="
,
model
.
pre_proj
.
conv
.
weight
.
numpy
())
print
(
"==============="
,
model
.
pre_proj
.
conv
.
weight
.
gradient
())
model
.
clear_gradients
()
model
.
clear_gradients
()
if
local_rank
==
0
:
if
local_rank
==
0
:
...
...
parakeet/models/transformerTTS/train_transformer.py
浏览文件 @
e0aef2e0
...
@@ -34,6 +34,9 @@ def main(cfg):
...
@@ -34,6 +34,9 @@ def main(cfg):
local_rank
=
dg
.
parallel
.
Env
().
local_rank
if
cfg
.
use_data_parallel
else
0
local_rank
=
dg
.
parallel
.
Env
().
local_rank
if
cfg
.
use_data_parallel
else
0
nranks
=
dg
.
parallel
.
Env
().
nranks
if
cfg
.
use_data_parallel
else
1
nranks
=
dg
.
parallel
.
Env
().
nranks
if
cfg
.
use_data_parallel
else
1
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
if
local_rank
==
0
:
if
local_rank
==
0
:
# Print the whole config setting.
# Print the whole config setting.
pprint
(
jsonargparse
.
namespace_to_dict
(
cfg
))
pprint
(
jsonargparse
.
namespace_to_dict
(
cfg
))
...
@@ -53,7 +56,8 @@ def main(cfg):
...
@@ -53,7 +56,8 @@ def main(cfg):
model
=
TransformerTTS
(
cfg
)
model
=
TransformerTTS
(
cfg
)
model
.
train
()
model
.
train
()
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
4000
*
(
cfg
.
lr
**
2
)),
4000
))
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
),
parameter_list
=
model
.
parameters
())
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
).
reader
()
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
).
reader
()
...
@@ -69,6 +73,8 @@ def main(cfg):
...
@@ -69,6 +73,8 @@ def main(cfg):
for
epoch
in
range
(
cfg
.
epochs
):
for
epoch
in
range
(
cfg
.
epochs
):
pbar
=
tqdm
(
reader
)
pbar
=
tqdm
(
reader
)
for
i
,
data
in
enumerate
(
pbar
):
for
i
,
data
in
enumerate
(
pbar
):
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
=
data
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
=
data
...
@@ -86,7 +92,7 @@ def main(cfg):
...
@@ -86,7 +92,7 @@ def main(cfg):
post_mel_loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
postnet_pred
,
mel
)))
post_mel_loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
postnet_pred
,
mel
)))
stop_loss
=
cross_entropy
(
stop_preds
,
dg
.
to_variable
(
label
))
stop_loss
=
cross_entropy
(
stop_preds
,
dg
.
to_variable
(
label
))
loss
=
mel_loss
+
post_mel_loss
+
stop_loss
loss
=
mel_loss
+
post_mel_loss
+
stop_loss
if
local_rank
==
0
:
if
local_rank
==
0
:
writer
.
add_scalars
(
'training_loss'
,
{
writer
.
add_scalars
(
'training_loss'
,
{
'mel_loss'
:
mel_loss
.
numpy
(),
'mel_loss'
:
mel_loss
.
numpy
(),
...
@@ -116,16 +122,16 @@ def main(cfg):
...
@@ -116,16 +122,16 @@ def main(cfg):
for
j
in
range
(
4
):
for
j
in
range
(
4
):
x
=
np
.
uint8
(
cm
.
viridis
(
prob
.
numpy
()[
j
*
16
])
*
255
)
x
=
np
.
uint8
(
cm
.
viridis
(
prob
.
numpy
()[
j
*
16
])
*
255
)
writer
.
add_image
(
'Attention_dec_%d_0'
%
global_step
,
x
,
i
*
4
+
j
,
dataformats
=
"HWC"
)
writer
.
add_image
(
'Attention_dec_%d_0'
%
global_step
,
x
,
i
*
4
+
j
,
dataformats
=
"HWC"
)
if
cfg
.
use_data_parallel
:
if
cfg
.
use_data_parallel
:
loss
=
model
.
scale_loss
(
loss
)
loss
=
model
.
scale_loss
(
loss
)
loss
.
backward
()
loss
.
backward
()
model
.
apply_collective_grads
()
model
.
apply_collective_grads
()
else
:
else
:
loss
.
backward
()
loss
.
backward
()
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
1
))
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
cfg
.
grad_clip_thresh
))
model
.
clear_gradients
()
model
.
clear_gradients
()
# save checkpoint
# save checkpoint
if
local_rank
==
0
and
global_step
%
cfg
.
save_step
==
0
:
if
local_rank
==
0
and
global_step
%
cfg
.
save_step
==
0
:
if
not
os
.
path
.
exists
(
cfg
.
save_path
):
if
not
os
.
path
.
exists
(
cfg
.
save_path
):
...
...
parakeet/modules/dynamicGRU.py
浏览文件 @
e0aef2e0
...
@@ -25,6 +25,14 @@ class DynamicGRU(dg.Layer):
...
@@ -25,6 +25,14 @@ class DynamicGRU(dg.Layer):
self
.
is_reverse
=
is_reverse
self
.
is_reverse
=
is_reverse
def
forward
(
self
,
inputs
):
def
forward
(
self
,
inputs
):
"""
Dynamic GRU block.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result compute by GRU.
"""
hidden
=
self
.
h_0
hidden
=
self
.
h_0
res
=
[]
res
=
[]
for
i
in
range
(
inputs
.
shape
[
1
]):
for
i
in
range
(
inputs
.
shape
[
1
]):
...
...
parakeet/modules/feed_forward.py
浏览文件 @
e0aef2e0
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
from
parakeet.modules.layers
import
Conv1D
import
paddle.fluid
as
fluid
import
math
from
parakeet.modules.layers
import
Conv
class
PositionwiseFeedForward
(
dg
.
Layer
):
class
PositionwiseFeedForward
(
dg
.
Layer
):
''' A two-feed-forward-layer module '''
''' A two-feed-forward-layer module '''
...
@@ -9,14 +12,15 @@ class PositionwiseFeedForward(dg.Layer):
...
@@ -9,14 +12,15 @@ class PositionwiseFeedForward(dg.Layer):
self
.
num_hidden
=
num_hidden
self
.
num_hidden
=
num_hidden
self
.
use_cudnn
=
use_cudnn
self
.
use_cudnn
=
use_cudnn
self
.
dropout
=
dropout
self
.
dropout
=
dropout
self
.
w_1
=
Conv
1D
(
in_channels
=
d_in
,
self
.
w_1
=
Conv
(
in_channels
=
d_in
,
out_channels
=
num_hidden
,
out_channels
=
num_hidden
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
padding
=
padding
,
padding
=
padding
,
use_cudnn
=
use_cudnn
,
use_cudnn
=
use_cudnn
,
data_format
=
"NTC"
)
data_format
=
"NTC"
)
self
.
w_2
=
Conv1D
(
in_channels
=
num_hidden
,
self
.
w_2
=
Conv
(
in_channels
=
num_hidden
,
out_channels
=
d_in
,
out_channels
=
d_in
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
padding
=
padding
,
padding
=
padding
,
...
@@ -25,6 +29,14 @@ class PositionwiseFeedForward(dg.Layer):
...
@@ -25,6 +29,14 @@ class PositionwiseFeedForward(dg.Layer):
self
.
layer_norm
=
dg
.
LayerNorm
(
d_in
)
self
.
layer_norm
=
dg
.
LayerNorm
(
d_in
)
def
forward
(
self
,
input
):
def
forward
(
self
,
input
):
"""
Feed Forward Network.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result after FFN.
"""
#FFN Networt
#FFN Networt
x
=
self
.
w_2
(
layers
.
relu
(
self
.
w_1
(
input
)))
x
=
self
.
w_2
(
layers
.
relu
(
self
.
w_1
(
input
)))
...
@@ -35,6 +47,6 @@ class PositionwiseFeedForward(dg.Layer):
...
@@ -35,6 +47,6 @@ class PositionwiseFeedForward(dg.Layer):
x
=
x
+
input
x
=
x
+
input
#layer normalization
#layer normalization
x
=
self
.
layer_norm
(
x
)
output
=
self
.
layer_norm
(
x
)
return
x
return
output
\ No newline at end of file
\ No newline at end of file
parakeet/modules/layers.py
浏览文件 @
e0aef2e0
...
@@ -6,6 +6,42 @@ from paddle import fluid
...
@@ -6,6 +6,42 @@ from paddle import fluid
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
class
Conv
(
dg
.
Layer
):
def
__init__
(
self
,
in_channels
,
out_channels
,
filter_size
=
1
,
padding
=
0
,
dilation
=
1
,
stride
=
1
,
use_cudnn
=
True
,
data_format
=
"NCT"
,
is_bias
=
True
):
super
(
Conv
,
self
).
__init__
()
self
.
in_channels
=
in_channels
self
.
out_channels
=
out_channels
self
.
filter_size
=
filter_size
self
.
padding
=
padding
self
.
dilation
=
dilation
self
.
stride
=
stride
self
.
use_cudnn
=
use_cudnn
self
.
data_format
=
data_format
self
.
is_bias
=
is_bias
self
.
weight_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
XavierInitializer
())
self
.
bias_attr
=
None
if
is_bias
is
not
False
:
k
=
math
.
sqrt
(
1
/
in_channels
)
self
.
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Uniform
(
low
=-
k
,
high
=
k
))
self
.
conv
=
Conv1D
(
in_channels
=
in_channels
,
out_channels
=
out_channels
,
filter_size
=
filter_size
,
padding
=
padding
,
dilation
=
dilation
,
stride
=
stride
,
param_attr
=
self
.
weight_attr
,
bias_attr
=
self
.
bias_attr
,
use_cudnn
=
use_cudnn
,
data_format
=
data_format
)
def
forward
(
self
,
x
):
x
=
self
.
conv
(
x
)
return
x
class
Conv1D
(
dg
.
Layer
):
class
Conv1D
(
dg
.
Layer
):
"""
"""
A convolution 1D block implemented with Conv2D. Form simplicity and
A convolution 1D block implemented with Conv2D. Form simplicity and
...
...
parakeet/modules/multihead_attention.py
浏览文件 @
e0aef2e0
...
@@ -10,22 +10,35 @@ class ScaledDotProductAttention(dg.Layer):
...
@@ -10,22 +10,35 @@ class ScaledDotProductAttention(dg.Layer):
self
.
d_key
=
d_key
self
.
d_key
=
d_key
# please attention this mask is diff from pytorch
# please attention this mask is diff from pytorch
def
forward
(
self
,
key
,
value
,
query
,
mask
=
None
,
query_mask
=
None
):
def
forward
(
self
,
key
,
value
,
query
,
mask
=
None
,
query_mask
=
None
,
dropout
=
0.1
):
"""
Scaled Dot Product Attention.
Args:
key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
query (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
dropout (Constant): dtype: float32. The probability of dropout.
Returns:
result (Variable), Shape(B, T, C), the result of mutihead attention.
attention (Variable), Shape(n_head * B, T, C), the attention of key.
"""
# Compute attention score
# Compute attention score
attention
=
layers
.
matmul
(
query
,
key
,
transpose_y
=
True
)
#transpose the last dim in y
attention
=
layers
.
matmul
(
query
,
key
,
transpose_y
=
True
)
#transpose the last dim in y
attention
=
attention
/
math
.
sqrt
(
self
.
d_key
)
attention
=
attention
/
math
.
sqrt
(
self
.
d_key
)
# Mask key to ignore padding
# Mask key to ignore padding
if
mask
is
not
None
:
if
mask
is
not
None
:
attention
=
attention
*
(
mask
==
0
).
astype
(
np
.
float32
)
attention
=
attention
*
mask
mask
=
mask
*
(
-
2
**
32
+
1
)
mask
=
(
mask
==
0
).
astype
(
np
.
float32
)
*
(
-
2
**
32
+
1
)
attention
=
attention
+
mask
attention
=
attention
+
mask
attention
=
layers
.
softmax
(
attention
)
attention
=
layers
.
softmax
(
attention
)
attention
=
layers
.
dropout
(
attention
,
0.0
)
attention
=
layers
.
dropout
(
attention
,
dropout
)
# Mask query to ignore padding
# Mask query to ignore padding
# Not sure how to work
if
query_mask
is
not
None
:
if
query_mask
is
not
None
:
attention
=
attention
*
query_mask
attention
=
attention
*
query_mask
...
@@ -52,6 +65,19 @@ class MultiheadAttention(dg.Layer):
...
@@ -52,6 +65,19 @@ class MultiheadAttention(dg.Layer):
self
.
layer_norm
=
dg
.
LayerNorm
(
num_hidden
)
self
.
layer_norm
=
dg
.
LayerNorm
(
num_hidden
)
def
forward
(
self
,
key
,
value
,
query_input
,
mask
=
None
,
query_mask
=
None
):
def
forward
(
self
,
key
,
value
,
query_input
,
mask
=
None
,
query_mask
=
None
):
"""
Multihead Attention.
Args:
key (Variable): Shape(B, T, C), dtype: float32. The input key of attention.
value (Variable): Shape(B, T, C), dtype: float32. The input value of attention.
query_input (Variable): Shape(B, T, C), dtype: float32. The input query of attention.
mask (Variable): Shape(B, len_q, len_k), dtype: float32. The mask of key.
query_mask (Variable): Shape(B, len_q, 1), dtype: float32. The mask of query.
Returns:
result (Variable), Shape(B, T, C), the result of mutihead attention.
attention (Variable), Shape(n_head * B, T, C), the attention of key.
"""
batch_size
=
key
.
shape
[
0
]
batch_size
=
key
.
shape
[
0
]
seq_len_key
=
key
.
shape
[
1
]
seq_len_key
=
key
.
shape
[
1
]
seq_len_query
=
query_input
.
shape
[
1
]
seq_len_query
=
query_input
.
shape
[
1
]
...
@@ -62,6 +88,7 @@ class MultiheadAttention(dg.Layer):
...
@@ -62,6 +88,7 @@ class MultiheadAttention(dg.Layer):
if
mask
is
not
None
:
if
mask
is
not
None
:
mask
=
layers
.
expand
(
mask
,
(
self
.
num_head
,
1
,
1
))
mask
=
layers
.
expand
(
mask
,
(
self
.
num_head
,
1
,
1
))
# Make multihead attention
# Make multihead attention
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
key
=
layers
.
reshape
(
self
.
key
(
key
),
[
batch_size
,
seq_len_key
,
self
.
num_head
,
self
.
d_k
])
key
=
layers
.
reshape
(
self
.
key
(
key
),
[
batch_size
,
seq_len_key
,
self
.
num_head
,
self
.
d_k
])
...
@@ -71,6 +98,7 @@ class MultiheadAttention(dg.Layer):
...
@@ -71,6 +98,7 @@ class MultiheadAttention(dg.Layer):
key
=
layers
.
reshape
(
layers
.
transpose
(
key
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_key
,
self
.
d_k
])
key
=
layers
.
reshape
(
layers
.
transpose
(
key
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_key
,
self
.
d_k
])
value
=
layers
.
reshape
(
layers
.
transpose
(
value
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_key
,
self
.
d_k
])
value
=
layers
.
reshape
(
layers
.
transpose
(
value
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_key
,
self
.
d_k
])
query
=
layers
.
reshape
(
layers
.
transpose
(
query
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_query
,
self
.
d_q
])
query
=
layers
.
reshape
(
layers
.
transpose
(
query
,
[
2
,
0
,
1
,
3
]),
[
-
1
,
seq_len_query
,
self
.
d_q
])
result
,
attention
=
self
.
scal_attn
(
key
,
value
,
query
,
mask
=
mask
,
query_mask
=
query_mask
)
result
,
attention
=
self
.
scal_attn
(
key
,
value
,
query
,
mask
=
mask
,
query_mask
=
query_mask
)
# concat all multihead result
# concat all multihead result
...
...
parakeet/modules/post_convnet.py
浏览文件 @
e0aef2e0
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
from
parakeet.modules.layers
import
Conv
1D
from
parakeet.modules.layers
import
Conv
class
PostConvNet
(
dg
.
Layer
):
class
PostConvNet
(
dg
.
Layer
):
def
__init__
(
self
,
def
__init__
(
self
,
...
@@ -17,7 +17,7 @@ class PostConvNet(dg.Layer):
...
@@ -17,7 +17,7 @@ class PostConvNet(dg.Layer):
self
.
dropout
=
dropout
self
.
dropout
=
dropout
self
.
conv_list
=
[]
self
.
conv_list
=
[]
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
n_mels
*
outputs_per_step
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
n_mels
*
outputs_per_step
,
out_channels
=
num_hidden
,
out_channels
=
num_hidden
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
padding
=
padding
,
padding
=
padding
,
...
@@ -25,14 +25,14 @@ class PostConvNet(dg.Layer):
...
@@ -25,14 +25,14 @@ class PostConvNet(dg.Layer):
data_format
=
"NCT"
))
data_format
=
"NCT"
))
for
_
in
range
(
1
,
num_conv
-
1
):
for
_
in
range
(
1
,
num_conv
-
1
):
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
num_hidden
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
num_hidden
,
out_channels
=
num_hidden
,
out_channels
=
num_hidden
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
padding
=
padding
,
padding
=
padding
,
use_cudnn
=
use_cudnn
,
use_cudnn
=
use_cudnn
,
data_format
=
"NCT"
)
)
data_format
=
"NCT"
)
)
self
.
conv_list
.
append
(
Conv
1D
(
in_channels
=
num_hidden
,
self
.
conv_list
.
append
(
Conv
(
in_channels
=
num_hidden
,
out_channels
=
n_mels
*
outputs_per_step
,
out_channels
=
n_mels
*
outputs_per_step
,
filter_size
=
filter_size
,
filter_size
=
filter_size
,
padding
=
padding
,
padding
=
padding
,
...
@@ -59,9 +59,17 @@ class PostConvNet(dg.Layer):
...
@@ -59,9 +59,17 @@ class PostConvNet(dg.Layer):
def
forward
(
self
,
input
):
def
forward
(
self
,
input
):
"""
Post Conv Net.
Args:
input (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
output (Variable), Shape(B, T, C), the result after postconvnet.
"""
input
=
layers
.
transpose
(
input
,
[
0
,
2
,
1
])
input
=
layers
.
transpose
(
input
,
[
0
,
2
,
1
])
len
=
input
.
shape
[
-
1
]
len
=
input
.
shape
[
-
1
]
for
batch_norm
,
conv
in
zip
(
self
.
batch_norm_list
,
self
.
conv_list
):
for
batch_norm
,
conv
in
zip
(
self
.
batch_norm_list
,
self
.
conv_list
):
input
=
layers
.
dropout
(
layers
.
tanh
(
batch_norm
(
conv
(
input
)[:,:,:
len
])),
self
.
dropout
)
input
=
layers
.
dropout
(
layers
.
tanh
(
batch_norm
(
conv
(
input
)[:,:,:
len
])),
self
.
dropout
)
input
=
layers
.
transpose
(
input
,
[
0
,
2
,
1
])
output
=
layers
.
transpose
(
input
,
[
0
,
2
,
1
])
return
input
return
output
\ No newline at end of file
\ No newline at end of file
parakeet/modules/prenet.py
浏览文件 @
e0aef2e0
...
@@ -2,9 +2,6 @@ import paddle.fluid.dygraph as dg
...
@@ -2,9 +2,6 @@ import paddle.fluid.dygraph as dg
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
class
PreNet
(
dg
.
Layer
):
class
PreNet
(
dg
.
Layer
):
"""
Pre Net before passing through the network
"""
def
__init__
(
self
,
input_size
,
hidden_size
,
output_size
,
dropout_rate
=
0.2
):
def
__init__
(
self
,
input_size
,
hidden_size
,
output_size
,
dropout_rate
=
0.2
):
"""
"""
:param input_size: dimension of input
:param input_size: dimension of input
...
@@ -21,6 +18,14 @@ class PreNet(dg.Layer):
...
@@ -21,6 +18,14 @@ class PreNet(dg.Layer):
self
.
linear2
=
dg
.
Linear
(
hidden_size
,
output_size
)
self
.
linear2
=
dg
.
Linear
(
hidden_size
,
output_size
)
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
"""
Pre Net before passing through the network.
Args:
x (Variable): Shape(B, T, C), dtype: float32. The input value.
Returns:
x (Variable), Shape(B, T, C), the result after pernet.
"""
x
=
layers
.
dropout
(
layers
.
relu
(
self
.
linear1
(
x
)),
self
.
dropout_rate
)
x
=
layers
.
dropout
(
layers
.
relu
(
self
.
linear1
(
x
)),
self
.
dropout_rate
)
x
=
layers
.
dropout
(
layers
.
relu
(
self
.
linear2
(
x
)),
self
.
dropout_rate
)
x
=
layers
.
dropout
(
layers
.
relu
(
self
.
linear2
(
x
)),
self
.
dropout_rate
)
return
x
return
x
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录