Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
abc2b537
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
10
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
abc2b537
编写于
1月 22, 2020
作者:
L
lifuchen
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'add_TranTTS' into 'master'
right fastspeech version. See merge request !5
上级
6e7566fa
47a618ce
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
224 addition
and
374 deletion
+224
-374
parakeet/audio/audio.py
parakeet/audio/audio.py
+1
-1
parakeet/models/dataloader/jlspeech.py
parakeet/models/dataloader/jlspeech.py
+0
-148
parakeet/models/fastspeech/config/fastspeech.yaml
parakeet/models/fastspeech/config/fastspeech.yaml
+5
-2
parakeet/models/fastspeech/config/synthesis.yaml
parakeet/models/fastspeech/config/synthesis.yaml
+33
-0
parakeet/models/fastspeech/modules.py
parakeet/models/fastspeech/modules.py
+2
-1
parakeet/models/fastspeech/network.py
parakeet/models/fastspeech/network.py
+3
-4
parakeet/models/fastspeech/parse.py
parakeet/models/fastspeech/parse.py
+5
-0
parakeet/models/fastspeech/synthesis.py
parakeet/models/fastspeech/synthesis.py
+76
-0
parakeet/models/fastspeech/train.py
parakeet/models/fastspeech/train.py
+21
-27
parakeet/models/fastspeech/utils.py
parakeet/models/fastspeech/utils.py
+7
-5
parakeet/models/transformerTTS/config/train_transformer.yaml
parakeet/models/transformerTTS/config/train_transformer.yaml
+2
-1
parakeet/models/transformerTTS/module.py
parakeet/models/transformerTTS/module.py
+1
-0
parakeet/models/transformerTTS/parse.py
parakeet/models/transformerTTS/parse.py
+5
-3
parakeet/models/transformerTTS/preprocess.py
parakeet/models/transformerTTS/preprocess.py
+0
-123
parakeet/models/transformerTTS/synthesis.py
parakeet/models/transformerTTS/synthesis.py
+33
-5
parakeet/models/transformerTTS/train_postnet.py
parakeet/models/transformerTTS/train_postnet.py
+11
-21
parakeet/models/transformerTTS/train_transformer.py
parakeet/models/transformerTTS/train_transformer.py
+19
-33
未找到文件。
parakeet/audio/audio.py
浏览文件 @
abc2b537
...
...
@@ -209,7 +209,7 @@ class AudioProcessor(object):
def
inv_melspectrogram
(
self
,
mel_spectrogram
):
S
=
self
.
_denormalize
(
mel_spectrogram
)
S
=
self
.
_db_to_amplitude
(
S
+
self
.
ref_level_db
)
S
=
self
.
_
linear_to_mel
(
np
.
abs
(
S
))
S
=
self
.
_
mel_to_linear
(
np
.
abs
(
S
))
if
self
.
preemphasis
:
return
self
.
apply_inv_preemphasis
(
self
.
_griffin_lim
(
S
**
self
.
power
))
return
self
.
_griffin_lim
(
S
**
self
.
power
)
...
...
parakeet/models/dataloader/jlspeech.py
已删除
100644 → 0
浏览文件 @
6e7566fa
from
pathlib
import
Path
import
numpy
as
np
import
pandas
as
pd
import
librosa
from
paddle
import
fluid
from
parakeet
import
g2p
from
parakeet
import
audio
from
parakeet.data.sampler
import
*
from
parakeet.data.datacargo
import
DataCargo
from
parakeet.data.dataset
import
Dataset
from
parakeet.data.batch
import
TextIDBatcher
,
SpecBatcher
class
LJSpeechLoader
:
def
__init__
(
self
,
config
,
nranks
,
rank
,
is_vocoder
=
False
,
shuffle
=
True
):
place
=
fluid
.
CUDAPlace
(
rank
)
if
config
.
use_gpu
else
fluid
.
CPUPlace
()
LJSPEECH_ROOT
=
Path
(
config
.
data_path
)
dataset
=
LJSpeech
(
LJSPEECH_ROOT
,
config
)
sampler
=
DistributedSampler
(
len
(
dataset
),
nranks
,
rank
,
shuffle
=
shuffle
)
assert
config
.
batch_size
%
nranks
==
0
each_bs
=
config
.
batch_size
//
nranks
if
is_vocoder
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
shuffle
,
collate_fn
=
batch_examples_vocoder
,
drop_last
=
True
)
else
:
dataloader
=
DataCargo
(
dataset
,
sampler
=
sampler
,
batch_size
=
each_bs
,
shuffle
=
shuffle
,
collate_fn
=
batch_examples
,
drop_last
=
True
)
self
.
reader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
32
,
iterable
=
True
,
use_double_buffer
=
True
,
return_list
=
True
)
self
.
reader
.
set_batch_generator
(
dataloader
,
place
)
class
LJSpeech
(
Dataset
):
def
__init__
(
self
,
root
,
config
):
super
(
LJSpeech
,
self
).
__init__
()
assert
isinstance
(
root
,
(
str
,
Path
)),
"root should be a string or Path object"
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
metadata
=
self
.
_prepare_metadata
()
self
.
config
=
config
self
.
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
config
.
audio
.
sr
,
num_mels
=
config
.
audio
.
num_mels
,
min_level_db
=
config
.
audio
.
min_level_db
,
ref_level_db
=
config
.
audio
.
ref_level_db
,
n_fft
=
config
.
audio
.
n_fft
,
win_length
=
config
.
audio
.
win_length
,
hop_length
=
config
.
audio
.
hop_length
,
power
=
config
.
audio
.
power
,
preemphasis
=
config
.
audio
.
preemphasis
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
def
_prepare_metadata
(
self
):
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
metadata
=
pd
.
read_csv
(
csv_path
,
sep
=
"|"
,
header
=
None
,
quoting
=
3
,
names
=
[
"fname"
,
"raw_text"
,
"normalized_text"
])
return
metadata
def
_get_example
(
self
,
metadatum
):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname
,
raw_text
,
normalized_text
=
metadatum
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav
=
self
.
_ljspeech_processor
.
load_wav
(
str
(
wav_path
))
mag
=
self
.
_ljspeech_processor
.
spectrogram
(
wav
).
astype
(
np
.
float32
)
mel
=
self
.
_ljspeech_processor
.
melspectrogram
(
wav
).
astype
(
np
.
float32
)
phonemes
=
np
.
array
(
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
def
__getitem__
(
self
,
index
):
metadatum
=
self
.
metadata
.
iloc
[
index
]
example
=
self
.
_get_example
(
metadatum
)
return
example
def
__iter__
(
self
):
for
i
in
range
(
len
(
self
)):
yield
self
[
i
]
def
__len__
(
self
):
return
len
(
self
.
metadata
)
def
batch_examples
(
batch
):
texts
=
[]
mels
=
[]
mel_inputs
=
[]
text_lens
=
[]
pos_texts
=
[]
pos_mels
=
[]
for
data
in
batch
:
_
,
mel
,
text
=
data
mel_inputs
.
append
(
np
.
concatenate
([
np
.
zeros
([
mel
.
shape
[
0
],
1
],
np
.
float32
),
mel
[:,:
-
1
]],
axis
=-
1
))
text_lens
.
append
(
len
(
text
))
pos_texts
.
append
(
np
.
arange
(
1
,
len
(
text
)
+
1
))
pos_mels
.
append
(
np
.
arange
(
1
,
mel
.
shape
[
1
]
+
1
))
mels
.
append
(
mel
)
texts
.
append
(
text
)
# Sort by text_len in descending order
texts
=
[
i
for
i
,
_
in
sorted
(
zip
(
texts
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
mels
=
[
i
for
i
,
_
in
sorted
(
zip
(
mels
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
mel_inputs
=
[
i
for
i
,
_
in
sorted
(
zip
(
mel_inputs
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
pos_texts
=
[
i
for
i
,
_
in
sorted
(
zip
(
pos_texts
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
pos_mels
=
[
i
for
i
,
_
in
sorted
(
zip
(
pos_mels
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
text_lens
=
sorted
(
text_lens
,
reverse
=
True
)
# Pad sequence with largest len of the batch
texts
=
TextIDBatcher
(
pad_id
=
0
)(
texts
)
#(B, T)
pos_texts
=
TextIDBatcher
(
pad_id
=
0
)(
pos_texts
)
#(B,T)
pos_mels
=
TextIDBatcher
(
pad_id
=
0
)(
pos_mels
)
#(B,T)
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
#(B,T,num_mels)
mel_inputs
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mel_inputs
),
axes
=
(
0
,
2
,
1
))
#(B,T,num_mels)
return
(
texts
,
mels
,
mel_inputs
,
pos_texts
,
pos_mels
,
np
.
array
(
text_lens
))
def
batch_examples_vocoder
(
batch
):
mels
=
[]
mags
=
[]
for
data
in
batch
:
mag
,
mel
,
_
=
data
mels
.
append
(
mel
)
mags
.
append
(
mag
)
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
mags
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mags
),
axes
=
(
0
,
2
,
1
))
return
(
mels
,
mags
)
parakeet/models/fastspeech/config/fastspeech.yaml
浏览文件 @
abc2b537
...
...
@@ -39,5 +39,8 @@ use_data_parallel: False
data_path
:
../../../dataset/LJSpeech-1.1
transtts_path
:
../transformerTTS/checkpoint/
transformer_step
:
10
log_dir
:
./log
\ No newline at end of file
transformer_step
:
200000
save_path
:
./checkpoint
log_dir
:
./log
#checkpoint_path: ./checkpoint
#ransformer_step: 97000
\ No newline at end of file
parakeet/models/fastspeech/config/synthesis.yaml
0 → 100644
浏览文件 @
abc2b537
audio
:
num_mels
:
80
n_fft
:
2048
sr
:
22050
preemphasis
:
0.97
hop_length
:
275
win_length
:
1102
power
:
1.2
min_level_db
:
-100
ref_level_db
:
20
outputs_per_step
:
1
encoder_n_layer
:
6
encoder_head
:
2
encoder_conv1d_filter_size
:
1536
max_sep_len
:
2048
decoder_n_layer
:
6
decoder_head
:
2
decoder_conv1d_filter_size
:
1536
fs_hidden_size
:
384
duration_predictor_output_size
:
256
duration_predictor_filter_size
:
3
fft_conv1d_filter
:
3
fft_conv1d_padding
:
1
dropout
:
0.1
transformer_head
:
4
use_gpu
:
True
alpha
:
1.0
checkpoint_path
:
checkpoint/
fastspeech_step
:
71000
log_dir
:
./log
\ No newline at end of file
parakeet/models/fastspeech/modules.py
浏览文件 @
abc2b537
...
...
@@ -102,7 +102,8 @@ class LengthRegulator(dg.Layer):
else
:
duration_predictor_output
=
layers
.
round
(
duration_predictor_output
)
output
=
self
.
LR
(
x
,
duration_predictor_output
,
alpha
)
mel_pos
=
dg
.
to_variable
([
i
+
1
for
i
in
range
(
output
.
shape
[
1
])])
mel_pos
=
dg
.
to_variable
(
np
.
arange
(
1
,
output
.
shape
[
1
]
+
1
))
mel_pos
=
layers
.
unsqueeze
(
mel_pos
,
[
0
])
return
output
,
mel_pos
class
DurationPredictor
(
dg
.
Layer
):
...
...
parakeet/models/fastspeech/network.py
浏览文件 @
abc2b537
from
utils
import
*
from
modules
import
FFTBlock
,
LengthRegulator
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid
as
fluid
from
parakeet.g2p.text.symbols
import
symbols
from
parakeet.modules.utils
import
*
from
parakeet.modules.post_convnet
import
PostConvNet
from
parakeet.modules.layers
import
Linear
from
utils
import
*
from
modules
import
FFTBlock
,
LengthRegulator
class
Encoder
(
dg
.
Layer
):
def
__init__
(
self
,
...
...
@@ -203,8 +203,7 @@ class FastSpeech(dg.Layer):
return
mel_output
,
mel_output_postnet
,
duration_predictor_output
,
enc_slf_attn_list
,
dec_slf_attn_list
else
:
length_regulator_output
,
decoder_pos
=
self
.
length_regulator
(
encoder_output
,
alpha
=
alpha
)
decoder_output
=
self
.
decoder
(
length_regulator_output
,
decoder_pos
)
decoder_output
,
_
=
self
.
decoder
(
length_regulator_output
,
decoder_pos
)
mel_output
=
self
.
mel_linear
(
decoder_output
)
mel_output_postnet
=
self
.
postnet
(
mel_output
)
+
mel_output
...
...
parakeet/models/fastspeech/parse.py
浏览文件 @
abc2b537
...
...
@@ -50,6 +50,9 @@ def add_config_options_to_parser(parser):
help
=
"the dropout in network."
)
parser
.
add_argument
(
'--transformer_head'
,
type
=
int
,
default
=
4
,
help
=
"the attention head num of transformerTTS."
)
parser
.
add_argument
(
'--alpha'
,
type
=
float
,
default
=
1.0
,
help
=
"the hyperparameter to determine the length of the expanded sequence
\
mel, thereby controlling the voice speed."
)
parser
.
add_argument
(
'--hidden_size'
,
type
=
int
,
default
=
256
,
help
=
"the hidden size in model of transformerTTS."
)
...
...
@@ -68,6 +71,8 @@ def add_config_options_to_parser(parser):
help
=
"the learning rate for training."
)
parser
.
add_argument
(
'--save_step'
,
type
=
int
,
default
=
500
,
help
=
"checkpointing interval during training."
)
parser
.
add_argument
(
'--fastspeech_step'
,
type
=
int
,
default
=
160000
,
help
=
"Global step to restore checkpoint of fastspeech."
)
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
help
=
"use gpu or not during training."
)
parser
.
add_argument
(
'--use_data_parallel'
,
type
=
bool
,
default
=
False
,
...
...
parakeet/models/fastspeech/synthesis.py
0 → 100644
浏览文件 @
abc2b537
import
os
from
tensorboardX
import
SummaryWriter
from
collections
import
OrderedDict
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
import
numpy
as
np
import
paddle.fluid
as
fluid
import
paddle.fluid.dygraph
as
dg
from
parakeet.g2p.en
import
text_to_sequence
from
parakeet
import
audio
from
network
import
FastSpeech
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
new_state_dict
=
OrderedDict
()
for
param
in
model_dict
:
if
param
.
startswith
(
'_layers.'
):
new_state_dict
[
param
[
8
:]]
=
model_dict
[
param
]
else
:
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
def
synthesis
(
text_input
,
cfg
):
place
=
(
fluid
.
CUDAPlace
(
0
)
if
cfg
.
use_gpu
else
fluid
.
CPUPlace
())
# tensorboard
if
not
os
.
path
.
exists
(
cfg
.
log_dir
):
os
.
mkdir
(
cfg
.
log_dir
)
path
=
os
.
path
.
join
(
cfg
.
log_dir
,
'synthesis'
)
writer
=
SummaryWriter
(
path
)
with
dg
.
guard
(
place
):
model
=
FastSpeech
(
cfg
)
model
.
set_dict
(
load_checkpoint
(
str
(
cfg
.
fastspeech_step
),
os
.
path
.
join
(
cfg
.
checkpoint_path
,
"fastspeech"
)))
model
.
eval
()
text
=
np
.
asarray
(
text_to_sequence
(
text_input
))
text
=
fluid
.
layers
.
unsqueeze
(
dg
.
to_variable
(
text
),[
0
])
pos_text
=
np
.
arange
(
1
,
text
.
shape
[
1
]
+
1
)
pos_text
=
fluid
.
layers
.
unsqueeze
(
dg
.
to_variable
(
pos_text
),[
0
])
mel_output
,
mel_output_postnet
=
model
(
text
,
pos_text
,
alpha
=
cfg
.
alpha
)
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
cfg
.
audio
.
sr
,
num_mels
=
cfg
.
audio
.
num_mels
,
min_level_db
=
cfg
.
audio
.
min_level_db
,
ref_level_db
=
cfg
.
audio
.
ref_level_db
,
n_fft
=
cfg
.
audio
.
n_fft
,
win_length
=
cfg
.
audio
.
win_length
,
hop_length
=
cfg
.
audio
.
hop_length
,
power
=
cfg
.
audio
.
power
,
preemphasis
=
cfg
.
audio
.
preemphasis
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
mel_output_postnet
=
fluid
.
layers
.
transpose
(
fluid
.
layers
.
squeeze
(
mel_output_postnet
,[
0
]),
[
1
,
0
])
wav
=
_ljspeech_processor
.
inv_melspectrogram
(
mel_output_postnet
.
numpy
())
writer
.
add_audio
(
text_input
,
wav
,
0
,
cfg
.
audio
.
sr
)
print
(
"Synthesis completed !!!"
)
writer
.
close
()
if
__name__
==
'__main__'
:
parser
=
jsonargparse
.
ArgumentParser
(
description
=
"Synthesis model"
,
formatter_class
=
'default_argparse'
)
add_config_options_to_parser
(
parser
)
cfg
=
parser
.
parse_args
(
'-c ./config/synthesis.yaml'
.
split
())
synthesis
(
"Transformer model is so fast!"
,
cfg
)
\ No newline at end of file
parakeet/models/fastspeech/train.py
浏览文件 @
abc2b537
...
...
@@ -5,34 +5,28 @@ import time
import
math
import
jsonargparse
from
pathlib
import
Path
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
tqdm
import
tqdm
from
collections
import
OrderedDict
from
tensorboardX
import
SummaryWriter
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
import
paddle.fluid
as
fluid
from
par
se
import
add_config_options_to_pars
er
from
p
print
import
pprint
from
par
akeet.models.dataloader.ljspeech
import
LJSpeechLoad
er
from
p
arakeet.models.transformerTTS.network
import
TransformerTTS
from
network
import
FastSpeech
from
utils
import
get_alignment
from
parakeet.models.dataloader.jlspeech
import
LJSpeechLoader
from
parakeet.models.transformerTTS.network
import
TransformerTTS
class
MyDataParallel
(
dg
.
parallel
.
DataParallel
):
"""
A data parallel proxy for model.
"""
def
__init__
(
self
,
layers
,
strategy
):
super
(
MyDataParallel
,
self
).
__init__
(
layers
,
strategy
)
def
__getattr__
(
self
,
key
):
if
key
in
self
.
__dict__
:
return
object
.
__getattribute__
(
self
,
key
)
elif
key
is
"_layers"
:
return
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
]
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
new_state_dict
=
OrderedDict
()
for
param
in
model_dict
:
if
param
.
startswith
(
'_layers.'
):
new_state_dict
[
param
[
8
:]]
=
model_dict
[
param
]
else
:
return
getattr
(
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
],
key
)
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
,
opti_dict
def
main
(
cfg
):
...
...
@@ -57,8 +51,7 @@ def main(cfg):
with
dg
.
guard
(
place
):
with
fluid
.
unique_name
.
guard
():
transformerTTS
=
TransformerTTS
(
cfg
)
model_path
=
os
.
path
.
join
(
cfg
.
transtts_path
,
"transformer"
)
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
str
(
cfg
.
transformer_step
)))
model_dict
,
_
=
load_checkpoint
(
str
(
cfg
.
transformer_step
),
os
.
path
.
join
(
cfg
.
transtts_path
,
"transformer"
))
transformerTTS
.
set_dict
(
model_dict
)
transformerTTS
.
eval
()
...
...
@@ -67,27 +60,29 @@ def main(cfg):
model
.
train
()
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
),
parameter_list
=
model
.
parameters
())
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
).
reader
()
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
,
shuffle
=
True
).
reader
()
if
cfg
.
checkpoint_path
is
not
None
:
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
cfg
.
checkpoint_path
)
model_dict
,
opti_dict
=
load_checkpoint
(
str
(
cfg
.
fastspeech_step
),
os
.
path
.
join
(
cfg
.
checkpoint_path
,
"fastspeech"
)
)
model
.
set_dict
(
model_dict
)
optimizer
.
set_dict
(
opti_dict
)
global_step
=
cfg
.
fastspeech_step
print
(
"load checkpoint!!!"
)
if
cfg
.
use_data_parallel
:
strategy
=
dg
.
parallel
.
prepare_context
()
model
=
My
DataParallel
(
model
,
strategy
)
model
=
fluid
.
dygraph
.
parallel
.
DataParallel
(
model
,
strategy
)
for
epoch
in
range
(
cfg
.
epochs
):
pbar
=
tqdm
(
reader
)
for
i
,
data
in
enumerate
(
pbar
):
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
=
data
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
,
mel_lens
=
data
_
,
_
,
attn_probs
,
_
,
_
,
_
=
transformerTTS
(
character
,
mel_input
,
pos_text
,
pos_mel
)
alignment
=
dg
.
to_variable
(
get_alignment
(
attn_probs
,
cfg
.
transformer_head
)).
astype
(
np
.
float32
)
alignment
=
dg
.
to_variable
(
get_alignment
(
attn_probs
,
mel_lens
,
cfg
.
transformer_head
)).
astype
(
np
.
float32
)
global_step
+=
1
#Forward
...
...
@@ -102,7 +97,6 @@ def main(cfg):
total_loss
=
mel_loss
+
mel_postnet_loss
+
duration_loss
if
local_rank
==
0
:
#print('epoch:{}, step:{}, mel_loss:{}, mel_postnet_loss:{}, duration_loss:{}'.format(epoch, global_step, mel_loss.numpy(), mel_postnet_loss.numpy(), duration_loss.numpy()))
writer
.
add_scalar
(
'mel_loss'
,
mel_loss
.
numpy
(),
global_step
)
writer
.
add_scalar
(
'post_mel_loss'
,
mel_postnet_loss
.
numpy
(),
global_step
)
writer
.
add_scalar
(
'duration_loss'
,
duration_loss
.
numpy
(),
global_step
)
...
...
parakeet/models/fastspeech/utils.py
浏览文件 @
abc2b537
import
numpy
as
np
def
get_alignment
(
attn_probs
,
n_head
):
def
get_alignment
(
attn_probs
,
mel_lens
,
n_head
):
max_F
=
0
assert
attn_probs
[
0
].
shape
[
0
]
%
n_head
==
0
batch_size
=
int
(
attn_probs
[
0
].
shape
[
0
]
//
n_head
)
#max_attn = attn_probs[0].numpy()[0,batch_size]
for
i
in
range
(
len
(
attn_probs
)):
multi_attn
=
attn_probs
[
i
].
numpy
()
for
j
in
range
(
n_head
):
...
...
@@ -12,7 +13,7 @@ def get_alignment(attn_probs, n_head):
if
max_F
<
F
:
max_F
=
F
max_attn
=
attn
alignment
=
compute_duration
(
max_attn
)
alignment
=
compute_duration
(
max_attn
,
mel_lens
)
return
alignment
def
score_F
(
attn
):
...
...
@@ -20,11 +21,12 @@ def score_F(attn):
mean
=
np
.
mean
(
max
)
return
mean
def
compute_duration
(
attn
):
def
compute_duration
(
attn
,
mel_lens
):
alignment
=
np
.
zeros
([
attn
.
shape
[
0
],
attn
.
shape
[
2
]])
mel_lens
=
mel_lens
.
numpy
()
for
i
in
range
(
attn
.
shape
[
0
]):
for
j
in
range
(
attn
.
shape
[
1
]):
max_index
=
attn
[
i
,
j
].
tolist
().
index
(
attn
[
i
,
j
].
max
()
)
for
j
in
range
(
mel_lens
[
i
]):
max_index
=
np
.
argmax
(
attn
[
i
,
j
]
)
alignment
[
i
,
max_index
]
+=
1
return
alignment
...
...
parakeet/models/transformerTTS/config/train_transformer.yaml
浏览文件 @
abc2b537
...
...
@@ -24,11 +24,12 @@ save_step: 1000
image_step
:
2000
use_gpu
:
True
use_data_parallel
:
False
stop_token
:
False
data_path
:
../../../dataset/LJSpeech-1.1
save_path
:
./checkpoint
log_dir
:
./log
#checkpoint_path: ./checkpoint
#
transformer_step: 70
000
#
ransformer_step: 97
000
\ No newline at end of file
parakeet/models/transformerTTS/module.py
浏览文件 @
abc2b537
...
...
@@ -49,6 +49,7 @@ class EncoderPrenet(dg.Layer):
x
=
layers
.
dropout
(
layers
.
relu
(
batch_norm
(
conv
(
x
))),
0.2
)
x
=
layers
.
transpose
(
x
,[
0
,
2
,
1
])
#(N,T,C)
x
=
self
.
projection
(
x
)
return
x
class
CBHG
(
dg
.
Layer
):
...
...
parakeet/models/transformerTTS/parse.py
浏览文件 @
abc2b537
...
...
@@ -44,13 +44,15 @@ def add_config_options_to_parser(parser):
parser
.
add_argument
(
'--max_len'
,
type
=
int
,
default
=
400
,
help
=
"The max length of audio when synthsis."
)
parser
.
add_argument
(
'--transformer_step'
,
type
=
int
,
default
=
160000
,
help
=
"Global step to restore checkpoint of transformer
in synthesis
."
)
parser
.
add_argument
(
'--postnet_step'
,
type
=
int
,
default
=
10
0000
,
help
=
"Global step to restore checkpoint of postnet
in synthesis
."
)
help
=
"Global step to restore checkpoint of transformer."
)
parser
.
add_argument
(
'--postnet_step'
,
type
=
int
,
default
=
9
0000
,
help
=
"Global step to restore checkpoint of postnet."
)
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
help
=
"use gpu or not during training."
)
parser
.
add_argument
(
'--use_data_parallel'
,
type
=
bool
,
default
=
False
,
help
=
"use data parallel or not during training."
)
parser
.
add_argument
(
'--stop_token'
,
type
=
bool
,
default
=
False
,
help
=
"use stop token loss in network or not."
)
parser
.
add_argument
(
'--data_path'
,
type
=
str
,
default
=
'./dataset/LJSpeech-1.1'
,
help
=
"the path of dataset."
)
...
...
parakeet/models/transformerTTS/preprocess.py
已删除
100644 → 0
浏览文件 @
6e7566fa
from
pathlib
import
Path
import
numpy
as
np
import
pandas
as
pd
import
librosa
from
parakeet
import
g2p
from
parakeet
import
audio
from
parakeet.data.sampler
import
SequentialSampler
,
RandomSampler
,
BatchSampler
from
parakeet.data.dataset
import
Dataset
from
parakeet.data.datacargo
import
DataCargo
from
parakeet.data.batch
import
TextIDBatcher
,
SpecBatcher
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
22050
,
num_mels
=
80
,
min_level_db
=-
100
,
ref_level_db
=
20
,
n_fft
=
2048
,
win_length
=
int
(
22050
*
0.05
),
hop_length
=
int
(
22050
*
0.0125
),
power
=
1.2
,
preemphasis
=
0.97
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
class
LJSpeech
(
Dataset
):
def
__init__
(
self
,
root
):
super
(
LJSpeech
,
self
).
__init__
()
assert
isinstance
(
root
,
(
str
,
Path
)),
"root should be a string or Path object"
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
metadata
=
self
.
_prepare_metadata
()
def
_prepare_metadata
(
self
):
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
metadata
=
pd
.
read_csv
(
csv_path
,
sep
=
"|"
,
header
=
None
,
quoting
=
3
,
names
=
[
"fname"
,
"raw_text"
,
"normalized_text"
])
return
metadata
def
_get_example
(
self
,
metadatum
):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname
,
raw_text
,
normalized_text
=
metadatum
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav
=
_ljspeech_processor
.
load_wav
(
str
(
wav_path
))
mag
=
_ljspeech_processor
.
spectrogram
(
wav
).
astype
(
np
.
float32
)
mel
=
_ljspeech_processor
.
melspectrogram
(
wav
).
astype
(
np
.
float32
)
phonemes
=
np
.
array
(
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
def
__getitem__
(
self
,
index
):
metadatum
=
self
.
metadata
.
iloc
[
index
]
example
=
self
.
_get_example
(
metadatum
)
return
example
def
__iter__
(
self
):
for
i
in
range
(
len
(
self
)):
yield
self
[
i
]
def
__len__
(
self
):
return
len
(
self
.
metadata
)
def
batch_examples
(
batch
):
texts
=
[]
mels
=
[]
mel_inputs
=
[]
text_lens
=
[]
pos_texts
=
[]
pos_mels
=
[]
for
data
in
batch
:
_
,
mel
,
text
=
data
mel_inputs
.
append
(
np
.
concatenate
([
np
.
zeros
([
mel
.
shape
[
0
],
1
],
np
.
float32
),
mel
[:,:
-
1
]],
axis
=-
1
))
text_lens
.
append
(
len
(
text
))
pos_texts
.
append
(
np
.
arange
(
1
,
len
(
text
)
+
1
))
pos_mels
.
append
(
np
.
arange
(
1
,
mel
.
shape
[
1
]
+
1
))
mels
.
append
(
mel
)
texts
.
append
(
text
)
# Sort by text_len in descending order
texts
=
[
i
for
i
,
_
in
sorted
(
zip
(
texts
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
mels
=
[
i
for
i
,
_
in
sorted
(
zip
(
mels
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
mel_inputs
=
[
i
for
i
,
_
in
sorted
(
zip
(
mel_inputs
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
pos_texts
=
[
i
for
i
,
_
in
sorted
(
zip
(
pos_texts
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
pos_mels
=
[
i
for
i
,
_
in
sorted
(
zip
(
pos_mels
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)]
text_lens
=
sorted
(
text_lens
,
reverse
=
True
)
# Pad sequence with largest len of the batch
texts
=
TextIDBatcher
(
pad_id
=
0
)(
texts
)
pos_texts
=
TextIDBatcher
(
pad_id
=
0
)(
pos_texts
)
pos_mels
=
TextIDBatcher
(
pad_id
=
0
)(
pos_mels
)
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
mel_inputs
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mel_inputs
),
axes
=
(
0
,
2
,
1
))
return
(
texts
,
mels
,
mel_inputs
,
pos_texts
,
pos_mels
,
np
.
array
(
text_lens
))
def
batch_examples_vocoder
(
batch
):
mels
=
[]
mags
=
[]
for
data
in
batch
:
mag
,
mel
,
_
=
data
mels
.
append
(
mel
)
mags
.
append
(
mag
)
mels
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mels
),
axes
=
(
0
,
2
,
1
))
mags
=
np
.
transpose
(
SpecBatcher
(
pad_value
=
0.
)(
mags
),
axes
=
(
0
,
2
,
1
))
return
(
mels
,
mags
)
parakeet/models/transformerTTS/synthesis.py
浏览文件 @
abc2b537
...
...
@@ -7,15 +7,22 @@ from tqdm import tqdm
from
tensorboardX
import
SummaryWriter
import
paddle.fluid
as
fluid
import
paddle.fluid.dygraph
as
dg
from
preprocess
import
_ljspeech_processor
from
pathlib
import
Path
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
collections
import
OrderedDict
from
parakeet
import
audio
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
return
model_dict
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
new_state_dict
=
OrderedDict
()
for
param
in
model_dict
:
if
param
.
startswith
(
'_layers.'
):
new_state_dict
[
param
[
8
:]]
=
model_dict
[
param
]
else
:
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
def
synthesis
(
text_input
,
cfg
):
place
=
(
fluid
.
CUDAPlace
(
0
)
if
cfg
.
use_gpu
else
fluid
.
CPUPlace
())
...
...
@@ -30,7 +37,7 @@ def synthesis(text_input, cfg):
with
dg
.
guard
(
place
):
with
fluid
.
unique_name
.
guard
():
model
=
TransformerTTS
(
cfg
)
model
.
set_dict
(
load_checkpoint
(
str
(
cfg
.
transformer_step
),
os
.
path
.
join
(
cfg
.
checkpoint_path
,
"transformer"
)))
model
.
set_dict
(
load_checkpoint
(
str
(
cfg
.
transformer_step
),
os
.
path
.
join
(
cfg
.
checkpoint_path
,
"
nostop_token/
transformer"
)))
model
.
eval
()
with
fluid
.
unique_name
.
guard
():
...
...
@@ -54,14 +61,35 @@ def synthesis(text_input, cfg):
mel_input
=
fluid
.
layers
.
concat
([
mel_input
,
postnet_pred
[:,
-
1
:,:]],
axis
=
1
)
mag_pred
=
model_postnet
(
postnet_pred
)
_ljspeech_processor
=
audio
.
AudioProcessor
(
sample_rate
=
cfg
.
audio
.
sr
,
num_mels
=
cfg
.
audio
.
num_mels
,
min_level_db
=
cfg
.
audio
.
min_level_db
,
ref_level_db
=
cfg
.
audio
.
ref_level_db
,
n_fft
=
cfg
.
audio
.
n_fft
,
win_length
=
cfg
.
audio
.
win_length
,
hop_length
=
cfg
.
audio
.
hop_length
,
power
=
cfg
.
audio
.
power
,
preemphasis
=
cfg
.
audio
.
preemphasis
,
signal_norm
=
True
,
symmetric_norm
=
False
,
max_norm
=
1.
,
mel_fmin
=
0
,
mel_fmax
=
None
,
clip_norm
=
True
,
griffin_lim_iters
=
60
,
do_trim_silence
=
False
,
sound_norm
=
False
)
wav
=
_ljspeech_processor
.
inv_spectrogram
(
fluid
.
layers
.
transpose
(
fluid
.
layers
.
squeeze
(
mag_pred
,[
0
]),
[
1
,
0
]).
numpy
())
writer
.
add_audio
(
text_input
,
wav
,
0
,
cfg
.
audio
.
sr
)
if
not
os
.
path
.
exists
(
cfg
.
sample_path
):
os
.
mkdir
(
cfg
.
sample_path
)
write
(
os
.
path
.
join
(
cfg
.
sample_path
,
'test.wav'
),
cfg
.
audio
.
sr
,
wav
)
writer
.
close
()
if
__name__
==
'__main__'
:
parser
=
jsonargparse
.
ArgumentParser
(
description
=
"Synthesis model"
,
formatter_class
=
'default_argparse'
)
add_config_options_to_parser
(
parser
)
cfg
=
parser
.
parse_args
(
'-c ./config/synthesis.yaml'
.
split
())
synthesis
(
"Transformer model is so fast!"
,
cfg
)
\ No newline at end of file
synthesis
(
"Transformer model is so fast!"
,
cfg
)
parakeet/models/transformerTTS/train_postnet.py
浏览文件 @
abc2b537
from
network
import
*
from
tensorboardX
import
SummaryWriter
import
os
from
tqdm
import
tqdm
from
pathlib
import
Path
from
collections
import
OrderedDict
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
parakeet.models.dataloader.jlspeech
import
LJSpeechLoader
class
MyDataParallel
(
dg
.
parallel
.
DataParallel
):
"""
A data parallel proxy for model.
"""
def
__init__
(
self
,
layers
,
strategy
):
super
(
MyDataParallel
,
self
).
__init__
(
layers
,
strategy
)
def
__getattr__
(
self
,
key
):
if
key
in
self
.
__dict__
:
return
object
.
__getattribute__
(
self
,
key
)
elif
key
is
"_layers"
:
return
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
]
else
:
return
getattr
(
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
],
key
)
from
parakeet.models.dataloader.ljspeech
import
LJSpeechLoader
from
network
import
*
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
return
model_dict
,
opti_dict
new_state_dict
=
OrderedDict
()
for
param
in
model_dict
:
if
param
.
startswith
(
'_layers.'
):
new_state_dict
[
param
[
8
:]]
=
model_dict
[
param
]
else
:
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
,
opti_dict
def
main
(
cfg
):
...
...
@@ -66,7 +56,7 @@ def main(cfg):
if
cfg
.
use_data_parallel
:
strategy
=
dg
.
parallel
.
prepare_context
()
model
=
My
DataParallel
(
model
,
strategy
)
model
=
fluid
.
dygraph
.
parallel
.
DataParallel
(
model
,
strategy
)
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
,
is_vocoder
=
True
).
reader
()
...
...
parakeet/models/transformerTTS/train_transformer.py
浏览文件 @
abc2b537
import
os
from
tqdm
import
tqdm
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
from
network
import
*
from
tensorboardX
import
SummaryWriter
from
pathlib
import
Path
from
collections
import
OrderedDict
import
jsonargparse
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
matplotlib
import
cm
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
from
parakeet.modules.utils
import
cross_entropy
from
parakeet.models.dataloader.jlspeech
import
LJSpeechLoader
class
MyDataParallel
(
dg
.
parallel
.
DataParallel
):
"""
A data parallel proxy for model.
"""
def
__init__
(
self
,
layers
,
strategy
):
super
(
MyDataParallel
,
self
).
__init__
(
layers
,
strategy
)
def
__getattr__
(
self
,
key
):
if
key
in
self
.
__dict__
:
return
object
.
__getattribute__
(
self
,
key
)
elif
key
is
"_layers"
:
return
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
]
else
:
return
getattr
(
object
.
__getattribute__
(
self
,
"_sub_layers"
)[
"_layers"
],
key
)
from
parakeet.models.dataloader.ljspeech
import
LJSpeechLoader
from
network
import
*
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
opti_dict
=
fluid
.
dygraph
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
return
model_dict
,
opti_dict
new_state_dict
=
OrderedDict
()
for
param
in
model_dict
:
if
param
.
startswith
(
'_layers.'
):
new_state_dict
[
param
[
8
:]]
=
model_dict
[
param
]
else
:
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
,
opti_dict
def
main
(
cfg
):
local_rank
=
dg
.
parallel
.
Env
().
local_rank
if
cfg
.
use_data_parallel
else
0
nranks
=
dg
.
parallel
.
Env
().
nranks
if
cfg
.
use_data_parallel
else
1
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
if
local_rank
==
0
:
# Print the whole config setting.
pprint
(
jsonargparse
.
namespace_to_dict
(
cfg
))
...
...
@@ -74,28 +61,27 @@ def main(cfg):
if
cfg
.
use_data_parallel
:
strategy
=
dg
.
parallel
.
prepare_context
()
model
=
My
DataParallel
(
model
,
strategy
)
model
=
fluid
.
dygraph
.
parallel
.
DataParallel
(
model
,
strategy
)
for
epoch
in
range
(
cfg
.
epochs
):
pbar
=
tqdm
(
reader
)
for
i
,
data
in
enumerate
(
pbar
):
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
=
data
character
,
mel
,
mel_input
,
pos_text
,
pos_mel
,
text_length
,
_
=
data
global_step
+=
1
mel_pred
,
postnet_pred
,
attn_probs
,
stop_preds
,
attn_enc
,
attn_dec
=
model
(
character
,
mel_input
,
pos_text
,
pos_mel
)
label
=
(
pos_mel
==
0
).
astype
(
np
.
float32
)
#label = np.zeros(stop_preds.shape).astype(np.float32)
#text_length = text_length.numpy()
#for i in range(label.shape[0]):
# label[i][text_length[i] - 1] = 1
mel_loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
mel_pred
,
mel
)))
post_mel_loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
postnet_pred
,
mel
)))
stop_loss
=
cross_entropy
(
stop_preds
,
label
)
loss
=
mel_loss
+
post_mel_loss
+
stop_loss
loss
=
mel_loss
+
post_mel_loss
# Note: When used stop token loss the learning did not work.
if
cfg
.
stop_token
:
stop_loss
=
cross_entropy
(
stop_preds
,
label
)
loss
=
loss
+
stop_loss
if
local_rank
==
0
:
writer
.
add_scalars
(
'training_loss'
,
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录