Commit 04d7f8b5 (PaddlePaddle / Parakeet)
Authored Feb 13, 2020 by lifuchen; committed by chenfeiyu, Feb 13, 2020

    transform parse to argparse

Parent: f5ac04b1

Showing 27 changed files with 365 additions and 356 deletions (+365, -356)
Changed files (27):

  examples/FastSpeech/parse.py                              +0    -97
  examples/TransformerTTS/config/train_transformer.yaml     +0    -35
  examples/fastspeech/README.md                             +0    -0
  examples/fastspeech/config/fastspeech.yaml                +32   -0
  examples/fastspeech/config/synthesis.yaml                 +2    -9
  examples/fastspeech/parse.py                              +36   -0
  examples/fastspeech/synthesis.py                          +26   -22
  examples/fastspeech/train.py                              +32   -31
  examples/transformer_tts/README.md                        +0    -0
  examples/transformer_tts/config/synthesis.yaml            +11   -0
  examples/transformer_tts/config/train_transformer.yaml    +7    -7
  examples/transformer_tts/config/train_vocoder.yaml        +16   -0
  examples/transformer_tts/parse.py                         +38   -0
  examples/transformer_tts/synthesis.py                     +32   -27
  examples/transformer_tts/train_transformer.py             +36   -33
  examples/transformer_tts/train_vocoder.py                 +33   -31
  parakeet/models/dataloader/ljspeech.py                    +14   -14
  parakeet/models/fastspeech/fastspeech.py                  +28   -28
  parakeet/models/transformer_tts/CBHG.py                   +0    -0
  parakeet/models/transformer_tts/__init__.py               +0    -0
  parakeet/models/transformer_tts/decoder.py                +7    -7
  parakeet/models/transformer_tts/encoder.py                +4    -4
  parakeet/models/transformer_tts/encoderprenet.py          +0    -0
  parakeet/models/transformer_tts/post_convnet.py           +0    -0
  parakeet/models/transformer_tts/prenet.py                 +0    -0
  parakeet/models/transformer_tts/transformerTTS.py         +4    -4
  parakeet/models/transformer_tts/vocoder.py                +7    -7
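The change is mechanical but repo-wide: every entry point drops jsonargparse (which merged a YAML config file into the CLI namespace via `ActionConfigFile`) in favor of plain `argparse` for run-time options plus an explicit `ruamel.yaml` load for model hyperparameters. A minimal sketch of the two patterns, assuming only that `ruamel.yaml` is installed (both appear verbatim in the diffs below):

```python
import argparse
from ruamel import yaml

# Before: jsonargparse merged the YAML file straight into the parsed namespace.
#   import jsonargparse
#   parser = jsonargparse.ArgumentParser(formatter_class='default_argparse')
#   parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
#   cfg = parser.parse_args('-c config/fastspeech.yaml'.split())

# After: argparse handles run-time flags; the model config is loaded explicitly.
parser = argparse.ArgumentParser(description="Train Fastspeech model")
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
                    help="the yaml config file path.")
args = parser.parse_args()
with open(args.config_path) as f:
    cfg = yaml.load(f, Loader=yaml.Loader)  # plain nested dicts
```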
examples/FastSpeech/parse.py (deleted, mode 100644 -> 0; view file @ f5ac04b1)

```python
import jsonargparse

def add_config_options_to_parser(parser):
    parser.add_argument('--audio.num_mels', type=int, default=80,
        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
        help="the preemphasis coefficient.")
    parser.add_argument('--audio.hop_length', type=int, default=128,
        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
        help="the outputs per step.")

    parser.add_argument('--encoder_n_layer', type=int, default=6,
        help="the number of FFT Block in encoder.")
    parser.add_argument('--encoder_head', type=int, default=2,
        help="the attention head number in encoder.")
    parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
        help="the filter size of conv1d in encoder.")
    parser.add_argument('--max_sep_len', type=int, default=2048,
        help="the max length of sequence.")
    parser.add_argument('--decoder_n_layer', type=int, default=6,
        help="the number of FFT Block in decoder.")
    parser.add_argument('--decoder_head', type=int, default=2,
        help="the attention head number in decoder.")
    parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
        help="the filter size of conv1d in decoder.")
    parser.add_argument('--fs_hidden_size', type=int, default=256,
        help="the hidden size in model of fastspeech.")
    parser.add_argument('--duration_predictor_output_size', type=int, default=256,
        help="the output size of duration predictior.")
    parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
        help="the filter size of conv1d in duration prediction.")
    parser.add_argument('--fft_conv1d_filter', type=int, default=3,
        help="the filter size of conv1d in fft.")
    parser.add_argument('--fft_conv1d_padding', type=int, default=1,
        help="the padding size of conv1d in fft.")
    parser.add_argument('--dropout', type=float, default=0.1,
        help="the dropout in network.")
    parser.add_argument('--transformer_head', type=int, default=4,
        help="the attention head num of transformerTTS.")
    parser.add_argument('--alpha', type=float, default=1.0,
        help="the hyperparameter to determine the length of the expanded sequence \
            mel, thereby controlling the voice speed.")
    parser.add_argument('--hidden_size', type=int, default=256,
        help="the hidden size in model of transformerTTS.")
    parser.add_argument('--embedding_size', type=int, default=256,
        help="the dim size of embedding of transformerTTS.")

    parser.add_argument('--warm_up_step', type=int, default=4000,
        help="the warm up step of learning rate.")
    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
        help="the threshold of grad clip.")
    parser.add_argument('--batch_size', type=int, default=32,
        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
        help="checkpointing interval during training.")
    parser.add_argument('--fastspeech_step', type=int, default=160000,
        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument('--use_gpu', type=bool, default=True,
        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
        help="use data parallel or not during training.")

    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
        help="the directory to save audio sample in synthesis.")
    parser.add_argument('--transtts_path', type=str, default='./log',
        help="the directory to load pretrain transformerTTS model.")
    parser.add_argument('--transformer_step', type=int, default=70000,
        help="the step to load transformerTTS model.")

    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
```
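The dotted flag names above (`--audio.num_mels`, `--audio.sr`, ...) are the jsonargparse idiom this commit retires: they parse into a nested namespace, which is why the old scripts could read `cfg.audio.sr`. A sketch of that behavior, assuming jsonargparse's nested-namespace handling (this snippet is an illustration, not repository code):

```python
import jsonargparse

parser = jsonargparse.ArgumentParser()
parser.add_argument('--audio.sr', type=int, default=22050)
cfg = parser.parse_args([])
# Dotted argument names become nested namespaces rather than flat attributes,
# so the old scripts accessed cfg.audio.sr instead of cfg['audio']['sr'].
print(cfg.audio.sr)  # 22050
```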
examples/TransformerTTS/config/train_transformer.yaml (deleted, mode 100644 -> 0; view file @ f5ac04b1)

```yaml
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275
  win_length: 1102
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1

hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
use_data_parallel: False
stop_token: False

data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#ransformer_step: 97000
```
(no newline at end of file)
examples/FastSpeech/README.md → examples/fastspeech/README.md (view file @ 04d7f8b5): file moved, no content changes.
examples/FastSpeech/config/fastspeech.yaml → examples/fastspeech/config/fastspeech.yaml (view file @ 04d7f8b5)

```diff
@@ -10,37 +10,23 @@ audio:
   ref_level_db: 20 #the reference level db.
   outputs_per_step: 1 #the outputs per step.
 
-encoder_n_layer: 6
-encoder_head: 2
-encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
-decoder_n_layer: 6
-decoder_head: 2
-decoder_conv1d_filter_size: 1536
-fs_hidden_size: 384
-duration_predictor_output_size: 256
-duration_predictor_filter_size: 3
-fft_conv1d_filter: 3
-fft_conv1d_padding: 1
-dropout: 0.1
-transformer_head: 4
-embedding_size: 512
-hidden_size: 256
-warm_up_step: 4000
-grad_clip_thresh: 0.1
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 500
-use_gpu: True
-use_data_parallel: True
-data_path: ../../dataset/LJSpeech-1.1
-transtts_path: ../TransformerTTS/checkpoint/
-transformer_step: 160000
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 97000
+encoder_n_layer: 6 #the number of FFT Block in encoder.
+encoder_head: 2 #the attention head number in encoder.
+encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
+max_seq_len: 2048 #the max length of sequence.
+decoder_n_layer: 6 #the number of FFT Block in decoder.
+decoder_head: 2 #the attention head number in decoder.
+decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
+fs_hidden_size: 384 #the hidden size in model of fastspeech.
+duration_predictor_output_size: 256 #the output size of duration predictior.
+duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
+fft_conv1d_filter: 3 #the filter size of conv1d in fft.
+fft_conv1d_padding: 1 #the padding size of conv1d in fft.
+dropout: 0.1 #the dropout in network.
+transformer_head: 4 #the attention head num of transformerTTS.
+embedding_size: 512 #the dim size of embedding of transformerTTS.
+hidden_size: 256 #the hidden size in model of transformerTTS.
+warm_up_step: 4000 #the warm up step of learning rate.
+grad_clip_thresh: 0.1 #the threshold of grad clip.
```

(Note the `max_sep_len` to `max_seq_len` rename: the old key looks like a typo that this commit also fixes.)
examples/FastSpeech/config/synthesis.yaml → examples/fastspeech/config/synthesis.yaml (view file @ 04d7f8b5)

```diff
@@ -13,7 +13,7 @@ audio:
 encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
+max_seq_len: 2048
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
@@ -23,11 +23,4 @@ duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
 fft_conv1d_padding: 1
 dropout: 0.1
-transformer_head: 4
-use_gpu: True
-alpha: 1.0
-checkpoint_path: checkpoint/
-fastspeech_step: 71000
-log_dir: ./log
\ No newline at end of file
+transformer_head: 4
\ No newline at end of file
```
examples/fastspeech/parse.py (new file, mode 0 -> 100644; view file @ 04d7f8b5)

```python
import argparse

def add_config_options_to_parser(parser):
    parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
        help="the yaml config file path.")
    parser.add_argument('--batch_size', type=int, default=32,
        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
        help="checkpointing interval during training.")
    parser.add_argument('--fastspeech_step', type=int, default=70000,
        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument('--use_gpu', type=int, default=1,
        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=int, default=0,
        help="use data parallel or not during training.")

    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
        help="the directory to save audio sample in synthesis.")
    parser.add_argument('--transtts_path', type=str, default='./log',
        help="the directory to load pretrain transformerTTS model.")
    parser.add_argument('--transformer_step', type=int, default=160000,
        help="the step to load transformerTTS model.")
```
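A minimal sketch of how this module is driven after the change; the flag names come from the file above, and the pairing of `args` with a YAML `cfg` mirrors the updated train.py and synthesis.py later in this commit:

```python
import argparse
from ruamel import yaml
from parse import add_config_options_to_parser  # the new examples/fastspeech/parse.py

parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
args = parser.parse_args(['--use_gpu', '1', '--batch_size', '16'])

# Run-time options live on args; model hyperparameters live in the YAML file.
with open(args.config_path) as f:
    cfg = yaml.load(f, Loader=yaml.Loader)
print(args.batch_size, cfg['fs_hidden_size'])
```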
examples/FastSpeech/synthesis.py → examples/fastspeech/synthesis.py (view file @ 04d7f8b5)

```diff
 import os
 from tensorboardX import SummaryWriter
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from parakeet.g2p.en import text_to_sequence
 from parakeet import audio
-from network import FastSpeech
+from parakeet.models.fastspeech.fastspeech import FastSpeech
 
 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -21,19 +22,22 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict
 
-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
 
     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir, 'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir, 'synthesis')
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     writer = SummaryWriter(path)
 
     with dg.guard(place):
         model = FastSpeech(cfg)
-        model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")))
+        model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
         model.eval()
 
         text = np.asarray(text_to_sequence(text_input))
@@ -41,18 +45,18 @@ def synthesis(text_input, cfg):
         pos_text = np.arange(1, text.shape[1] + 1)
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
 
-        mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha)
+        mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
 
         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length=cfg.audio.win_length,
-            hop_length=cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length=cfg['audio']['win_length'],
+            hop_length=cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -65,12 +69,12 @@ def synthesis(text_input, cfg):
         mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1, 0])
         wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
         print("Synthesis completed !!!")
         writer.close()
 
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
\ No newline at end of file
```
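One detail explains the bulk of the churn in this file: jsonargparse's `ActionConfigFile` produced a namespace with nested attribute access (`cfg.audio.sr`), while `ruamel.yaml` returns plain nested dicts, hence every audio parameter now reads `cfg['audio']['sr']`. A standalone illustration (not repository code):

```python
from ruamel import yaml

snippet = """
audio:
  sr: 22050
  num_mels: 80
"""
cfg = yaml.load(snippet, Loader=yaml.Loader)
print(cfg['audio']['sr'])   # 22050: plain nested dicts after the migration
# cfg.audio.sr              # only worked on jsonargparse's nested namespace
```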
examples/FastSpeech/train.py → examples/fastspeech/train.py (view file @ 04d7f8b5)

```diff
@@ -3,10 +3,10 @@ import argparse
 import os
 import time
 import math
-import jsonargparse
 from pathlib import Path
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from tqdm import tqdm
 from collections import OrderedDict
 from tensorboardX import SummaryWriter
@@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 from parakeet.models.fastspeech.fastspeech import FastSpeech
 from parakeet.models.fastspeech.utils import get_alignment
@@ -28,50 +28,49 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict, opti_dict
 
-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
 
-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())
 
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir, 'fastspeech')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir, 'fastspeech')
 
     writer = SummaryWriter(path) if local_rank == 0 else None
 
     with dg.guard(place):
         with fluid.unique_name.guard():
             transformerTTS = TransformerTTS(cfg)
-            model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
+            model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
             transformerTTS.set_dict(model_dict)
             transformerTTS.eval()
 
         model = FastSpeech(cfg)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
 
-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.fastspeech_step
+            global_step = args.fastspeech_step
             print("load checkpoint!!!")
 
-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
             pbar = tqdm(reader)
 
             for i, data in enumerate(pbar):
@@ -79,7 +78,7 @@ def main(cfg):
                 character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
 
                 _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
-                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32)
+                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
 
                 global_step += 1
@@ -101,20 +100,20 @@ def main(cfg):
                     writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
 
-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                     total_loss = model.scale_loss(total_loss)
                     total_loss.backward()
                     model.apply_collective_grads()
                 else:
                     total_loss.backward()
-                optimizer.minimize(total_loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(total_loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                 model.clear_gradients()
 
                 # save checkpoint
-                if local_rank == 0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path, 'fastspeech/%d' % global_step)
+                if local_rank == 0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path, 'fastspeech/%d' % global_step)
                     dg.save_dygraph(model.state_dict(), save_path)
                     dg.save_dygraph(optimizer.state_dict(), save_path)
         if local_rank == 0:
@@ -122,7 +121,9 @@ def main(cfg):
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
-    main(cfg)
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
```
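A note on the optimizer line that recurs in these training scripts: `dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr ** 2)), cfg['warm_up_step'])`. Assuming Paddle's NoamDecay implements the standard Transformer schedule, lr(step) = d_model^(-1/2) * min(step^(-1/2), step * warmup^(-3/2)), this choice of the first argument makes the schedule peak at exactly `args.lr` when step = warmup. That reading is an inference from the formula, not something stated in the commit; a quick check:

```python
import math

def noam_lr(step, warmup=4000, lr=0.001):
    # Assumes the standard Noam schedule, with d_model chosen as 1/(warmup*lr**2)
    # to mirror the AdamOptimizer construction in train.py above.
    d_model = 1.0 / (warmup * lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

assert abs(noam_lr(4000) - 0.001) < 1e-12  # peak learning rate equals args.lr
```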
examples/TransformerTTS/README.md → examples/transformer_tts/README.md (view file @ 04d7f8b5): file moved, no content changes.
examples/transformer_tts/config/synthesis.yaml (new file, mode 0 -> 100644; view file @ 04d7f8b5)

```yaml
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275
  win_length: 1102
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1
```
(no newline at end of file)
examples/TransformerTTS/config/synthesis.yaml → examples/transformer_tts/config/train_transformer.yaml (view file @ 04d7f8b5)

```diff
@@ -10,11 +10,11 @@ audio:
   ref_level_db: 20
   outputs_per_step: 1
 
-max_len: 50
-transformer_step: 10
-postnet_step: 10
-use_gpu: True
-checkpoint_path: ./checkpoint
-log_dir: ./log
-sample_path: ./sample
\ No newline at end of file
+hidden_size: 256
+embedding_size: 512
+warm_up_step: 4000
+grad_clip_thresh: 1.0
\ No newline at end of file
```
examples/TransformerTTS/config/train_vocoder.yaml → examples/transformer_tts/config/train_vocoder.yaml (view file @ 04d7f8b5)

```diff
@@ -12,5 +12,18 @@ audio:
 hidden_size: 256
 embedding_size: 512
 warm_up_step: 4000
 grad_clip_thresh: 1.0
-batch_size: 32
\ No newline at end of file
+batch_size: 32
+epochs: 10000
+lr: 0.001
+save_step: 10
+use_gpu: True
+use_data_parallel: True
+
+data_path: ../../dataset/LJSpeech-1.1
+save_path: ./checkpoint
+log_dir: ./log
+#checkpoint_path: ./checkpoint
+#transformer_step: 27000
\ No newline at end of file
```
examples/TransformerTTS/parse.py → examples/transformer_tts/parse.py (view file @ 04d7f8b5)

```diff
-import jsonargparse
+import argparse
 
 def add_config_options_to_parser(parser):
-    parser.add_argument('--audio.num_mels', type=int, default=80,
-        help="the number of mel bands when calculating mel spectrograms.")
-    parser.add_argument('--audio.n_fft', type=int, default=2048,
-        help="the number of fft components.")
-    parser.add_argument('--audio.sr', type=int, default=22050,
-        help="the sampling rate of audio data file.")
-    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
-        help="the preemphasis coefficient.")
-    parser.add_argument('--audio.hop_length', type=int, default=128,
-        help="the number of samples to advance between frames.")
-    parser.add_argument('--audio.win_length', type=int, default=1024,
-        help="the length (width) of the window function.")
-    parser.add_argument('--audio.power', type=float, default=1.4,
-        help="the power to raise before griffin-lim.")
-    parser.add_argument('--audio.min_level_db', type=int, default=-100,
-        help="the minimum level db.")
-    parser.add_argument('--audio.ref_level_db', type=int, default=20,
-        help="the reference level db.")
-    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
-        help="the outputs per step.")
-    parser.add_argument('--hidden_size', type=int, default=256,
-        help="the hidden size in network.")
-    parser.add_argument('--embedding_size', type=int, default=512,
-        help="the embedding vector size.")
-    parser.add_argument('--warm_up_step', type=int, default=4000,
-        help="the warm up step of learning rate.")
-    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
-        help="the threshold of grad clip.")
+    parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
+        help="the yaml config file path.")
     parser.add_argument('--batch_size', type=int, default=32,
         help="batch size for training.")
     parser.add_argument('--epochs', type=int, default=10000,
@@ -45,13 +17,13 @@ def add_config_options_to_parser(parser):
         help="The max length of audio when synthsis.")
     parser.add_argument('--transformer_step', type=int, default=160000,
         help="Global step to restore checkpoint of transformer.")
-    parser.add_argument('--postnet_step', type=int, default=90000,
+    parser.add_argument('--vocoder_step', type=int, default=90000,
         help="Global step to restore checkpoint of postnet.")
-    parser.add_argument('--use_gpu', type=bool, default=True,
+    parser.add_argument('--use_gpu', type=int, default=1,
         help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=bool, default=False,
+    parser.add_argument('--use_data_parallel', type=int, default=0,
         help="use data parallel or not during training.")
-    parser.add_argument('--stop_token', type=bool, default=False,
+    parser.add_argument('--stop_token', type=int, default=0,
         help="use stop token loss in network or not.")
     parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
@@ -62,8 +34,5 @@ def add_config_options_to_parser(parser):
         help="the path to save checkpoint.")
     parser.add_argument('--log_dir', type=str, default='./log',
         help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./log',
+    parser.add_argument('--sample_path', type=str, default='./sample',
         help="the directory to save audio sample in synthesis.")
-    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
```
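The `type=bool` to `type=int` change for `--use_gpu`, `--use_data_parallel`, and `--stop_token` is likely more than cosmetic: argparse applies `type` to the raw string, and `bool('False')` is `True` because any non-empty string is truthy, so the old flags could not actually be switched off from the command line. A standalone demonstration:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument('--flag_bool', type=bool, default=True)  # the old pattern
p.add_argument('--flag_int', type=int, default=1)       # the new pattern

print(p.parse_args(['--flag_bool', 'False']).flag_bool)  # True: bool('False') is truthy!
print(p.parse_args(['--flag_int', '0']).flag_int)        # 0: falsy, as intended
```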
examples/TransformerTTS/synthesis.py → examples/transformer_tts/synthesis.py (view file @ 04d7f8b5)

```diff
@@ -2,17 +2,19 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
+from ruamel import yaml
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from pathlib import Path
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
 from collections import OrderedDict
 from parakeet import audio
+from parakeet.models.transformer_tts.vocoder import Vocoder
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 
 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -24,25 +26,28 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict
 
-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir, 'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir, 'synthesis')
 
     writer = SummaryWriter(path)
 
     with dg.guard(place):
         with fluid.unique_name.guard():
             model = TransformerTTS(cfg)
-            model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer")))
+            model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer")))
             model.eval()
 
         with fluid.unique_name.guard():
-            model_postnet = ModelPostNet(cfg)
-            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+            model_postnet = Vocoder(cfg, args.batch_size)
+            model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet")))
             model_postnet.eval()
 
         # init input
         text = np.asarray(text_to_sequence(text_input))
@@ -52,7 +57,7 @@ def synthesis(text_input, cfg):
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
 
-        pbar = tqdm(range(cfg.max_len))
+        pbar = tqdm(range(args.max_len))
         for i in pbar:
             pos_mel = np.arange(1, mel_input.shape[1] + 1)
@@ -62,15 +67,15 @@ def synthesis(text_input, cfg):
         mag_pred = model_postnet(postnet_pred)
 
         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length=cfg.audio.win_length,
-            hop_length=cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length=cfg['audio']['win_length'],
+            hop_length=cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -82,14 +87,14 @@ def synthesis(text_input, cfg):
             sound_norm=False)
         wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1, 0]).numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
-        if not os.path.exists(cfg.sample_path):
-            os.mkdir(cfg.sample_path)
-        write(os.path.join(cfg.sample_path, 'test.wav'), cfg.audio.sr, wav)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
+        if not os.path.exists(args.sample_path):
+            os.mkdir(args.sample_path)
+        write(os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'], wav)
         writer.close()
 
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Synthesis model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
```
examples/TransformerTTS/train_transformer.py → examples/transformer_tts/train_transformer.py (view file @ 04d7f8b5)

```diff
@@ -3,9 +3,10 @@ from tqdm import tqdm
 from tensorboardX import SummaryWriter
 from pathlib import Path
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from matplotlib import cm
 import numpy as np
 import paddle.fluid as fluid
@@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 from parakeet.modules.utils import cross_entropy
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 
 def load_checkpoint(step, model_path):
     model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -26,22 +27,21 @@ def load_checkpoint(step, model_path):
     return new_state_dict, opti_dict
 
-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
 
-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)
 
     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())
 
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir, 'transformer')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir, 'transformer')
 
     writer = SummaryWriter(path) if local_rank == 0 else None
@@ -49,23 +49,23 @@ def main(cfg):
         model = TransformerTTS(cfg)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg.warm_up_step * (cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
 
-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
 
-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.transformer_step
+            global_step = args.transformer_step
             print("load checkpoint!!!")
 
-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
             strategy = dg.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d' % epoch)
@@ -81,7 +81,7 @@ def main(cfg):
                 post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                 loss = mel_loss + post_mel_loss
                 # Note: When used stop token loss the learning did not work.
-                if cfg.stop_token:
+                if args.stop_token:
                     stop_loss = cross_entropy(stop_preds, label)
                     loss = loss + stop_loss
@@ -91,7 +91,7 @@ def main(cfg):
                         'post_mel_loss': post_mel_loss.numpy()
                     }, global_step)
-                    if cfg.stop_token:
+                    if args.stop_token:
                         writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
                     writer.add_scalars('alphas', {
@@ -101,7 +101,7 @@ def main(cfg):
                     writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
 
-                    if global_step % cfg.image_step == 1:
+                    if global_step % args.image_step == 1:
                         for i, prob in enumerate(attn_probs):
                             for j in range(4):
                                 x = np.uint8(cm.viridis(prob.numpy()[j * 16]) * 255)
@@ -117,20 +117,20 @@ def main(cfg):
                                 x = np.uint8(cm.viridis(prob.numpy()[j * 16]) * 255)
                                 writer.add_image('Attention_dec_%d_0' % global_step, x, i * 4 + j, dataformats="HWC")
 
-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                     loss = model.scale_loss(loss)
                     loss.backward()
                     model.apply_collective_grads()
                 else:
                     loss.backward()
-                optimizer.minimize(loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(loss, grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                 model.clear_gradients()
 
                 # save checkpoint
-                if local_rank == 0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path, 'transformer/%d' % global_step)
+                if local_rank == 0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path, 'transformer/%d' % global_step)
                     dg.save_dygraph(model.state_dict(), save_path)
                     dg.save_dygraph(optimizer.state_dict(), save_path)
         if local_rank == 0:
@@ -138,7 +138,10 @@ def main(cfg):
 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
-    main(cfg)
\ No newline at end of file
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
\ No newline at end of file
```
examples/
TransformerTTS
/train_vocoder.py
→
examples/
transformer_tts
/train_vocoder.py
浏览文件 @
04d7f8b5
...
@@ -3,14 +3,15 @@ import os
...
@@ -3,14 +3,15 @@ import os
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
pathlib
import
Path
from
pathlib
import
Path
from
collections
import
OrderedDict
from
collections
import
OrderedDict
import
jsonargparse
import
argparse
from
ruamel
import
yaml
from
parse
import
add_config_options_to_parser
from
parse
import
add_config_options_to_parser
from
pprint
import
pprint
from
pprint
import
pprint
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
from
parakeet.models.dataloader.ljspeech
import
LJSpeechLoader
from
parakeet.models.dataloader.ljspeech
import
LJSpeechLoader
from
parakeet.models.transformer
TTS
.vocoder
import
Vocoder
from
parakeet.models.transformer
_tts
.vocoder
import
Vocoder
def
load_checkpoint
(
step
,
model_path
):
def
load_checkpoint
(
step
,
model_path
):
model_dict
,
opti_dict
=
dg
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
model_dict
,
opti_dict
=
dg
.
load_dygraph
(
os
.
path
.
join
(
model_path
,
step
))
...
@@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
...
@@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
new_state_dict
[
param
]
=
model_dict
[
param
]
new_state_dict
[
param
]
=
model_dict
[
param
]
return
new_state_dict
,
opti_dict
return
new_state_dict
,
opti_dict
def
main
(
cfg
):
def
main
(
args
):
local_rank
=
dg
.
parallel
.
Env
().
local_rank
if
cfg
.
use_data_parallel
else
0
local_rank
=
dg
.
parallel
.
Env
().
local_rank
if
args
.
use_data_parallel
else
0
nranks
=
dg
.
parallel
.
Env
().
nranks
if
cfg
.
use_data_parallel
else
1
nranks
=
dg
.
parallel
.
Env
().
nranks
if
args
.
use_data_parallel
else
1
if
local_rank
==
0
:
with
open
(
args
.
config_path
)
as
f
:
# Print the whole config setting.
cfg
=
yaml
.
load
(
f
,
Loader
=
yaml
.
Loader
)
pprint
(
jsonargparse
.
namespace_to_dict
(
cfg
))
global_step
=
0
global_step
=
0
place
=
(
fluid
.
CUDAPlace
(
dg
.
parallel
.
Env
().
dev_id
)
place
=
(
fluid
.
CUDAPlace
(
dg
.
parallel
.
Env
().
dev_id
)
if
cfg
.
use_data_parallel
else
fluid
.
CUDAPlace
(
0
)
if
args
.
use_data_parallel
else
fluid
.
CUDAPlace
(
0
)
if
cfg
.
use_gpu
else
fluid
.
CPUPlace
())
if
args
.
use_gpu
else
fluid
.
CPUPlace
())
if
not
os
.
path
.
exists
(
cfg
.
log_dir
):
if
not
os
.
path
.
exists
(
args
.
log_dir
):
os
.
mkdir
(
cfg
.
log_dir
)
os
.
mkdir
(
args
.
log_dir
)
path
=
os
.
path
.
join
(
cfg
.
log_dir
,
'postnet'
)
path
=
os
.
path
.
join
(
args
.
log_dir
,
'postnet'
)
writer
=
SummaryWriter
(
path
)
if
local_rank
==
0
else
None
writer
=
SummaryWriter
(
path
)
if
local_rank
==
0
else
None
with
dg
.
guard
(
place
):
with
dg
.
guard
(
place
):
model
=
Vocoder
(
cfg
)
model
=
Vocoder
(
cfg
,
args
.
batch_size
)
model
.
train
()
model
.
train
()
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
.
warm_up_step
*
(
cfg
.
lr
**
2
)),
cfg
.
warm_up_step
),
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
dg
.
NoamDecay
(
1
/
(
cfg
[
'warm_up_step'
]
*
(
args
.
lr
**
2
)),
cfg
[
'warm_up_step'
]
),
parameter_list
=
model
.
parameters
())
parameter_list
=
model
.
parameters
())
if
cfg
.
checkpoint_path
is
not
None
:
if
args
.
checkpoint_path
is
not
None
:
model_dict
,
opti_dict
=
load_checkpoint
(
str
(
cfg
.
postnet_step
),
os
.
path
.
join
(
cfg
.
checkpoint_path
,
"postnet"
))
model_dict
,
opti_dict
=
load_checkpoint
(
str
(
args
.
vocoder_step
),
os
.
path
.
join
(
args
.
checkpoint_path
,
"postnet"
))
model
.
set_dict
(
model_dict
)
model
.
set_dict
(
model_dict
)
optimizer
.
set_dict
(
opti_dict
)
optimizer
.
set_dict
(
opti_dict
)
global_step
=
cfg
.
postnet
_step
global_step
=
args
.
vocoder
_step
print
(
"load checkpoint!!!"
)
print
(
"load checkpoint!!!"
)
if
cfg
.
use_data_parallel
:
if
args
.
use_data_parallel
:
strategy
=
dg
.
parallel
.
prepare_context
()
strategy
=
dg
.
parallel
.
prepare_context
()
model
=
fluid
.
dygraph
.
parallel
.
DataParallel
(
model
,
strategy
)
model
=
fluid
.
dygraph
.
parallel
.
DataParallel
(
model
,
strategy
)
reader
=
LJSpeechLoader
(
cfg
,
nranks
,
local_rank
,
is_vocoder
=
True
).
reader
()
reader
=
LJSpeechLoader
(
cfg
,
args
,
nranks
,
local_rank
,
is_vocoder
=
True
).
reader
()
for
epoch
in
range
(
cfg
.
epochs
):
for
epoch
in
range
(
args
.
epochs
):
pbar
=
tqdm
(
reader
)
pbar
=
tqdm
(
reader
)
for
i
,
data
in
enumerate
(
pbar
):
for
i
,
data
in
enumerate
(
pbar
):
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
pbar
.
set_description
(
'Processing at epoch %d'
%
epoch
)
...
@@ -75,13 +75,13 @@ def main(cfg):
...
@@ -75,13 +75,13 @@ def main(cfg):
mag_pred
=
model
(
mel
)
mag_pred
=
model
(
mel
)
loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
mag_pred
,
mag
)))
loss
=
layers
.
mean
(
layers
.
abs
(
layers
.
elementwise_sub
(
mag_pred
,
mag
)))
if
cfg
.
use_data_parallel
:
if
args
.
use_data_parallel
:
loss
=
model
.
scale_loss
(
loss
)
loss
=
model
.
scale_loss
(
loss
)
loss
.
backward
()
loss
.
backward
()
model
.
apply_collective_grads
()
model
.
apply_collective_grads
()
else
:
else
:
loss
.
backward
()
loss
.
backward
()
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
cfg
.
grad_clip_thresh
))
optimizer
.
minimize
(
loss
,
grad_clip
=
fluid
.
dygraph_grad_clip
.
GradClipByGlobalNorm
(
cfg
[
'grad_clip_thresh'
]
))
model
.
clear_gradients
()
model
.
clear_gradients
()
if
local_rank
==
0
:
if
local_rank
==
0
:
...
@@ -89,10 +89,10 @@ def main(cfg):
...
@@ -89,10 +89,10 @@ def main(cfg):
'loss'
:
loss
.
numpy
(),
'loss'
:
loss
.
numpy
(),
},
global_step
)
},
global_step
)
if
global_step
%
cfg
.
save_step
==
0
:
if
global_step
%
args
.
save_step
==
0
:
if
not
os
.
path
.
exists
(
cfg
.
save_path
):
if
not
os
.
path
.
exists
(
args
.
save_path
):
os
.
mkdir
(
cfg
.
save_path
)
os
.
mkdir
(
args
.
save_path
)
save_path
=
os
.
path
.
join
(
cfg
.
save_path
,
'postnet/%d'
%
global_step
)
save_path
=
os
.
path
.
join
(
args
.
save_path
,
'postnet/%d'
%
global_step
)
dg
.
save_dygraph
(
model
.
state_dict
(),
save_path
)
dg
.
save_dygraph
(
model
.
state_dict
(),
save_path
)
dg
.
save_dygraph
(
optimizer
.
state_dict
(),
save_path
)
dg
.
save_dygraph
(
optimizer
.
state_dict
(),
save_path
)
...
@@ -100,7 +100,9 @@ def main(cfg):
         writer.close()

 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train postnet model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
-    main(cfg)
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
\ No newline at end of file
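
This entry-point change is the core of the commit: jsonargparse, which parsed a YAML file passed via -c, is replaced with the standard-library argparse, with hyper-parameters registered by add_config_options_to_parser from the new examples/*/parse.py. A minimal sketch of the pattern (only two illustrative flags shown; the real parser registers many more):

import argparse
from pprint import pprint

def add_config_options_to_parser(parser):
    # illustrative subset of the flags registered in examples/transformer_tts/parse.py
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--use_gpu', type=int, default=1)

parser = argparse.ArgumentParser(description="Train postnet model")
add_config_options_to_parser(parser)
args = parser.parse_args(['--batch_size', '16'])   # flags now come from the command line
pprint(args)                                       # Namespace(batch_size=16, use_gpu=1)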
parakeet/models/dataloader/ljspeech.py View file @ 04d7f8b5
...
@@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher
 from parakeet.data.dataset import DatasetMixin, TransformDataset

 class LJSpeechLoader:
-    def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
+    def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
-        place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
+        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()

-        LJSPEECH_ROOT = Path(config.data_path)
+        LJSPEECH_ROOT = Path(args.data_path)
         metadata = LJSpeechMetaData(LJSPEECH_ROOT)
         transformer = LJSpeech(config)
         dataset = TransformDataset(metadata, transformer)

         sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)

-        assert config.batch_size % nranks == 0
+        assert args.batch_size % nranks == 0
-        each_bs = config.batch_size // nranks
+        each_bs = args.batch_size // nranks
         if is_vocoder:
             dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
         else:
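
LJSpeechLoader now takes both the YAML config (audio and preprocessing settings) and the argparse namespace (runtime settings such as data_path and batch_size). The assert guards the even split of the global batch across ranks; a sketch:

def per_rank_batch_size(batch_size, nranks):
    # the global batch must divide evenly so every rank sees the same load
    assert batch_size % nranks == 0
    return batch_size // nranks

assert per_rank_batch_size(32, 4) == 8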
...
@@ -63,15 +63,15 @@ class LJSpeech(object):
         super(LJSpeech, self).__init__()
         self.config = config
         self._ljspeech_processor = audio.AudioProcessor(
-            sample_rate=config.audio.sr,
+            sample_rate=config['audio']['sr'],
-            num_mels=config.audio.num_mels,
+            num_mels=config['audio']['num_mels'],
-            min_level_db=config.audio.min_level_db,
+            min_level_db=config['audio']['min_level_db'],
-            ref_level_db=config.audio.ref_level_db,
+            ref_level_db=config['audio']['ref_level_db'],
-            n_fft=config.audio.n_fft,
+            n_fft=config['audio']['n_fft'],
-            win_length=config.audio.win_length,
+            win_length=config['audio']['win_length'],
-            hop_length=config.audio.hop_length,
+            hop_length=config['audio']['hop_length'],
-            power=config.audio.power,
+            power=config['audio']['power'],
-            preemphasis=config.audio.preemphasis,
+            preemphasis=config['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
...
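
This hunk shows the mechanical pattern running through the whole commit: jsonargparse exposed the YAML tree as nested attributes (config.audio.sr), while the new code loads the YAML into a plain nested dict, which only supports subscripting. A stub illustrating the two styles (values are hypothetical):

config = {'audio': {'sr': 22050, 'num_mels': 80, 'n_fft': 2048}}

sr = config['audio']['sr']        # new style: plain-dict subscripting
# sr = config.audio.sr            # old style: raises AttributeError on a plain dict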
parakeet/models/fastspeech/fastspeech.py View file @ 04d7f8b5
...
@@ -2,7 +2,7 @@ import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet
 from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
 from parakeet.models.fastspeech.encoder import Encoder
 from parakeet.models.fastspeech.decoder import Decoder
...
@@ -13,43 +13,43 @@ class FastSpeech(dg.Layer):
         super(FastSpeech, self).__init__()

         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
-                               len_max_seq=cfg.max_sep_len,
+                               len_max_seq=cfg['max_seq_len'],
-                               n_layers=cfg.encoder_n_layer,
+                               n_layers=cfg['encoder_n_layer'],
-                               n_head=cfg.encoder_head,
+                               n_head=cfg['encoder_head'],
-                               d_k=cfg.fs_hidden_size // cfg.encoder_head,
+                               d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
-                               d_v=cfg.fs_hidden_size // cfg.encoder_head,
+                               d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
-                               d_model=cfg.fs_hidden_size,
+                               d_model=cfg['fs_hidden_size'],
-                               d_inner=cfg.encoder_conv1d_filter_size,
+                               d_inner=cfg['encoder_conv1d_filter_size'],
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
-        self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
+        self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
-                                                out_channels=cfg.duration_predictor_output_size,
+                                                out_channels=cfg['duration_predictor_output_size'],
-                                                filter_size=cfg.duration_predictor_filter_size,
+                                                filter_size=cfg['duration_predictor_filter_size'],
-                                                dropout=cfg.dropout)
+                                                dropout=cfg['dropout'])
-        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
+        self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
-                               n_layers=cfg.decoder_n_layer,
+                               n_layers=cfg['decoder_n_layer'],
-                               n_head=cfg.decoder_head,
+                               n_head=cfg['decoder_head'],
-                               d_k=cfg.fs_hidden_size // cfg.decoder_head,
+                               d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
-                               d_v=cfg.fs_hidden_size // cfg.decoder_head,
+                               d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
-                               d_model=cfg.fs_hidden_size,
+                               d_model=cfg['fs_hidden_size'],
-                               d_inner=cfg.decoder_conv1d_filter_size,
+                               d_inner=cfg['decoder_conv1d_filter_size'],
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
         self.weight = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())
-        k = math.sqrt(1 / cfg.fs_hidden_size)
+        k = math.sqrt(1 / cfg['fs_hidden_size'])
         self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))
-        self.mel_linear = dg.Linear(cfg.fs_hidden_size,
+        self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
-                                    cfg.audio.num_mels * cfg.audio.outputs_per_step,
+                                    cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
                                     param_attr=self.weight,
                                     bias_attr=self.bias,)
-        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
+        self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
                                    num_hidden=512,
                                    filter_size=5,
                                    padding=int(5 / 2),
                                    num_conv=5,
-                                   outputs_per_step=cfg.audio.outputs_per_step,
+                                   outputs_per_step=cfg['audio']['outputs_per_step'],
                                    use_cudnn=True,
                                    dropout=0.1,
                                    batchnorm_last=True)
...
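
One detail in the constructor above deserves a note: the per-head key and value sizes are derived so that the heads jointly cover the model width, i.e. d_k = d_v = fs_hidden_size // n_head. A sketch with hypothetical YAML values:

fs_hidden_size, encoder_head = 384, 2         # hypothetical values from fastspeech.yaml
d_k = fs_hidden_size // encoder_head          # per-head key dimension
d_v = fs_hidden_size // encoder_head          # per-head value dimension
assert d_k * encoder_head == fs_hidden_size   # heads tile the hidden size exactly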
parakeet/models/transformerTTS/CBHG.py → parakeet/models/transformer_tts/CBHG.py View file @ 04d7f8b5
File moved
parakeet/models/transformerTTS/__init__.py → parakeet/models/transformer_tts/__init__.py View file @ 04d7f8b5
File moved
parakeet/models/transformerTTS/decoder.py → parakeet/models/transformer_tts/decoder.py View file @ 04d7f8b5
...
@@ -4,8 +4,8 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.prenet import PreNet
+from parakeet.models.transformer_tts.prenet import PreNet
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet

 class Decoder(dg.Layer):
     def __init__(self, num_hidden, config, num_head=4):
...
@@ -20,7 +20,7 @@ class Decoder(dg.Layer):
                                  param_attr=fluid.ParamAttr(
                                      initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                                      trainable=False))
-        self.decoder_prenet = PreNet(input_size=config.audio.num_mels,
+        self.decoder_prenet = PreNet(input_size=config['audio']['num_mels'],
                                      hidden_size=num_hidden * 2,
                                      output_size=num_hidden,
                                      dropout_rate=0.2)
...
@@ -38,17 +38,17 @@ class Decoder(dg.Layer):
         self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step,
+        self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
                                     param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
                                     bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
         self.stop_linear = dg.Linear(num_hidden, 1,
                                      param_attr=fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
                                      bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
-        self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
+        self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
                                        filter_size=5, padding=4, num_conv=5,
-                                       outputs_per_step=config.audio.outputs_per_step,
+                                       outputs_per_step=config['audio']['outputs_per_step'],
-                                       use_cudnn=config.use_gpu)
+                                       use_cudnn=True)

     def forward(self, key, value, query, c_mask, positional):
...
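
Besides the dict-access rewrite, this hunk hard-wires use_cudnn=True for the post-convnet (previously tied to config.use_gpu). The bias bound k used for mel_linear and stop_linear follows the usual fan-in rule, k = sqrt(1 / num_hidden); a sketch:

import math

num_hidden = 256                      # hypothetical decoder width
k = math.sqrt(1 / num_hidden)         # bias drawn from U(-k, k)
assert abs(k - 0.0625) < 1e-12        # sqrt(1/256) = 1/16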
parakeet/models/transformerTTS/encoder.py → parakeet/models/transformer_tts/encoder.py View file @ 04d7f8b5
...
@@ -3,10 +3,10 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet
+from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet

 class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, config, num_head=4):
+    def __init__(self, embedding_size, num_hidden, num_head=4):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
...
@@ -19,11 +19,11 @@ class Encoder(dg.Layer):
                                      trainable=False))
         self.encoder_prenet = EncoderPrenet(embedding_size=embedding_size,
                                             num_hidden=num_hidden,
-                                            use_cudnn=config.use_gpu)
+                                            use_cudnn=True)
         self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn=config.use_gpu) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn=True) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
...
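
The Encoder no longer receives the config object at all; the only thing it used it for was use_cudnn=config.use_gpu, which is now fixed to True. Call sites shrink accordingly; a stub of the before/after construction (not the real dg.Layer):

# before: Encoder(embedding_size, num_hidden, config, num_head=4)
# after:  Encoder(embedding_size, num_hidden, num_head=4)
class Encoder:
    def __init__(self, embedding_size, num_hidden, num_head=4):
        self.use_cudnn = True          # was config.use_gpu before this commit
        self.num_hidden = num_hidden

enc = Encoder(embedding_size=512, num_hidden=256)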
parakeet/models/transformerTTS/encoderprenet.py → parakeet/models/transformer_tts/encoderprenet.py View file @ 04d7f8b5
File moved
parakeet/models/transformerTTS/post_convnet.py → parakeet/models/transformer_tts/post_convnet.py View file @ 04d7f8b5
File moved
parakeet/models/transformerTTS/prenet.py → parakeet/models/transformer_tts/prenet.py View file @ 04d7f8b5
File moved
parakeet/models/transformerTTS/transformerTTS.py → parakeet/models/transformer_tts/transformerTTS.py View file @ 04d7f8b5
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
-from parakeet.models.transformerTTS.encoder import Encoder
+from parakeet.models.transformer_tts.encoder import Encoder
-from parakeet.models.transformerTTS.decoder import Decoder
+from parakeet.models.transformer_tts.decoder import Decoder

 class TransformerTTS(dg.Layer):
     def __init__(self, config):
         super(TransformerTTS, self).__init__()
-        self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
+        self.encoder = Encoder(config['embedding_size'], config['hidden_size'])
-        self.decoder = Decoder(config.hidden_size, config)
+        self.decoder = Decoder(config['hidden_size'], config)
         self.config = config

     def forward(self, characters, mel_input, pos_text, pos_mel):
...
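
TransformerTTS itself is a thin composition of the two modules: the encoder consumes character ids and their positions, and the decoder cross-attends over the encoder output while consuming the (shifted) mel input. A sketch of the data flow using the signatures shown above (the encoder's return values are assumed):

def transformer_tts_forward(encoder, decoder, characters, mel_input, pos_text, pos_mel):
    # the encoder output doubles as key and value memory for cross-attention
    memory, c_mask = encoder(characters, pos_text)      # return values assumed
    return decoder(key=memory, value=memory, query=mel_input,
                   c_mask=c_mask, positional=pos_mel)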
parakeet/models/transformerTTS/vocoder.py → parakeet/models/transformer_tts/vocoder.py View file @ 04d7f8b5
...
@@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.modules.customized import Conv1D
 from parakeet.modules.utils import *
-from parakeet.models.transformerTTS.CBHG import CBHG
+from parakeet.models.transformer_tts.CBHG import CBHG

 class Vocoder(dg.Layer):
     """
     CBHG Network (mel -> linear)
     """
-    def __init__(self, config):
+    def __init__(self, config, batch_size):
         super(Vocoder, self).__init__()
-        self.pre_proj = Conv1D(num_channels=config.audio.num_mels,
+        self.pre_proj = Conv1D(num_channels=config['audio']['num_mels'],
-                               num_filters=config.hidden_size,
+                               num_filters=config['hidden_size'],
                                filter_size=1)
-        self.cbhg = CBHG(config.hidden_size, config.batch_size)
+        self.cbhg = CBHG(config['hidden_size'], batch_size)
-        self.post_proj = Conv1D(num_channels=config.hidden_size,
+        self.post_proj = Conv1D(num_channels=config['hidden_size'],
-                                num_filters=(config.audio.n_fft // 2) + 1,
+                                num_filters=(config['audio']['n_fft'] // 2) + 1,
                                 filter_size=1)

     def forward(self, mel):
...
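
The Vocoder's channel arithmetic: a 1x1 convolution lifts the num_mels mel channels to hidden_size for the CBHG stack, and a second 1x1 convolution projects back to the n_fft // 2 + 1 bins of a one-sided linear spectrogram. A sketch with hypothetical config values:

num_mels, hidden_size, n_fft = 80, 256, 2048   # hypothetical config values
linear_bins = n_fft // 2 + 1                   # one-sided spectrum incl. DC and Nyquist
assert linear_bins == 1025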