Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
25883dcd
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
14
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
25883dcd
编写于
2月 24, 2020
作者:
L
liuyibing01
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'update_waveflow' into 'master'
Update waveflow See merge request !21
上级
43814acb
6ad45772
变更
14
显示空白变更内容
内联
并排
Showing
14 changed file
with
460 addition
and
304 deletion
+460
-304
.pre-commit-config.yaml
.pre-commit-config.yaml
+27
-0
examples/deepvoice3/train.py
examples/deepvoice3/train.py
+74
-74
examples/waveflow/README.md
examples/waveflow/README.md
+5
-5
examples/waveflow/benchmark.py
examples/waveflow/benchmark.py
+27
-15
examples/waveflow/configs/waveflow_ljspeech.yaml
examples/waveflow/configs/waveflow_ljspeech.yaml
+0
-0
examples/waveflow/synthesis.py
examples/waveflow/synthesis.py
+36
-18
examples/waveflow/train.py
examples/waveflow/train.py
+34
-18
examples/waveflow/utils.py
examples/waveflow/utils.py
+77
-36
parakeet/datasets/ljspeech.py
parakeet/datasets/ljspeech.py
+26
-19
parakeet/models/waveflow/__init__.py
parakeet/models/waveflow/__init__.py
+1
-0
parakeet/models/waveflow/data.py
parakeet/models/waveflow/data.py
+21
-21
parakeet/models/waveflow/waveflow.py
parakeet/models/waveflow/waveflow.py
+41
-27
parakeet/models/waveflow/waveflow_modules.py
parakeet/models/waveflow/waveflow_modules.py
+77
-58
parakeet/modules/weight_norm.py
parakeet/modules/weight_norm.py
+14
-13
未找到文件。
.pre-commit-config.yaml
0 → 100644
浏览文件 @
25883dcd
-
repo
:
https://github.com/PaddlePaddle/mirrors-yapf.git
sha
:
0d79c0c469bab64f7229c9aca2b1186ef47f0e37
hooks
:
-
id
:
yapf
files
:
\.py$
-
repo
:
https://github.com/pre-commit/pre-commit-hooks
sha
:
a11d9314b22d8f8c7556443875b731ef05965464
hooks
:
-
id
:
check-merge-conflict
-
id
:
check-symlinks
-
id
:
detect-private-key
files
:
(?!.*paddle)^.*$
-
id
:
end-of-file-fixer
files
:
\.md$
-
id
:
trailing-whitespace
files
:
\.md$
-
repo
:
https://github.com/Lucas-C/pre-commit-hooks
sha
:
v1.0.1
hooks
:
-
id
:
forbid-crlf
files
:
\.md$
-
id
:
remove-crlf
files
:
\.md$
-
id
:
forbid-tabs
files
:
\.md$
-
id
:
remove-tabs
files
:
\.md$
examples/deepvoice3/train.py
浏览文件 @
25883dcd
...
@@ -28,22 +28,21 @@ if __name__ == "__main__":
...
@@ -28,22 +28,21 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
"Train a deepvoice 3 model with LJSpeech dataset."
)
description
=
"Train a deepvoice 3 model with LJSpeech dataset."
)
parser
.
add_argument
(
"-c"
,
"--config"
,
type
=
str
,
help
=
"experimrnt config"
)
parser
.
add_argument
(
"-c"
,
"--config"
,
type
=
str
,
help
=
"experimrnt config"
)
parser
.
add_argument
(
"-s"
,
parser
.
add_argument
(
"-s"
,
"--data"
,
"--data"
,
type
=
str
,
type
=
str
,
default
=
"/workspace/datasets/LJSpeech-1.1/"
,
default
=
"/workspace/datasets/LJSpeech-1.1/"
,
help
=
"The path of the LJSpeech dataset."
)
help
=
"The path of the LJSpeech dataset."
)
parser
.
add_argument
(
"-r"
,
"--resume"
,
type
=
str
,
help
=
"checkpoint to load"
)
parser
.
add_argument
(
"-r"
,
"--resume"
,
type
=
str
,
help
=
"checkpoint to load"
)
parser
.
add_argument
(
"-o"
,
parser
.
add_argument
(
"-o"
,
"--output"
,
"--output"
,
type
=
str
,
type
=
str
,
default
=
"result"
,
default
=
"result"
,
help
=
"The directory to save result."
)
help
=
"The directory to save result."
)
parser
.
add_argument
(
"-g"
,
parser
.
add_argument
(
"--device"
,
"-g"
,
"--device"
,
type
=
int
,
default
=-
1
,
help
=
"device to use"
)
type
=
int
,
default
=-
1
,
help
=
"device to use"
)
args
,
_
=
parser
.
parse_known_args
()
args
,
_
=
parser
.
parse_known_args
()
with
open
(
args
.
config
,
'rt'
)
as
f
:
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
ruamel
.
yaml
.
safe_load
(
f
)
config
=
ruamel
.
yaml
.
safe_load
(
f
)
...
@@ -84,18 +83,16 @@ if __name__ == "__main__":
...
@@ -84,18 +83,16 @@ if __name__ == "__main__":
train_config
=
config
[
"train"
]
train_config
=
config
[
"train"
]
batch_size
=
train_config
[
"batch_size"
]
batch_size
=
train_config
[
"batch_size"
]
text_lengths
=
[
len
(
example
[
2
])
for
example
in
meta
]
text_lengths
=
[
len
(
example
[
2
])
for
example
in
meta
]
sampler
=
PartialyRandomizedSimilarTimeLengthSampler
(
sampler
=
PartialyRandomizedSimilarTimeLengthSampler
(
text_lengths
,
text_lengths
,
batch_size
)
batch_size
)
# some hyperparameters affect how we process data, so create a data collector!
# some hyperparameters affect how we process data, so create a data collector!
model_config
=
config
[
"model"
]
model_config
=
config
[
"model"
]
downsample_factor
=
model_config
[
"downsample_factor"
]
downsample_factor
=
model_config
[
"downsample_factor"
]
r
=
model_config
[
"outputs_per_step"
]
r
=
model_config
[
"outputs_per_step"
]
collector
=
DataCollector
(
downsample_factor
=
downsample_factor
,
r
=
r
)
collector
=
DataCollector
(
downsample_factor
=
downsample_factor
,
r
=
r
)
ljspeech_loader
=
DataCargo
(
ljspeech
,
ljspeech_loader
=
DataCargo
(
batch_fn
=
collector
,
ljspeech
,
batch_fn
=
collector
,
batch_size
=
batch_size
,
sampler
=
sampler
)
batch_size
=
batch_size
,
sampler
=
sampler
)
# =========================model=========================
# =========================model=========================
if
args
.
device
==
-
1
:
if
args
.
device
==
-
1
:
...
@@ -131,15 +128,14 @@ if __name__ == "__main__":
...
@@ -131,15 +128,14 @@ if __name__ == "__main__":
window_ahead
=
model_config
[
"window_ahead"
]
window_ahead
=
model_config
[
"window_ahead"
]
key_projection
=
model_config
[
"key_projection"
]
key_projection
=
model_config
[
"key_projection"
]
value_projection
=
model_config
[
"value_projection"
]
value_projection
=
model_config
[
"value_projection"
]
dv3
=
make_model
(
n_speakers
,
speaker_dim
,
speaker_embed_std
,
embed_dim
,
dv3
=
make_model
(
padding_idx
,
embedding_std
,
max_positions
,
n_vocab
,
n_speakers
,
speaker_dim
,
speaker_embed_std
,
embed_dim
,
padding_idx
,
freeze_embedding
,
filter_size
,
encoder_channels
,
embedding_std
,
max_positions
,
n_vocab
,
freeze_embedding
,
n_mels
,
decoder_channels
,
r
,
filter_size
,
encoder_channels
,
n_mels
,
decoder_channels
,
r
,
trainable_positional_encodings
,
use_memory_mask
,
trainable_positional_encodings
,
use_memory_mask
,
query_position_rate
,
key_position_rate
,
query_position_rate
,
key_position_rate
,
window_backward
,
window_backward
,
window_ahead
,
key_projection
,
window_ahead
,
key_projection
,
value_projection
,
downsample_factor
,
value_projection
,
downsample_factor
,
linear_dim
,
linear_dim
,
use_decoder_states
,
converter_channels
,
dropout
)
use_decoder_states
,
converter_channels
,
dropout
)
# =========================loss=========================
# =========================loss=========================
loss_config
=
config
[
"loss"
]
loss_config
=
config
[
"loss"
]
...
@@ -149,7 +145,8 @@ if __name__ == "__main__":
...
@@ -149,7 +145,8 @@ if __name__ == "__main__":
priority_freq_weight
=
loss_config
[
"priority_freq_weight"
]
priority_freq_weight
=
loss_config
[
"priority_freq_weight"
]
binary_divergence_weight
=
loss_config
[
"binary_divergence_weight"
]
binary_divergence_weight
=
loss_config
[
"binary_divergence_weight"
]
guided_attention_sigma
=
loss_config
[
"guided_attention_sigma"
]
guided_attention_sigma
=
loss_config
[
"guided_attention_sigma"
]
criterion
=
TTSLoss
(
masked_weight
=
masked_weight
,
criterion
=
TTSLoss
(
masked_weight
=
masked_weight
,
priority_bin
=
priority_bin
,
priority_bin
=
priority_bin
,
priority_weight
=
priority_freq_weight
,
priority_weight
=
priority_freq_weight
,
binary_divergence_weight
=
binary_divergence_weight
,
binary_divergence_weight
=
binary_divergence_weight
,
...
@@ -169,7 +166,8 @@ if __name__ == "__main__":
...
@@ -169,7 +166,8 @@ if __name__ == "__main__":
beta1
=
optim_config
[
"beta1"
]
beta1
=
optim_config
[
"beta1"
]
beta2
=
optim_config
[
"beta2"
]
beta2
=
optim_config
[
"beta2"
]
epsilon
=
optim_config
[
"epsilon"
]
epsilon
=
optim_config
[
"epsilon"
]
optim
=
fluid
.
optimizer
.
Adam
(
lr_scheduler
,
optim
=
fluid
.
optimizer
.
Adam
(
lr_scheduler
,
beta1
,
beta1
,
beta2
,
beta2
,
epsilon
=
epsilon
,
epsilon
=
epsilon
,
...
@@ -183,8 +181,8 @@ if __name__ == "__main__":
...
@@ -183,8 +181,8 @@ if __name__ == "__main__":
# =========================link(dataloader, paddle)=========================
# =========================link(dataloader, paddle)=========================
# CAUTION: it does not return a DataLoader
# CAUTION: it does not return a DataLoader
loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
10
,
loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
return_list
=
True
)
capacity
=
10
,
return_list
=
True
)
loader
.
set_batch_generator
(
ljspeech_loader
,
places
=
place
)
loader
.
set_batch_generator
(
ljspeech_loader
,
places
=
place
)
# tensorboard & checkpoint preparation
# tensorboard & checkpoint preparation
...
@@ -247,7 +245,8 @@ if __name__ == "__main__":
...
@@ -247,7 +245,8 @@ if __name__ == "__main__":
# TODO: clean code
# TODO: clean code
# train state saving, the first sentence in the batch
# train state saving, the first sentence in the batch
if
global_step
%
snap_interval
==
0
:
if
global_step
%
snap_interval
==
0
:
save_state
(
state_dir
,
save_state
(
state_dir
,
writer
,
writer
,
global_step
,
global_step
,
mel_input
=
downsampled_mel_specs
,
mel_input
=
downsampled_mel_specs
,
...
@@ -275,16 +274,16 @@ if __name__ == "__main__":
...
@@ -275,16 +274,16 @@ if __name__ == "__main__":
"Some have accepted this as a miracle without any physical explanation."
,
"Some have accepted this as a miracle without any physical explanation."
,
]
]
for
idx
,
sent
in
enumerate
(
sentences
):
for
idx
,
sent
in
enumerate
(
sentences
):
wav
,
attn
=
eval_model
(
dv3
,
sent
,
wav
,
attn
=
eval_model
(
replace_pronounciation_prob
,
dv3
,
sent
,
replace_pronounciation_prob
,
min_level_db
,
ref_level_db
,
min_level_db
,
ref_level_db
,
power
,
n_iter
,
power
,
n_iter
,
win_length
,
win_length
,
hop_length
,
preemphasis
)
hop_length
,
preemphasis
)
wav_path
=
os
.
path
.
join
(
wav_path
=
os
.
path
.
join
(
state_dir
,
"waveform"
,
state_dir
,
"waveform"
,
"eval_sample_{:09d}.wav"
.
format
(
global_step
))
"eval_sample_{:09d}.wav"
.
format
(
global_step
))
sf
.
write
(
wav_path
,
wav
,
sample_rate
)
sf
.
write
(
wav_path
,
wav
,
sample_rate
)
writer
.
add_audio
(
"eval_sample_{}"
.
format
(
idx
),
writer
.
add_audio
(
"eval_sample_{}"
.
format
(
idx
),
wav
,
wav
,
global_step
,
global_step
,
sample_rate
=
sample_rate
)
sample_rate
=
sample_rate
)
...
@@ -292,7 +291,8 @@ if __name__ == "__main__":
...
@@ -292,7 +291,8 @@ if __name__ == "__main__":
state_dir
,
"alignments"
,
state_dir
,
"alignments"
,
"eval_sample_attn_{:09d}.png"
.
format
(
global_step
))
"eval_sample_attn_{:09d}.png"
.
format
(
global_step
))
plot_alignment
(
attn
,
attn_path
)
plot_alignment
(
attn
,
attn_path
)
writer
.
add_image
(
"eval_sample_attn{}"
.
format
(
idx
),
writer
.
add_image
(
"eval_sample_attn{}"
.
format
(
idx
),
cm
.
viridis
(
attn
),
cm
.
viridis
(
attn
),
global_step
,
global_step
,
dataformats
=
"HWC"
)
dataformats
=
"HWC"
)
...
...
parakeet/model
s/waveflow/README.md
→
example
s/waveflow/README.md
浏览文件 @
25883dcd
parakeet/model
s/waveflow/benchmark.py
→
example
s/waveflow/benchmark.py
浏览文件 @
25883dcd
...
@@ -2,35 +2,47 @@ import os
...
@@ -2,35 +2,47 @@ import os
import
random
import
random
from
pprint
import
pprint
from
pprint
import
pprint
import
json
argparse
import
argparse
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
from
paddle
import
fluid
from
paddle
import
fluid
import
utils
import
utils
from
waveflow
import
WaveFlow
from
parakeet.models.
waveflow
import
WaveFlow
def
add_options_to_parser
(
parser
):
def
add_options_to_parser
(
parser
):
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
help
=
"general name of the model"
)
help
=
"general name of the model"
)
parser
.
add_argument
(
'--name'
,
type
=
str
,
parser
.
add_argument
(
help
=
"specific name of the training model"
)
'--name'
,
type
=
str
,
help
=
"specific name of the training model"
)
parser
.
add_argument
(
'--root'
,
type
=
str
,
parser
.
add_argument
(
help
=
"root path of the LJSpeech dataset"
)
'--root'
,
type
=
str
,
help
=
"root path of the LJSpeech dataset"
)
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
help
=
"option to use gpu training"
)
help
=
"option to use gpu training"
)
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
help
=
(
"which iteration of checkpoint to load, "
help
=
(
"which iteration of checkpoint to load, "
"default to load the latest checkpoint"
))
"default to load the latest checkpoint"
))
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
help
=
"path of the checkpoint to load"
)
help
=
"path of the checkpoint to load"
)
def
benchmark
(
config
):
def
benchmark
(
config
):
pprint
(
jsonargparse
.
namespace_to_dict
(
config
))
pprint
(
vars
(
config
))
# Get checkpoint directory path.
# Get checkpoint directory path.
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
...
@@ -58,9 +70,8 @@ def benchmark(config):
...
@@ -58,9 +70,8 @@ def benchmark(config):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# Create parser.
# Create parser.
parser
=
jsonargparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
"Synthesize audio using WaveNet model"
,
description
=
"Synthesize audio using WaveNet model"
)
formatter_class
=
'default_argparse'
)
add_options_to_parser
(
parser
)
add_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
...
@@ -68,4 +79,5 @@ if __name__ == "__main__":
...
@@ -68,4 +79,5 @@ if __name__ == "__main__":
# For conflicting updates to the same field,
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
# the preceding update will be overwritten by the following one.
config
=
parser
.
parse_args
()
config
=
parser
.
parse_args
()
config
=
utils
.
add_yaml_config
(
config
)
benchmark
(
config
)
benchmark
(
config
)
parakeet/model
s/waveflow/configs/waveflow_ljspeech.yaml
→
example
s/waveflow/configs/waveflow_ljspeech.yaml
浏览文件 @
25883dcd
文件已移动
parakeet/model
s/waveflow/synthesis.py
→
example
s/waveflow/synthesis.py
浏览文件 @
25883dcd
...
@@ -2,40 +2,58 @@ import os
...
@@ -2,40 +2,58 @@ import os
import
random
import
random
from
pprint
import
pprint
from
pprint
import
pprint
import
json
argparse
import
argparse
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
from
paddle
import
fluid
from
paddle
import
fluid
import
utils
import
utils
from
waveflow
import
WaveFlow
from
parakeet.models.
waveflow
import
WaveFlow
def
add_options_to_parser
(
parser
):
def
add_options_to_parser
(
parser
):
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
help
=
"general name of the model"
)
help
=
"general name of the model"
)
parser
.
add_argument
(
'--name'
,
type
=
str
,
parser
.
add_argument
(
help
=
"specific name of the training model"
)
'--name'
,
type
=
str
,
help
=
"specific name of the training model"
)
parser
.
add_argument
(
'--root'
,
type
=
str
,
parser
.
add_argument
(
help
=
"root path of the LJSpeech dataset"
)
'--root'
,
type
=
str
,
help
=
"root path of the LJSpeech dataset"
)
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
help
=
"option to use gpu training"
)
help
=
"option to use gpu training"
)
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
help
=
(
"which iteration of checkpoint to load, "
help
=
(
"which iteration of checkpoint to load, "
"default to load the latest checkpoint"
))
"default to load the latest checkpoint"
))
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
help
=
"path of the checkpoint to load"
)
help
=
"path of the checkpoint to load"
)
parser
.
add_argument
(
'--output'
,
type
=
str
,
default
=
"./syn_audios"
,
parser
.
add_argument
(
'--output'
,
type
=
str
,
default
=
"./syn_audios"
,
help
=
"path to write synthesized audio files"
)
help
=
"path to write synthesized audio files"
)
parser
.
add_argument
(
'--sample'
,
type
=
int
,
default
=
None
,
parser
.
add_argument
(
'--sample'
,
type
=
int
,
default
=
None
,
help
=
"which of the valid samples to synthesize audio"
)
help
=
"which of the valid samples to synthesize audio"
)
def
synthesize
(
config
):
def
synthesize
(
config
):
pprint
(
jsonargparse
.
namespace_to_dict
(
config
))
pprint
(
vars
(
config
))
# Get checkpoint directory path.
# Get checkpoint directory path.
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
...
@@ -72,9 +90,8 @@ def synthesize(config):
...
@@ -72,9 +90,8 @@ def synthesize(config):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# Create parser.
# Create parser.
parser
=
jsonargparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
"Synthesize audio using WaveNet model"
,
description
=
"Synthesize audio using WaveNet model"
)
formatter_class
=
'default_argparse'
)
add_options_to_parser
(
parser
)
add_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
...
@@ -82,4 +99,5 @@ if __name__ == "__main__":
...
@@ -82,4 +99,5 @@ if __name__ == "__main__":
# For conflicting updates to the same field,
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
# the preceding update will be overwritten by the following one.
config
=
parser
.
parse_args
()
config
=
parser
.
parse_args
()
config
=
utils
.
add_yaml_config
(
config
)
synthesize
(
config
)
synthesize
(
config
)
parakeet/model
s/waveflow/train.py
→
example
s/waveflow/train.py
浏览文件 @
25883dcd
...
@@ -4,34 +4,48 @@ import subprocess
...
@@ -4,34 +4,48 @@ import subprocess
import
time
import
time
from
pprint
import
pprint
from
pprint
import
pprint
import
json
argparse
import
argparse
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
from
paddle
import
fluid
from
paddle
import
fluid
from
tensorboardX
import
SummaryWriter
from
tensorboardX
import
SummaryWriter
import
slurm
import
utils
import
utils
from
waveflow
import
WaveFlow
from
parakeet.models.
waveflow
import
WaveFlow
def
add_options_to_parser
(
parser
):
def
add_options_to_parser
(
parser
):
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'waveflow'
,
help
=
"general name of the model"
)
help
=
"general name of the model"
)
parser
.
add_argument
(
'--name'
,
type
=
str
,
parser
.
add_argument
(
help
=
"specific name of the training model"
)
'--name'
,
type
=
str
,
help
=
"specific name of the training model"
)
parser
.
add_argument
(
'--root'
,
type
=
str
,
parser
.
add_argument
(
help
=
"root path of the LJSpeech dataset"
)
'--root'
,
type
=
str
,
help
=
"root path of the LJSpeech dataset"
)
parser
.
add_argument
(
'--parallel'
,
type
=
bool
,
default
=
True
,
parser
.
add_argument
(
'--parallel'
,
type
=
utils
.
str2bool
,
default
=
True
,
help
=
"option to use data parallel training"
)
help
=
"option to use data parallel training"
)
parser
.
add_argument
(
'--use_gpu'
,
type
=
bool
,
default
=
True
,
parser
.
add_argument
(
'--use_gpu'
,
type
=
utils
.
str2bool
,
default
=
True
,
help
=
"option to use gpu training"
)
help
=
"option to use gpu training"
)
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
parser
.
add_argument
(
'--iteration'
,
type
=
int
,
default
=
None
,
help
=
(
"which iteration of checkpoint to load, "
help
=
(
"which iteration of checkpoint to load, "
"default to load the latest checkpoint"
))
"default to load the latest checkpoint"
))
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
parser
.
add_argument
(
'--checkpoint'
,
type
=
str
,
default
=
None
,
help
=
"path of the checkpoint to load"
)
help
=
"path of the checkpoint to load"
)
...
@@ -45,12 +59,13 @@ def train(config):
...
@@ -45,12 +59,13 @@ def train(config):
if
rank
==
0
:
if
rank
==
0
:
# Print the whole config setting.
# Print the whole config setting.
pprint
(
jsonargparse
.
namespace_to_dict
(
config
))
pprint
(
vars
(
config
))
# Make checkpoint directory.
# Make checkpoint directory.
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
run_dir
=
os
.
path
.
join
(
"runs"
,
config
.
model
,
config
.
name
)
checkpoint_dir
=
os
.
path
.
join
(
run_dir
,
"checkpoint"
)
checkpoint_dir
=
os
.
path
.
join
(
run_dir
,
"checkpoint"
)
os
.
makedirs
(
checkpoint_dir
,
exist_ok
=
True
)
if
not
os
.
path
.
exists
(
checkpoint_dir
):
os
.
makedirs
(
checkpoint_dir
)
# Create tensorboard logger.
# Create tensorboard logger.
tb
=
SummaryWriter
(
os
.
path
.
join
(
run_dir
,
"logs"
))
\
tb
=
SummaryWriter
(
os
.
path
.
join
(
run_dir
,
"logs"
))
\
...
@@ -102,8 +117,8 @@ def train(config):
...
@@ -102,8 +117,8 @@ def train(config):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# Create parser.
# Create parser.
parser
=
jsonargparse
.
ArgumentParser
(
description
=
"Train WaveFlow model"
,
parser
=
argparse
.
ArgumentParser
(
description
=
"Train WaveFlow model"
)
formatter_class
=
'default_argparse'
)
#
formatter_class='default_argparse')
add_options_to_parser
(
parser
)
add_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
utils
.
add_config_options_to_parser
(
parser
)
...
@@ -111,4 +126,5 @@ if __name__ == "__main__":
...
@@ -111,4 +126,5 @@ if __name__ == "__main__":
# For conflicting updates to the same field,
# For conflicting updates to the same field,
# the preceding update will be overwritten by the following one.
# the preceding update will be overwritten by the following one.
config
=
parser
.
parse_args
()
config
=
parser
.
parse_args
()
config
=
utils
.
add_yaml_config
(
config
)
train
(
config
)
train
(
config
)
parakeet/model
s/waveflow/utils.py
→
example
s/waveflow/utils.py
浏览文件 @
25883dcd
...
@@ -2,59 +2,96 @@ import itertools
...
@@ -2,59 +2,96 @@ import itertools
import
os
import
os
import
time
import
time
import
jsonargparse
import
argparse
import
ruamel.yaml
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
def
str2bool
(
v
):
return
v
.
lower
()
in
(
"true"
,
"t"
,
"1"
)
def
add_config_options_to_parser
(
parser
):
def
add_config_options_to_parser
(
parser
):
parser
.
add_argument
(
'--valid_size'
,
type
=
int
,
parser
.
add_argument
(
help
=
"size of the valid dataset"
)
'--valid_size'
,
type
=
int
,
help
=
"size of the valid dataset"
)
parser
.
add_argument
(
'--segment_length'
,
type
=
int
,
parser
.
add_argument
(
'--segment_length'
,
type
=
int
,
help
=
"the length of audio clip for training"
)
help
=
"the length of audio clip for training"
)
parser
.
add_argument
(
'--sample_rate'
,
type
=
int
,
parser
.
add_argument
(
help
=
"sampling rate of audio data file"
)
'--sample_rate'
,
type
=
int
,
help
=
"sampling rate of audio data file"
)
parser
.
add_argument
(
'--fft_window_shift'
,
type
=
int
,
parser
.
add_argument
(
'--fft_window_shift'
,
type
=
int
,
help
=
"the shift of fft window for each frame"
)
help
=
"the shift of fft window for each frame"
)
parser
.
add_argument
(
'--fft_window_size'
,
type
=
int
,
parser
.
add_argument
(
'--fft_window_size'
,
type
=
int
,
help
=
"the size of fft window for each frame"
)
help
=
"the size of fft window for each frame"
)
parser
.
add_argument
(
'--fft_size'
,
type
=
int
,
parser
.
add_argument
(
help
=
"the size of fft filter on each frame"
)
'--fft_size'
,
type
=
int
,
help
=
"the size of fft filter on each frame"
)
parser
.
add_argument
(
'--mel_bands'
,
type
=
int
,
parser
.
add_argument
(
'--mel_bands'
,
type
=
int
,
help
=
"the number of mel bands when calculating mel spectrograms"
)
help
=
"the number of mel bands when calculating mel spectrograms"
)
parser
.
add_argument
(
'--mel_fmin'
,
type
=
float
,
parser
.
add_argument
(
'--mel_fmin'
,
type
=
float
,
help
=
"lowest frequency in calculating mel spectrograms"
)
help
=
"lowest frequency in calculating mel spectrograms"
)
parser
.
add_argument
(
'--mel_fmax'
,
type
=
float
,
parser
.
add_argument
(
'--mel_fmax'
,
type
=
float
,
help
=
"highest frequency in calculating mel spectrograms"
)
help
=
"highest frequency in calculating mel spectrograms"
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
parser
.
add_argument
(
help
=
"seed of random initialization for the model"
)
'--seed'
,
type
=
int
,
help
=
"seed of random initialization for the model"
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
)
parser
.
add_argument
(
'--batch_size'
,
type
=
int
,
parser
.
add_argument
(
help
=
"batch size for training"
)
'--batch_size'
,
type
=
int
,
help
=
"batch size for training"
)
parser
.
add_argument
(
'--test_every'
,
type
=
int
,
parser
.
add_argument
(
help
=
"test interval during training"
)
'--test_every'
,
type
=
int
,
help
=
"test interval during training"
)
parser
.
add_argument
(
'--save_every'
,
type
=
int
,
parser
.
add_argument
(
'--save_every'
,
type
=
int
,
help
=
"checkpointing interval during training"
)
help
=
"checkpointing interval during training"
)
parser
.
add_argument
(
'--max_iterations'
,
type
=
int
,
parser
.
add_argument
(
help
=
"maximum training iterations"
)
'--max_iterations'
,
type
=
int
,
help
=
"maximum training iterations"
)
parser
.
add_argument
(
'--sigma'
,
type
=
float
,
parser
.
add_argument
(
'--sigma'
,
type
=
float
,
help
=
"standard deviation of the latent Gaussian variable"
)
help
=
"standard deviation of the latent Gaussian variable"
)
parser
.
add_argument
(
'--n_flows'
,
type
=
int
,
parser
.
add_argument
(
'--n_flows'
,
type
=
int
,
help
=
"number of flows"
)
help
=
"number of flows"
)
parser
.
add_argument
(
parser
.
add_argument
(
'--n_group'
,
type
=
int
,
'--n_group'
,
type
=
int
,
help
=
"number of adjacent audio samples to squeeze into one column"
)
help
=
"number of adjacent audio samples to squeeze into one column"
)
parser
.
add_argument
(
'--n_layers'
,
type
=
int
,
parser
.
add_argument
(
'--n_layers'
,
type
=
int
,
help
=
"number of conv2d layer in one wavenet-like flow architecture"
)
help
=
"number of conv2d layer in one wavenet-like flow architecture"
)
parser
.
add_argument
(
'--n_channels'
,
type
=
int
,
parser
.
add_argument
(
help
=
"number of residual channels in flow"
)
'--n_channels'
,
type
=
int
,
help
=
"number of residual channels in flow"
)
parser
.
add_argument
(
'--kernel_h'
,
type
=
int
,
parser
.
add_argument
(
'--kernel_h'
,
type
=
int
,
help
=
"height of the kernel in the conv2d layer"
)
help
=
"height of the kernel in the conv2d layer"
)
parser
.
add_argument
(
'--kernel_w'
,
type
=
int
,
parser
.
add_argument
(
help
=
"width of the kernel in the conv2d layer"
)
'--kernel_w'
,
type
=
int
,
help
=
"width of the kernel in the conv2d layer"
)
parser
.
add_argument
(
'--config'
,
type
=
str
,
help
=
"Path to the config file."
)
parser
.
add_argument
(
'--config'
,
action
=
jsonargparse
.
ActionConfigFile
)
def
add_yaml_config
(
config
):
with
open
(
config
.
config
,
'rt'
)
as
f
:
yaml_cfg
=
ruamel
.
yaml
.
safe_load
(
f
)
cfg_vars
=
vars
(
config
)
for
k
,
v
in
yaml_cfg
.
items
():
if
k
in
cfg_vars
and
cfg_vars
[
k
]
is
not
None
:
continue
cfg_vars
[
k
]
=
v
return
config
def
load_latest_checkpoint
(
checkpoint_dir
,
rank
=
0
):
def
load_latest_checkpoint
(
checkpoint_dir
,
rank
=
0
):
...
@@ -84,8 +121,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
...
@@ -84,8 +121,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
handle
.
write
(
"model_checkpoint_path: step-{}"
.
format
(
iteration
))
handle
.
write
(
"model_checkpoint_path: step-{}"
.
format
(
iteration
))
def
load_parameters
(
checkpoint_dir
,
rank
,
model
,
optimizer
=
None
,
def
load_parameters
(
checkpoint_dir
,
iteration
=
None
,
file_path
=
None
):
rank
,
model
,
optimizer
=
None
,
iteration
=
None
,
file_path
=
None
):
if
file_path
is
None
:
if
file_path
is
None
:
if
iteration
is
None
:
if
iteration
is
None
:
iteration
=
load_latest_checkpoint
(
checkpoint_dir
,
rank
)
iteration
=
load_latest_checkpoint
(
checkpoint_dir
,
rank
)
...
...
parakeet/datasets/ljspeech.py
浏览文件 @
25883dcd
...
@@ -5,21 +5,26 @@ import librosa
...
@@ -5,21 +5,26 @@ import librosa
from
..
import
g2p
from
..
import
g2p
from
..data.sampler
import
SequentialSampler
,
RandomSampler
,
BatchSampler
from
..data.sampler
import
SequentialSampler
,
RandomSampler
,
BatchSampler
from
..data.dataset
import
Dataset
from
..data.dataset
import
Dataset
Mixin
from
..data.datacargo
import
DataCargo
from
..data.datacargo
import
DataCargo
from
..data.batch
import
TextIDBatcher
,
SpecBatcher
from
..data.batch
import
TextIDBatcher
,
SpecBatcher
class
LJSpeech
(
Dataset
):
class
LJSpeech
(
Dataset
Mixin
):
def
__init__
(
self
,
root
):
def
__init__
(
self
,
root
):
super
(
LJSpeech
,
self
).
__init__
()
super
(
LJSpeech
,
self
).
__init__
()
assert
isinstance
(
root
,
(
str
,
Path
)),
"root should be a string or Path object"
assert
isinstance
(
root
,
(
str
,
Path
)),
"root should be a string or Path object"
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
root
=
root
if
isinstance
(
root
,
Path
)
else
Path
(
root
)
self
.
metadata
=
self
.
_prepare_metadata
()
self
.
metadata
=
self
.
_prepare_metadata
()
def
_prepare_metadata
(
self
):
def
_prepare_metadata
(
self
):
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
csv_path
=
self
.
root
.
joinpath
(
"metadata.csv"
)
metadata
=
pd
.
read_csv
(
csv_path
,
sep
=
"|"
,
header
=
None
,
quoting
=
3
,
metadata
=
pd
.
read_csv
(
csv_path
,
sep
=
"|"
,
header
=
None
,
quoting
=
3
,
names
=
[
"fname"
,
"raw_text"
,
"normalized_text"
])
names
=
[
"fname"
,
"raw_text"
,
"normalized_text"
])
return
metadata
return
metadata
...
@@ -35,7 +40,9 @@ class LJSpeech(Dataset):
...
@@ -35,7 +40,9 @@ class LJSpeech(Dataset):
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
wav_path
=
self
.
root
.
joinpath
(
"wavs"
,
fname
+
".wav"
)
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav
,
sample_rate
=
librosa
.
load
(
wav_path
,
sr
=
None
)
# we would rather use functor to hold its parameters
wav
,
sample_rate
=
librosa
.
load
(
wav_path
,
sr
=
None
)
# we would rather use functor to hold its parameters
trimed
,
_
=
librosa
.
effects
.
trim
(
wav
)
trimed
,
_
=
librosa
.
effects
.
trim
(
wav
)
preemphasized
=
librosa
.
effects
.
preemphasis
(
trimed
)
preemphasized
=
librosa
.
effects
.
preemphasis
(
trimed
)
D
=
librosa
.
stft
(
preemphasized
)
D
=
librosa
.
stft
(
preemphasized
)
...
@@ -50,8 +57,10 @@ class LJSpeech(Dataset):
...
@@ -50,8 +57,10 @@ class LJSpeech(Dataset):
mel
=
np
.
clip
((
mel
-
ref_db
+
max_db
)
/
max_db
,
1e-8
,
1
)
mel
=
np
.
clip
((
mel
-
ref_db
+
max_db
)
/
max_db
,
1e-8
,
1
)
mel
=
np
.
clip
((
mag
-
ref_db
+
max_db
)
/
max_db
,
1e-8
,
1
)
mel
=
np
.
clip
((
mag
-
ref_db
+
max_db
)
/
max_db
,
1e-8
,
1
)
phonemes
=
np
.
array
(
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
phonemes
=
np
.
array
(
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
g2p
.
en
.
text_to_sequence
(
normalized_text
),
dtype
=
np
.
int64
)
return
(
mag
,
mel
,
phonemes
)
# maybe we need to implement it as a map in the future
def
_batch_examples
(
self
,
minibatch
):
def
_batch_examples
(
self
,
minibatch
):
mag_batch
=
[]
mag_batch
=
[]
...
@@ -78,5 +87,3 @@ class LJSpeech(Dataset):
...
@@ -78,5 +87,3 @@ class LJSpeech(Dataset):
def
__len__
(
self
):
def
__len__
(
self
):
return
len
(
self
.
metadata
)
return
len
(
self
.
metadata
)
parakeet/models/waveflow/__init__.py
0 → 100644
浏览文件 @
25883dcd
from
parakeet.models.waveflow.waveflow
import
WaveFlow
parakeet/models/waveflow/data.py
浏览文件 @
25883dcd
...
@@ -5,10 +5,9 @@ import numpy as np
...
@@ -5,10 +5,9 @@ import numpy as np
from
paddle
import
fluid
from
paddle
import
fluid
from
parakeet.datasets
import
ljspeech
from
parakeet.datasets
import
ljspeech
from
parakeet.data
import
dataset
from
parakeet.data
import
SpecBatcher
,
WavBatcher
from
parakeet.data.batch
import
SpecBatcher
,
WavBatcher
from
parakeet.data
import
DataCargo
,
DatasetMixin
from
parakeet.data.datacargo
import
DataCargo
from
parakeet.data
import
DistributedSampler
,
BatchSampler
from
parakeet.data.sampler
import
DistributedSampler
,
BatchSampler
from
scipy.io.wavfile
import
read
from
scipy.io.wavfile
import
read
...
@@ -27,7 +26,7 @@ class Dataset(ljspeech.LJSpeech):
...
@@ -27,7 +26,7 @@ class Dataset(ljspeech.LJSpeech):
return
audio
return
audio
class
Subset
(
dataset
.
Dataset
):
class
Subset
(
DatasetMixin
):
def
__init__
(
self
,
dataset
,
indices
,
valid
):
def
__init__
(
self
,
dataset
,
indices
,
valid
):
self
.
dataset
=
dataset
self
.
dataset
=
dataset
self
.
indices
=
indices
self
.
indices
=
indices
...
@@ -36,14 +35,14 @@ class Subset(dataset.Dataset):
...
@@ -36,14 +35,14 @@ class Subset(dataset.Dataset):
def
get_mel
(
self
,
audio
):
def
get_mel
(
self
,
audio
):
spectrogram
=
librosa
.
core
.
stft
(
spectrogram
=
librosa
.
core
.
stft
(
audio
,
n_fft
=
self
.
config
.
fft_size
,
audio
,
n_fft
=
self
.
config
.
fft_size
,
hop_length
=
self
.
config
.
fft_window_shift
,
hop_length
=
self
.
config
.
fft_window_shift
,
win_length
=
self
.
config
.
fft_window_size
)
win_length
=
self
.
config
.
fft_window_size
)
spectrogram_magnitude
=
np
.
abs
(
spectrogram
)
spectrogram_magnitude
=
np
.
abs
(
spectrogram
)
# mel_filter_bank shape: [n_mels, 1 + n_fft/2]
# mel_filter_bank shape: [n_mels, 1 + n_fft/2]
mel_filter_bank
=
librosa
.
filters
.
mel
(
mel_filter_bank
=
librosa
.
filters
.
mel
(
sr
=
self
.
config
.
sample_rate
,
sr
=
self
.
config
.
sample_rate
,
n_fft
=
self
.
config
.
fft_size
,
n_fft
=
self
.
config
.
fft_size
,
n_mels
=
self
.
config
.
mel_bands
,
n_mels
=
self
.
config
.
mel_bands
,
fmin
=
self
.
config
.
mel_fmin
,
fmin
=
self
.
config
.
mel_fmin
,
...
@@ -70,10 +69,11 @@ class Subset(dataset.Dataset):
...
@@ -70,10 +69,11 @@ class Subset(dataset.Dataset):
if
audio
.
shape
[
0
]
>=
segment_length
:
if
audio
.
shape
[
0
]
>=
segment_length
:
max_audio_start
=
audio
.
shape
[
0
]
-
segment_length
max_audio_start
=
audio
.
shape
[
0
]
-
segment_length
audio_start
=
random
.
randint
(
0
,
max_audio_start
)
audio_start
=
random
.
randint
(
0
,
max_audio_start
)
audio
=
audio
[
audio_start
:
(
audio_start
+
segment_length
)]
audio
=
audio
[
audio_start
:
(
audio_start
+
segment_length
)]
else
:
else
:
audio
=
np
.
pad
(
audio
,
(
0
,
segment_length
-
audio
.
shape
[
0
]),
audio
=
np
.
pad
(
audio
,
(
0
,
segment_length
-
audio
.
shape
[
0
]),
mode
=
'constant'
,
constant_values
=
0
)
mode
=
'constant'
,
constant_values
=
0
)
# Normalize audio to the [-1, 1] range.
# Normalize audio to the [-1, 1] range.
audio
=
audio
.
astype
(
np
.
float32
)
/
32768.0
audio
=
audio
.
astype
(
np
.
float32
)
/
32768.0
...
@@ -112,8 +112,8 @@ class LJSpeech:
...
@@ -112,8 +112,8 @@ class LJSpeech:
sampler
=
DistributedSampler
(
len
(
trainset
),
nranks
,
rank
)
sampler
=
DistributedSampler
(
len
(
trainset
),
nranks
,
rank
)
total_bs
=
config
.
batch_size
total_bs
=
config
.
batch_size
assert
total_bs
%
nranks
==
0
assert
total_bs
%
nranks
==
0
train_sampler
=
BatchSampler
(
sampler
,
total_bs
//
nranks
,
train_sampler
=
BatchSampler
(
drop_last
=
True
)
sampler
,
total_bs
//
nranks
,
drop_last
=
True
)
trainloader
=
DataCargo
(
trainset
,
batch_sampler
=
train_sampler
)
trainloader
=
DataCargo
(
trainset
,
batch_sampler
=
train_sampler
)
trainreader
=
fluid
.
io
.
PyReader
(
capacity
=
50
,
return_list
=
True
)
trainreader
=
fluid
.
io
.
PyReader
(
capacity
=
50
,
return_list
=
True
)
...
...
parakeet/models/waveflow/waveflow.py
浏览文件 @
25883dcd
...
@@ -8,13 +8,18 @@ from paddle import fluid
...
@@ -8,13 +8,18 @@ from paddle import fluid
from
scipy.io.wavfile
import
write
from
scipy.io.wavfile
import
write
import
utils
import
utils
from
data
import
LJSpeech
from
.
data
import
LJSpeech
from
waveflow_modules
import
WaveFlowLoss
,
WaveFlowModule
from
.
waveflow_modules
import
WaveFlowLoss
,
WaveFlowModule
class
WaveFlow
():
class
WaveFlow
():
def
__init__
(
self
,
config
,
checkpoint_dir
,
parallel
=
False
,
rank
=
0
,
def
__init__
(
self
,
nranks
=
1
,
tb_logger
=
None
):
config
,
checkpoint_dir
,
parallel
=
False
,
rank
=
0
,
nranks
=
1
,
tb_logger
=
None
):
self
.
config
=
config
self
.
config
=
config
self
.
checkpoint_dir
=
checkpoint_dir
self
.
checkpoint_dir
=
checkpoint_dir
self
.
parallel
=
parallel
self
.
parallel
=
parallel
...
@@ -28,7 +33,7 @@ class WaveFlow():
...
@@ -28,7 +33,7 @@ class WaveFlow():
self
.
trainloader
=
dataset
.
trainloader
self
.
trainloader
=
dataset
.
trainloader
self
.
validloader
=
dataset
.
validloader
self
.
validloader
=
dataset
.
validloader
waveflow
=
WaveFlowModule
(
"waveflow"
,
config
)
waveflow
=
WaveFlowModule
(
config
)
# Dry run once to create and initalize all necessary parameters.
# Dry run once to create and initalize all necessary parameters.
audio
=
dg
.
to_variable
(
np
.
random
.
randn
(
1
,
16000
).
astype
(
np
.
float32
))
audio
=
dg
.
to_variable
(
np
.
random
.
randn
(
1
,
16000
).
astype
(
np
.
float32
))
...
@@ -38,11 +43,15 @@ class WaveFlow():
...
@@ -38,11 +43,15 @@ class WaveFlow():
if
training
:
if
training
:
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
config
.
learning_rate
)
learning_rate
=
config
.
learning_rate
,
parameter_list
=
waveflow
.
parameters
())
# Load parameters.
# Load parameters.
utils
.
load_parameters
(
self
.
checkpoint_dir
,
self
.
rank
,
utils
.
load_parameters
(
waveflow
,
optimizer
,
self
.
checkpoint_dir
,
self
.
rank
,
waveflow
,
optimizer
,
iteration
=
config
.
iteration
,
iteration
=
config
.
iteration
,
file_path
=
config
.
checkpoint
)
file_path
=
config
.
checkpoint
)
print
(
"Rank {}: checkpoint loaded."
.
format
(
self
.
rank
))
print
(
"Rank {}: checkpoint loaded."
.
format
(
self
.
rank
))
...
@@ -58,7 +67,10 @@ class WaveFlow():
...
@@ -58,7 +67,10 @@ class WaveFlow():
else
:
else
:
# Load parameters.
# Load parameters.
utils
.
load_parameters
(
self
.
checkpoint_dir
,
self
.
rank
,
waveflow
,
utils
.
load_parameters
(
self
.
checkpoint_dir
,
self
.
rank
,
waveflow
,
iteration
=
config
.
iteration
,
iteration
=
config
.
iteration
,
file_path
=
config
.
checkpoint
)
file_path
=
config
.
checkpoint
)
print
(
"Rank {}: checkpoint loaded."
.
format
(
self
.
rank
))
print
(
"Rank {}: checkpoint loaded."
.
format
(
self
.
rank
))
...
@@ -83,7 +95,8 @@ class WaveFlow():
...
@@ -83,7 +95,8 @@ class WaveFlow():
else
:
else
:
loss
.
backward
()
loss
.
backward
()
self
.
optimizer
.
minimize
(
loss
,
parameter_list
=
self
.
waveflow
.
parameters
())
self
.
optimizer
.
minimize
(
loss
,
parameter_list
=
self
.
waveflow
.
parameters
())
self
.
waveflow
.
clear_gradients
()
self
.
waveflow
.
clear_gradients
()
graph_time
=
time
.
time
()
graph_time
=
time
.
time
()
...
@@ -139,7 +152,8 @@ class WaveFlow():
...
@@ -139,7 +152,8 @@ class WaveFlow():
sample
=
config
.
sample
sample
=
config
.
sample
output
=
"{}/{}/iter-{}"
.
format
(
config
.
output
,
config
.
name
,
iteration
)
output
=
"{}/{}/iter-{}"
.
format
(
config
.
output
,
config
.
name
,
iteration
)
os
.
makedirs
(
output
,
exist_ok
=
True
)
if
not
os
.
path
.
exists
(
output
):
os
.
makedirs
(
output
)
mels_list
=
[
mels
for
_
,
mels
in
self
.
validloader
()]
mels_list
=
[
mels
for
_
,
mels
in
self
.
validloader
()]
if
sample
is
not
None
:
if
sample
is
not
None
:
...
@@ -155,8 +169,8 @@ class WaveFlow():
...
@@ -155,8 +169,8 @@ class WaveFlow():
audio
=
audio
[
0
]
audio
=
audio
[
0
]
audio_time
=
audio
.
shape
[
0
]
/
self
.
config
.
sample_rate
audio_time
=
audio
.
shape
[
0
]
/
self
.
config
.
sample_rate
print
(
"audio time {:.4f}, synthesis time {:.4f}"
.
format
(
print
(
"audio time {:.4f}, synthesis time {:.4f}"
.
format
(
audio_time
,
audio_time
,
syn_time
))
syn_time
))
# Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
# Denormalize audio from [-1, 1] to [-32768, 32768] int16 range.
audio
=
audio
.
numpy
()
*
32768.0
audio
=
audio
.
numpy
()
*
32768.0
...
@@ -180,8 +194,8 @@ class WaveFlow():
...
@@ -180,8 +194,8 @@ class WaveFlow():
syn_time
=
time
.
time
()
-
start_time
syn_time
=
time
.
time
()
-
start_time
audio_time
=
audio
.
shape
[
1
]
*
batch_size
/
self
.
config
.
sample_rate
audio_time
=
audio
.
shape
[
1
]
*
batch_size
/
self
.
config
.
sample_rate
print
(
"audio time {:.4f}, synthesis time {:.4f}"
.
format
(
print
(
"audio time {:.4f}, synthesis time {:.4f}"
.
format
(
audio_time
,
audio_time
,
syn_time
))
syn_time
))
print
(
"{} X real-time"
.
format
(
audio_time
/
syn_time
))
print
(
"{} X real-time"
.
format
(
audio_time
/
syn_time
))
def
save
(
self
,
iteration
):
def
save
(
self
,
iteration
):
...
...
parakeet/models/waveflow/waveflow_modules.py
浏览文件 @
25883dcd
...
@@ -3,22 +3,23 @@ import itertools
...
@@ -3,22 +3,23 @@ import itertools
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.dygraph
as
dg
import
paddle.fluid.dygraph
as
dg
from
paddle
import
fluid
from
paddle
import
fluid
from
parakeet.modules
import
conv
,
modules
,
weight_norm
from
parakeet.modules
import
weight_norm
def
set_param_attr
(
layer
,
c_in
=
1
):
def
get_param_attr
(
layer_type
,
filter_size
,
c_in
=
1
):
if
isinstance
(
layer
,
(
weight_norm
.
Conv2DTranspose
,
weight_norm
.
Conv2D
))
:
if
layer_type
==
"weight_norm"
:
k
=
np
.
sqrt
(
1.0
/
(
c_in
*
np
.
prod
(
layer
.
_
filter_size
)))
k
=
np
.
sqrt
(
1.0
/
(
c_in
*
np
.
prod
(
filter_size
)))
weight_init
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
k
,
high
=
k
)
weight_init
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
k
,
high
=
k
)
bias_init
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
k
,
high
=
k
)
bias_init
=
fluid
.
initializer
.
UniformInitializer
(
low
=-
k
,
high
=
k
)
elif
isinstance
(
layer
,
dg
.
Conv2D
)
:
elif
layer_type
==
"common"
:
weight_init
=
fluid
.
initializer
.
ConstantInitializer
(
0.0
)
weight_init
=
fluid
.
initializer
.
ConstantInitializer
(
0.0
)
bias_init
=
fluid
.
initializer
.
ConstantInitializer
(
0.0
)
bias_init
=
fluid
.
initializer
.
ConstantInitializer
(
0.0
)
else
:
else
:
raise
TypeError
(
"Unsupported layer type."
)
raise
TypeError
(
"Unsupported layer type."
)
layer
.
_param_attr
=
fluid
.
ParamAttr
(
initializer
=
weight_init
)
param_attr
=
fluid
.
ParamAttr
(
initializer
=
weight_init
)
layer
.
_bias_attr
=
fluid
.
ParamAttr
(
initializer
=
bias_init
)
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
bias_init
)
return
param_attr
,
bias_attr
def
unfold
(
x
,
n_group
):
def
unfold
(
x
,
n_group
):
...
@@ -48,20 +49,23 @@ class WaveFlowLoss:
...
@@ -48,20 +49,23 @@ class WaveFlowLoss:
class
Conditioner
(
dg
.
Layer
):
class
Conditioner
(
dg
.
Layer
):
def
__init__
(
self
,
name_scope
):
def
__init__
(
self
):
super
(
Conditioner
,
self
).
__init__
(
name_scope
)
super
(
Conditioner
,
self
).
__init__
()
upsample_factors
=
[
16
,
16
]
upsample_factors
=
[
16
,
16
]
self
.
upsample_conv2d
=
[]
self
.
upsample_conv2d
=
[]
for
s
in
upsample_factors
:
for
s
in
upsample_factors
:
in_channel
=
1
in_channel
=
1
conv_trans2d
=
modules
.
Conv2DTranspose
(
param_attr
,
bias_attr
=
get_param_attr
(
self
.
full_name
(),
"weight_norm"
,
(
3
,
2
*
s
),
c_in
=
in_channel
)
conv_trans2d
=
weight_norm
.
Conv2DTranspose
(
num_channels
=
in_channel
,
num_filters
=
1
,
num_filters
=
1
,
filter_size
=
(
3
,
2
*
s
),
filter_size
=
(
3
,
2
*
s
),
padding
=
(
1
,
s
//
2
),
padding
=
(
1
,
s
//
2
),
stride
=
(
1
,
s
))
stride
=
(
1
,
s
),
set_param_attr
(
conv_trans2d
,
c_in
=
in_channel
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
self
.
upsample_conv2d
.
append
(
conv_trans2d
)
self
.
upsample_conv2d
.
append
(
conv_trans2d
)
for
i
,
layer
in
enumerate
(
self
.
upsample_conv2d
):
for
i
,
layer
in
enumerate
(
self
.
upsample_conv2d
):
...
@@ -86,8 +90,8 @@ class Conditioner(dg.Layer):
...
@@ -86,8 +90,8 @@ class Conditioner(dg.Layer):
class
Flow
(
dg
.
Layer
):
class
Flow
(
dg
.
Layer
):
def
__init__
(
self
,
name_scope
,
config
):
def
__init__
(
self
,
config
):
super
(
Flow
,
self
).
__init__
(
name_scope
)
super
(
Flow
,
self
).
__init__
()
self
.
n_layers
=
config
.
n_layers
self
.
n_layers
=
config
.
n_layers
self
.
n_channels
=
config
.
n_channels
self
.
n_channels
=
config
.
n_channels
self
.
kernel_h
=
config
.
kernel_h
self
.
kernel_h
=
config
.
kernel_h
...
@@ -95,27 +99,34 @@ class Flow(dg.Layer):
...
@@ -95,27 +99,34 @@ class Flow(dg.Layer):
# Transform audio: [batch, 1, n_group, time/n_group]
# Transform audio: [batch, 1, n_group, time/n_group]
# => [batch, n_channels, n_group, time/n_group]
# => [batch, n_channels, n_group, time/n_group]
param_attr
,
bias_attr
=
get_param_attr
(
"weight_norm"
,
(
1
,
1
),
c_in
=
1
)
self
.
start
=
weight_norm
.
Conv2D
(
self
.
start
=
weight_norm
.
Conv2D
(
self
.
full_name
()
,
num_channels
=
1
,
num_filters
=
self
.
n_channels
,
num_filters
=
self
.
n_channels
,
filter_size
=
(
1
,
1
))
filter_size
=
(
1
,
1
),
set_param_attr
(
self
.
start
,
c_in
=
1
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
# Initializing last layer to 0 makes the affine coupling layers
# Initializing last layer to 0 makes the affine coupling layers
# do nothing at first. This helps with training stability
# do nothing at first. This helps with training stability
# output shape: [batch, 2, n_group, time/n_group]
# output shape: [batch, 2, n_group, time/n_group]
param_attr
,
bias_attr
=
get_param_attr
(
"common"
,
(
1
,
1
),
c_in
=
self
.
n_channels
)
self
.
end
=
dg
.
Conv2D
(
self
.
end
=
dg
.
Conv2D
(
self
.
full_name
()
,
num_channels
=
self
.
n_channels
,
num_filters
=
2
,
num_filters
=
2
,
filter_size
=
(
1
,
1
))
filter_size
=
(
1
,
1
),
set_param_attr
(
self
.
end
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
# receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze
# receiptive fileds: (kernel - 1) * sum(dilations) + 1 >= squeeze
dilation_dict
=
{
8
:
[
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
],
dilation_dict
=
{
8
:
[
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
],
16
:
[
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
],
16
:
[
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
],
32
:
[
1
,
2
,
4
,
1
,
2
,
4
,
1
,
2
],
32
:
[
1
,
2
,
4
,
1
,
2
,
4
,
1
,
2
],
64
:
[
1
,
2
,
4
,
8
,
16
,
1
,
2
,
4
],
64
:
[
1
,
2
,
4
,
8
,
16
,
1
,
2
,
4
],
128
:
[
1
,
2
,
4
,
8
,
16
,
32
,
64
,
1
]}
128
:
[
1
,
2
,
4
,
8
,
16
,
32
,
64
,
1
]
}
self
.
dilation_h_list
=
dilation_dict
[
config
.
n_group
]
self
.
dilation_h_list
=
dilation_dict
[
config
.
n_group
]
self
.
in_layers
=
[]
self
.
in_layers
=
[]
...
@@ -123,32 +134,42 @@ class Flow(dg.Layer):
...
@@ -123,32 +134,42 @@ class Flow(dg.Layer):
self
.
res_skip_layers
=
[]
self
.
res_skip_layers
=
[]
for
i
in
range
(
self
.
n_layers
):
for
i
in
range
(
self
.
n_layers
):
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_w
=
2
**
i
dilation_w
=
2
**
i
param_attr
,
bias_attr
=
get_param_attr
(
"weight_norm"
,
(
self
.
kernel_h
,
self
.
kernel_w
),
c_in
=
self
.
n_channels
)
in_layer
=
weight_norm
.
Conv2D
(
in_layer
=
weight_norm
.
Conv2D
(
self
.
full_name
()
,
num_channels
=
self
.
n_channels
,
num_filters
=
2
*
self
.
n_channels
,
num_filters
=
2
*
self
.
n_channels
,
filter_size
=
(
self
.
kernel_h
,
self
.
kernel_w
),
filter_size
=
(
self
.
kernel_h
,
self
.
kernel_w
),
dilation
=
(
dilation_h
,
dilation_w
))
dilation
=
(
dilation_h
,
dilation_w
),
set_param_attr
(
in_layer
,
c_in
=
self
.
n_channels
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
self
.
in_layers
.
append
(
in_layer
)
self
.
in_layers
.
append
(
in_layer
)
param_attr
,
bias_attr
=
get_param_attr
(
"weight_norm"
,
(
1
,
1
),
c_in
=
config
.
mel_bands
)
cond_layer
=
weight_norm
.
Conv2D
(
cond_layer
=
weight_norm
.
Conv2D
(
self
.
full_name
()
,
num_channels
=
config
.
mel_bands
,
num_filters
=
2
*
self
.
n_channels
,
num_filters
=
2
*
self
.
n_channels
,
filter_size
=
(
1
,
1
))
filter_size
=
(
1
,
1
),
set_param_attr
(
cond_layer
,
c_in
=
config
.
mel_bands
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
self
.
cond_layers
.
append
(
cond_layer
)
self
.
cond_layers
.
append
(
cond_layer
)
if
i
<
self
.
n_layers
-
1
:
if
i
<
self
.
n_layers
-
1
:
res_skip_channels
=
2
*
self
.
n_channels
res_skip_channels
=
2
*
self
.
n_channels
else
:
else
:
res_skip_channels
=
self
.
n_channels
res_skip_channels
=
self
.
n_channels
param_attr
,
bias_attr
=
get_param_attr
(
"weight_norm"
,
(
1
,
1
),
c_in
=
self
.
n_channels
)
res_skip_layer
=
weight_norm
.
Conv2D
(
res_skip_layer
=
weight_norm
.
Conv2D
(
self
.
full_name
()
,
num_channels
=
self
.
n_channels
,
num_filters
=
res_skip_channels
,
num_filters
=
res_skip_channels
,
filter_size
=
(
1
,
1
))
filter_size
=
(
1
,
1
),
set_param_attr
(
res_skip_layer
,
c_in
=
self
.
n_channels
)
param_attr
=
param_attr
,
bias_attr
=
bias_attr
)
self
.
res_skip_layers
.
append
(
res_skip_layer
)
self
.
res_skip_layers
.
append
(
res_skip_layer
)
self
.
add_sublayer
(
"in_layer_{}"
.
format
(
i
),
in_layer
)
self
.
add_sublayer
(
"in_layer_{}"
.
format
(
i
),
in_layer
)
...
@@ -162,14 +183,14 @@ class Flow(dg.Layer):
...
@@ -162,14 +183,14 @@ class Flow(dg.Layer):
for
i
in
range
(
self
.
n_layers
):
for
i
in
range
(
self
.
n_layers
):
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_w
=
2
**
i
dilation_w
=
2
**
i
# Pad height dim (n_group): causal convolution
# Pad height dim (n_group): causal convolution
# Pad width dim (time): dialated non-causal convolution
# Pad width dim (time): dialated non-causal convolution
pad_top
,
pad_bottom
=
(
self
.
kernel_h
-
1
)
*
dilation_h
,
0
pad_top
,
pad_bottom
=
(
self
.
kernel_h
-
1
)
*
dilation_h
,
0
pad_left
=
pad_right
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
pad_left
=
pad_right
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
audio_pad
=
fluid
.
layers
.
pad2d
(
audio
,
audio_pad
=
fluid
.
layers
.
pad2d
(
paddings
=
[
pad_top
,
pad_bottom
,
pad_left
,
pad_right
])
audio
,
paddings
=
[
pad_top
,
pad_bottom
,
pad_left
,
pad_right
])
hidden
=
self
.
in_layers
[
i
](
audio_pad
)
hidden
=
self
.
in_layers
[
i
](
audio_pad
)
cond_hidden
=
self
.
cond_layers
[
i
](
mel
)
cond_hidden
=
self
.
cond_layers
[
i
](
mel
)
...
@@ -196,7 +217,7 @@ class Flow(dg.Layer):
...
@@ -196,7 +217,7 @@ class Flow(dg.Layer):
for
i
in
range
(
self
.
n_layers
):
for
i
in
range
(
self
.
n_layers
):
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_h
=
self
.
dilation_h_list
[
i
]
dilation_w
=
2
**
i
dilation_w
=
2
**
i
state_size
=
dilation_h
*
(
self
.
kernel_h
-
1
)
state_size
=
dilation_h
*
(
self
.
kernel_h
-
1
)
queue
=
queues
[
i
]
queue
=
queues
[
i
]
...
@@ -206,7 +227,7 @@ class Flow(dg.Layer):
...
@@ -206,7 +227,7 @@ class Flow(dg.Layer):
queue
.
append
(
fluid
.
layers
.
zeros_like
(
audio
))
queue
.
append
(
fluid
.
layers
.
zeros_like
(
audio
))
state
=
queue
[
0
:
state_size
]
state
=
queue
[
0
:
state_size
]
state
=
fluid
.
layers
.
concat
(
[
*
state
,
audio
],
axis
=
2
)
state
=
fluid
.
layers
.
concat
(
state
+
[
audio
],
axis
=
2
)
queue
.
pop
(
0
)
queue
.
pop
(
0
)
queue
.
append
(
audio
)
queue
.
append
(
audio
)
...
@@ -214,10 +235,10 @@ class Flow(dg.Layer):
...
@@ -214,10 +235,10 @@ class Flow(dg.Layer):
# Pad height dim (n_group): causal convolution
# Pad height dim (n_group): causal convolution
# Pad width dim (time): dialated non-causal convolution
# Pad width dim (time): dialated non-causal convolution
pad_top
,
pad_bottom
=
0
,
0
pad_top
,
pad_bottom
=
0
,
0
pad_left
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
pad_left
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
pad_right
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
pad_right
=
int
((
self
.
kernel_w
-
1
)
*
dilation_w
/
2
)
state
=
fluid
.
layers
.
pad2d
(
state
,
state
=
fluid
.
layers
.
pad2d
(
paddings
=
[
pad_top
,
pad_bottom
,
pad_left
,
pad_right
])
state
,
paddings
=
[
pad_top
,
pad_bottom
,
pad_left
,
pad_right
])
hidden
=
self
.
in_layers
[
i
](
state
)
hidden
=
self
.
in_layers
[
i
](
state
)
cond_hidden
=
self
.
cond_layers
[
i
](
mel
)
cond_hidden
=
self
.
cond_layers
[
i
](
mel
)
...
@@ -241,18 +262,18 @@ class Flow(dg.Layer):
...
@@ -241,18 +262,18 @@ class Flow(dg.Layer):
class
WaveFlowModule
(
dg
.
Layer
):
class
WaveFlowModule
(
dg
.
Layer
):
def
__init__
(
self
,
name_scope
,
config
):
def
__init__
(
self
,
config
):
super
(
WaveFlowModule
,
self
).
__init__
(
name_scope
)
super
(
WaveFlowModule
,
self
).
__init__
()
self
.
n_flows
=
config
.
n_flows
self
.
n_flows
=
config
.
n_flows
self
.
n_group
=
config
.
n_group
self
.
n_group
=
config
.
n_group
self
.
n_layers
=
config
.
n_layers
self
.
n_layers
=
config
.
n_layers
assert
self
.
n_group
%
2
==
0
assert
self
.
n_group
%
2
==
0
assert
self
.
n_flows
%
2
==
0
assert
self
.
n_flows
%
2
==
0
self
.
conditioner
=
Conditioner
(
self
.
full_name
()
)
self
.
conditioner
=
Conditioner
()
self
.
flows
=
[]
self
.
flows
=
[]
for
i
in
range
(
self
.
n_flows
):
for
i
in
range
(
self
.
n_flows
):
flow
=
Flow
(
self
.
full_name
(),
config
)
flow
=
Flow
(
config
)
self
.
flows
.
append
(
flow
)
self
.
flows
.
append
(
flow
)
self
.
add_sublayer
(
"flow_{}"
.
format
(
i
),
flow
)
self
.
add_sublayer
(
"flow_{}"
.
format
(
i
),
flow
)
...
@@ -284,7 +305,6 @@ class WaveFlowModule(dg.Layer):
...
@@ -284,7 +305,6 @@ class WaveFlowModule(dg.Layer):
audio
=
fluid
.
layers
.
transpose
(
unfold
(
audio
,
self
.
n_group
),
[
0
,
2
,
1
])
audio
=
fluid
.
layers
.
transpose
(
unfold
(
audio
,
self
.
n_group
),
[
0
,
2
,
1
])
# [bs, 1, n_group, time/n_group]
# [bs, 1, n_group, time/n_group]
audio
=
fluid
.
layers
.
unsqueeze
(
audio
,
1
)
audio
=
fluid
.
layers
.
unsqueeze
(
audio
,
1
)
log_s_list
=
[]
log_s_list
=
[]
for
i
in
range
(
self
.
n_flows
):
for
i
in
range
(
self
.
n_flows
):
inputs
=
audio
[:,
:,
:
-
1
,
:]
inputs
=
audio
[:,
:,
:
-
1
,
:]
...
@@ -305,7 +325,6 @@ class WaveFlowModule(dg.Layer):
...
@@ -305,7 +325,6 @@ class WaveFlowModule(dg.Layer):
mel
=
fluid
.
layers
.
stack
(
mel_slices
,
axis
=
2
)
mel
=
fluid
.
layers
.
stack
(
mel_slices
,
axis
=
2
)
z
=
fluid
.
layers
.
squeeze
(
audio
,
[
1
])
z
=
fluid
.
layers
.
squeeze
(
audio
,
[
1
])
return
z
,
log_s_list
return
z
,
log_s_list
def
synthesize
(
self
,
mel
,
sigma
=
1.0
):
def
synthesize
(
self
,
mel
,
sigma
=
1.0
):
...
@@ -331,7 +350,7 @@ class WaveFlowModule(dg.Layer):
...
@@ -331,7 +350,7 @@ class WaveFlowModule(dg.Layer):
for
h
in
range
(
1
,
self
.
n_group
):
for
h
in
range
(
1
,
self
.
n_group
):
inputs
=
audio_h
inputs
=
audio_h
conds
=
mel
[:,
:,
h
:(
h
+
1
),
:]
conds
=
mel
[:,
:,
h
:(
h
+
1
),
:]
outputs
=
self
.
flows
[
i
].
infer
(
inputs
,
conds
,
queues
)
outputs
=
self
.
flows
[
i
].
infer
(
inputs
,
conds
,
queues
)
log_s
=
outputs
[:,
0
:
1
,
:,
:]
log_s
=
outputs
[:,
0
:
1
,
:,
:]
...
...
parakeet/modules/weight_norm.py
浏览文件 @
25883dcd
...
@@ -40,8 +40,8 @@ def norm_except(param, dim, power):
...
@@ -40,8 +40,8 @@ def norm_except(param, dim, power):
def
compute_weight
(
v
,
g
,
dim
,
power
):
def
compute_weight
(
v
,
g
,
dim
,
power
):
assert
len
(
g
.
shape
)
==
1
,
"magnitude should be a vector"
assert
len
(
g
.
shape
)
==
1
,
"magnitude should be a vector"
v_normalized
=
F
.
elementwise_div
(
v
,
(
norm_except
(
v
,
dim
,
power
)
+
1e-12
),
v_normalized
=
F
.
elementwise_div
(
axis
=
dim
)
v
,
(
norm_except
(
v
,
dim
,
power
)
+
1e-12
),
axis
=
dim
)
weight
=
F
.
elementwise_mul
(
v_normalized
,
g
,
axis
=
dim
)
weight
=
F
.
elementwise_mul
(
v_normalized
,
g
,
axis
=
dim
)
return
weight
return
weight
...
@@ -63,20 +63,21 @@ class WeightNormWrapper(dg.Layer):
...
@@ -63,20 +63,21 @@ class WeightNormWrapper(dg.Layer):
original_weight
=
getattr
(
layer
,
param_name
)
original_weight
=
getattr
(
layer
,
param_name
)
self
.
add_parameter
(
self
.
add_parameter
(
w_v
,
w_v
,
self
.
create_parameter
(
shape
=
original_weight
.
shape
,
self
.
create_parameter
(
dtype
=
original_weight
.
dtype
))
shape
=
original_weight
.
shape
,
dtype
=
original_weight
.
dtype
))
F
.
assign
(
original_weight
,
getattr
(
self
,
w_v
))
F
.
assign
(
original_weight
,
getattr
(
self
,
w_v
))
delattr
(
layer
,
param_name
)
delattr
(
layer
,
param_name
)
temp
=
norm_except
(
getattr
(
self
,
w_v
),
self
.
dim
,
self
.
power
)
temp
=
norm_except
(
getattr
(
self
,
w_v
),
self
.
dim
,
self
.
power
)
self
.
add_parameter
(
self
.
add_parameter
(
w_g
,
self
.
create_parameter
(
shape
=
temp
.
shape
,
dtype
=
temp
.
dtype
))
w_g
,
self
.
create_parameter
(
shape
=
temp
.
shape
,
dtype
=
temp
.
dtype
))
F
.
assign
(
temp
,
getattr
(
self
,
w_g
))
F
.
assign
(
temp
,
getattr
(
self
,
w_g
))
# also set this when setting up
# also set this when setting up
setattr
(
setattr
(
self
.
layer
,
self
.
param_name
,
self
.
layer
,
self
.
param_name
,
compute_weight
(
compute_weight
(
getattr
(
self
,
w_v
),
getattr
(
self
,
w_g
),
self
.
dim
,
getattr
(
self
,
w_v
)
,
self
.
power
))
getattr
(
self
,
w_g
),
self
.
dim
,
self
.
power
))
self
.
weigth_norm_applied
=
True
self
.
weigth_norm_applied
=
True
...
@@ -84,10 +85,10 @@ class WeightNormWrapper(dg.Layer):
...
@@ -84,10 +85,10 @@ class WeightNormWrapper(dg.Layer):
def
hook
(
self
):
def
hook
(
self
):
w_v
=
self
.
param_name
+
"_v"
w_v
=
self
.
param_name
+
"_v"
w_g
=
self
.
param_name
+
"_g"
w_g
=
self
.
param_name
+
"_g"
setattr
(
setattr
(
self
.
layer
,
self
.
param_name
,
self
.
layer
,
self
.
param_name
,
compute_weight
(
compute_weight
(
getattr
(
self
,
w_v
),
getattr
(
self
,
w_g
),
self
.
dim
,
getattr
(
self
,
w_v
)
,
self
.
power
))
getattr
(
self
,
w_g
),
self
.
dim
,
self
.
power
))
def
remove_weight_norm
(
self
):
def
remove_weight_norm
(
self
):
self
.
hook
()
self
.
hook
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录