PaddlePaddle / Parakeet
Commit 14235cd1
Authored June 19, 2020 by lifuchen

modified synthesis of transformer_tts & fastspeech

Parent: 681d34b9
Showing 5 changed files with 296 additions and 134 deletions:

  examples/fastspeech/synthesis.py           +134  -86
  examples/fastspeech/synthesis.sh            +10   -3
  examples/transformer_tts/synthesis.py      +140  -41
  examples/transformer_tts/synthesis.sh       +11   -3
  examples/transformer_tts/train_vocoder.py    +1   -1
examples/fastspeech/synthesis.py  (+134 -86)

@@ -28,6 +28,8 @@ from parakeet.models.fastspeech.fastspeech import FastSpeech
 from parakeet.models.transformer_tts.utils import *
 from parakeet.models.wavenet import WaveNet, UpsampleNet
 from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
+from parakeet.modules import weight_norm
+from parakeet.models.waveflow import WaveFlowModule
 from parakeet.utils.layer_tools import freeze
 from parakeet.utils import io
@@ -35,7 +37,13 @@ from parakeet.utils import io
 def add_config_options_to_parser(parser):
     parser.add_argument("--config", type=str, help="path of the config file")
     parser.add_argument(
-        "--config_clarinet", type=str, help="path of the clarinet config file")
+        "--vocoder",
+        type=str,
+        default="griffinlim",
+        choices=['griffinlim', 'clarinet', 'waveflow'],
+        help="vocoder method")
+    parser.add_argument(
+        "--config_vocoder", type=str, help="path of the vocoder config file")
     parser.add_argument("--use_gpu", type=int, default=0, help="device to use")
     parser.add_argument(
         "--alpha",
@@ -47,9 +55,9 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         "--checkpoint", type=str, help="fastspeech checkpoint to synthesis")
     parser.add_argument(
-        "--checkpoint_clarinet",
+        "--checkpoint_vocoder",
         type=str,
-        help="clarinet checkpoint to synthesis")
+        help="vocoder checkpoint to synthesis")
     parser.add_argument(
         "--output",
@@ -83,46 +91,62 @@ def synthesis(text_input, args):
     pos_text = np.arange(1, text.shape[1] + 1)
     pos_text = np.expand_dims(pos_text, axis=0)
-    text = dg.to_variable(text)
-    pos_text = dg.to_variable(pos_text)
+    text = dg.to_variable(text).astype(np.int64)
+    pos_text = dg.to_variable(pos_text).astype(np.int64)

     _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

-    result = np.exp(mel_output_postnet.numpy())
-    mel_output_postnet = fluid.layers.transpose(
-        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
-    mel_output_postnet = np.exp(mel_output_postnet.numpy())
-    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
-                                cfg['audio']['num_mels'])
-    inv_basis = np.linalg.pinv(basis)
-    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))
-
-    # synthesis use clarinet
-    wav_clarinet = synthesis_with_clarinet(
-        args.config_clarinet, args.checkpoint_clarinet, result, place)
-    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
-                     cfg['audio']['sr'])
+    if args.vocoder == 'griffinlim':
+        #synthesis use griffin-lim
+        wav = synthesis_with_griffinlim(
+            mel_output_postnet,
+            sr=cfg['audio']['sr'],
+            n_fft=cfg['audio']['n_fft'],
+            num_mels=cfg['audio']['num_mels'],
+            power=cfg['audio']['power'],
+            hop_length=cfg['audio']['hop_length'],
+            win_length=cfg['audio']['win_length'])
+    elif args.vocoder == 'clarinet':
+        # synthesis use clarinet
+        wav = synthesis_with_clarinet(mel_output_postnet, args.config_vocoder,
+                                      args.checkpoint_vocoder, place)
+    elif args.vocoder == 'waveflow':
+        wav = synthesis_with_waveflow(mel_output_postnet, args,
+                                      args.checkpoint_vocoder, place)
+    else:
+        print(
+            'vocoder error, we only support griffinlim, clarinet and waveflow, but recevied %s.'
+            % args.vocoder)
+
+    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
+                     cfg['audio']['sr'])
     if not os.path.exists(os.path.join(args.output, 'samples')):
         os.mkdir(os.path.join(args.output, 'samples'))
     write(
-        os.path.join(os.path.join(args.output, 'samples'), 'clarinet.wav'),
-        cfg['audio']['sr'], wav_clarinet)
-
-    #synthesis use griffin-lim
-    wav = librosa.core.griffinlim(
-        spec**cfg['audio']['power'],
-        hop_length=cfg['audio']['hop_length'],
-        win_length=cfg['audio']['win_length'])
-    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
-    write(
         os.path.join(
-            os.path.join(args.output, 'samples'), 'grinffin-lim.wav'),
+            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
         cfg['audio']['sr'], wav)
     print("Synthesis completed !!!")
     writer.close()


-def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
+def synthesis_with_griffinlim(mel_output, sr, n_fft, num_mels, power,
+                              hop_length, win_length):
+    mel_output = fluid.layers.transpose(
+        fluid.layers.squeeze(mel_output, [0]), [1, 0])
+    mel_output = np.exp(mel_output.numpy())
+    basis = librosa.filters.mel(sr, n_fft, num_mels)
+    inv_basis = np.linalg.pinv(basis)
+    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
+
+    wav = librosa.core.griffinlim(
+        spec**power, hop_length=hop_length, win_length=win_length)
+
+    return wav
+
+
+def synthesis_with_clarinet(mel_output, config_path, checkpoint, place):
+    mel_spectrogram = np.exp(mel_output.numpy())
     with open(config_path, 'rt') as f:
         config = yaml.safe_load(f)
@@ -136,62 +160,86 @@ def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
     # only batch=1 for validation is enabled
-    with dg.guard(place):
+    fluid.enable_dygraph(place)
     # conditioner(upsampling net)
     conditioner_config = config["conditioner"]
     upsampling_factors = conditioner_config["upsampling_factors"]
     upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
     freeze(upsample_net)

     residual_channels = teacher_config["residual_channels"]
     loss_type = teacher_config["loss_type"]
     output_dim = teacher_config["output_dim"]
     log_scale_min = teacher_config["log_scale_min"]
     assert loss_type == "mog" and output_dim == 3, \
         "the teacher wavenet should be a wavenet with single gaussian output"
     teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                       n_mels, filter_size, loss_type, log_scale_min)
     # load & freeze upsample_net & teacher
     freeze(teacher)

     student_config = config["student"]
     n_loops = student_config["n_loops"]
     n_layers = student_config["n_layers"]
     student_residual_channels = student_config["residual_channels"]
     student_filter_size = student_config["filter_size"]
     student_log_scale_min = student_config["log_scale_min"]
     student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
                               n_mels, student_filter_size)

     stft_config = config["stft"]
     stft = STFT(
         n_fft=stft_config["n_fft"],
         hop_length=stft_config["hop_length"],
         win_length=stft_config["win_length"])

     lmd = config["loss"]["lmd"]
     model = Clarinet(upsample_net, teacher, student, stft,
                      student_log_scale_min, lmd)
     io.load_parameters(model=model, checkpoint_path=checkpoint)

     if not os.path.exists(args.output):
         os.makedirs(args.output)

     model.eval()

     # Rescale mel_spectrogram.
     min_level, ref_level = 1e-5, 20  # hard code it
     mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
     mel_spectrogram = mel_spectrogram - ref_level
     mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

     mel_spectrogram = dg.to_variable(mel_spectrogram)
     mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

     wav_var = model.synthesis(mel_spectrogram)
     wav_np = wav_var.numpy()[0]

     return wav_np


+def synthesis_with_waveflow(mel_output, args, checkpoint, place):
+    #mel_output = np.exp(mel_output.numpy())
+    mel_output = mel_output.numpy()
+
+    fluid.enable_dygraph(place)
+    args.config = args.config_vocoder
+    args.use_fp16 = False
+    config = io.add_yaml_config_to_args(args)
+
+    mel_spectrogram = dg.to_variable(mel_output)
+    mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])
+
+    # Build model.
+    waveflow = WaveFlowModule(config)
+    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
+    for layer in waveflow.sublayers():
+        if isinstance(layer, weight_norm.WeightNormWrapper):
+            layer.remove_weight_norm()
+
+    # Run model inference.
+    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
+    return wav.numpy()[0]
+
+
 if __name__ == '__main__':
 ...
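The new synthesis_with_griffinlim helper above is the one vocoder path with no learned model: it maps the predicted log-mel back to an approximate linear spectrogram through a pseudo-inverse mel filterbank, then recovers phase with Griffin-Lim. A standalone sketch of the same arithmetic, using only numpy and a pre-0.10 librosa (whose filters.mel still accepts positional sr/n_fft); the function name and default values are ours, the defaults copied from the hard-coded transformer_tts numbers further down in this commit:

    import numpy as np
    import librosa

    def griffinlim_from_logmel(log_mel, sr=22050, n_fft=1024, num_mels=80,
                               power=1.2, hop_length=256, win_length=1024):
        """Invert a (num_mels, frames) natural-log mel spectrogram to a waveform."""
        mel = np.exp(log_mel)              # undo the log taken at feature-extraction time
        basis = librosa.filters.mel(sr, n_fft, num_mels)
        inv_basis = np.linalg.pinv(basis)  # pseudo-inverse of the mel filterbank
        # clamp to a small positive floor so the power law stays well-defined
        spec = np.maximum(1e-10, np.dot(inv_basis, mel))
        # Griffin-Lim iteratively estimates a phase consistent with |spec|**power
        return librosa.core.griffinlim(
            spec**power, hop_length=hop_length, win_length=win_length)

The pseudo-inverse only approximates the discarded linear-frequency detail, which is why this path trades quality for having no vocoder checkpoint to load.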
examples/fastspeech/synthesis.sh  (+10 -3)

 # train model
 CUDA_VISIBLE_DEVICES=0 \
 python -u synthesis.py \
 --use_gpu=1 \
 --alpha=1.0 \
---checkpoint='./checkpoint/fastspeech/step-120000' \
+--checkpoint='./checkpoint/fastspeech1024/step-160000' \
 --config='configs/ljspeech.yaml' \
---config_clarine='../clarinet/configs/config.yaml' \
---checkpoint_clarinet='../clarinet/checkpoint/step-500000' \
 --output='./synthesis' \
+--vocoder='waveflow' \
+--config_vocoder='../waveflow/checkpoint/waveflow_res64_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml' \
+--checkpoint_vocoder='../waveflow/checkpoint/waveflow_res64_ljspeech_ckpt_1.0/step-3020000' \
+#--vocoder='clarinet' \
+#--config_vocoder='../clarinet/configs/clarinet_ljspeech.yaml' \
+#--checkpoint_vocoder='../clarinet/checkpoint/step-500000' \

 if [ $? -ne 0 ]; then
     echo "Failed in synthesis!"
 ...
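Both example scripts now drive the same three vocoder flags. For reference, a minimal sketch of the argparse surface they target, restated from the fastspeech diff above (the transformer_tts script swaps 'clarinet' for 'wavenet' in the choices):

    import argparse

    parser = argparse.ArgumentParser()
    # vocoder selection added by this commit; 'griffinlim' needs no extra files
    parser.add_argument("--vocoder", type=str, default="griffinlim",
                        choices=['griffinlim', 'clarinet', 'waveflow'],
                        help="vocoder method")
    # the next two flags are only consulted for the neural vocoders
    parser.add_argument("--config_vocoder", type=str,
                        help="path of the vocoder config file")
    parser.add_argument("--checkpoint_vocoder", type=str,
                        help="vocoder checkpoint to synthesis")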
examples/transformer_tts/synthesis.py  (+140 -41)

@@ -28,6 +28,10 @@ from parakeet.models.transformer_tts.utils import *
 from parakeet import audio
 from parakeet.models.transformer_tts import Vocoder
 from parakeet.models.transformer_tts import TransformerTTS
+from parakeet.modules import weight_norm
+from parakeet.models.waveflow import WaveFlowModule
+from parakeet.modules.weight_norm import WeightNormWrapper
+from parakeet.models.wavenet import UpsampleNet, WaveNet, ConditionalWavenet
 from parakeet.utils import io
@@ -44,6 +48,14 @@ def add_config_options_to_parser(parser):
         "--checkpoint_transformer",
         type=str,
         help="transformer_tts checkpoint to synthesis")
+    parser.add_argument(
+        "--vocoder",
+        type=str,
+        default="griffinlim",
+        choices=['griffinlim', 'wavenet', 'waveflow'],
+        help="vocoder method")
+    parser.add_argument(
+        "--config_vocoder", type=str, help="path of the vocoder config file")
     parser.add_argument(
         "--checkpoint_vocoder",
         type=str,
@@ -82,31 +94,32 @@ def synthesis(text_input, args):
         model=model, checkpoint_path=args.checkpoint_transformer)
     model.eval()

-    with fluid.unique_name.guard():
-        model_vocoder = Vocoder(cfg['train']['batch_size'],
-                                cfg['vocoder']['hidden_size'],
-                                cfg['audio']['num_mels'], cfg['audio']['n_fft'])
-        # Load parameters.
-        global_step = io.load_parameters(
-            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
-        model_vocoder.eval()
-
     # init input
     text = np.asarray(text_to_sequence(text_input))
-    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
+    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
     mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
     pos_text = np.arange(1, text.shape[1] + 1)
-    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
+    pos_text = fluid.layers.unsqueeze(
+        dg.to_variable(pos_text).astype(np.int64), [0])

     pbar = tqdm(range(args.max_len))
     for i in pbar:
         pos_mel = np.arange(1, mel_input.shape[1] + 1)
-        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
+        pos_mel = fluid.layers.unsqueeze(
+            dg.to_variable(pos_mel).astype(np.int64), [0])
         mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
             text, mel_input, pos_text, pos_mel)
         mel_input = fluid.layers.concat(
             [mel_input, postnet_pred[:, -1:, :]], axis=1)
-    mag_pred = model_vocoder(postnet_pred)
+
+    global_step = 0
+    for i, prob in enumerate(attn_probs):
+        for j in range(4):
+            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
+            writer.add_image(
+                'Attention_%d_0' % global_step,
+                x,
+                i * 4 + j,
+                dataformats="HWC")

     _ljspeech_processor = audio.AudioProcessor(
         sample_rate=cfg['audio']['sr'],
@@ -122,45 +135,130 @@ def synthesis(text_input, args):
         symmetric_norm=False,
         max_norm=1.,
         mel_fmin=0,
-        mel_fmax=None,
+        mel_fmax=8000,
         clip_norm=True,
         griffin_lim_iters=60,
         do_trim_silence=False,
         sound_norm=False)

-    # synthesis with cbhg
-    wav = _ljspeech_processor.inv_spectrogram(
-        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]), [1, 0])
-        .numpy())
-    global_step = 0
-    for i, prob in enumerate(attn_probs):
-        for j in range(4):
-            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
-            writer.add_image(
-                'Attention_%d_0' % global_step,
-                x,
-                i * 4 + j,
-                dataformats="HWC")
-
-    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])
+    if args.vocoder == 'griffinlim':
+        #synthesis use griffin-lim
+        wav = synthesis_with_griffinlim(postnet_pred, _ljspeech_processor)
+    elif args.vocoder == 'wavenet':
+        # synthesis use wavenet
+        wav = synthesis_with_wavenet(postnet_pred, args)
+    elif args.vocoder == 'waveflow':
+        # synthesis use waveflow
+        wav = synthesis_with_waveflow(postnet_pred, args,
+                                      args.checkpoint_vocoder,
+                                      _ljspeech_processor, place)
+    else:
+        print(
+            'vocoder error, we only support griffinlim, cbhg and waveflow, but recevied %s.'
+            % args.vocoder)
+
+    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
+                     cfg['audio']['sr'])
     if not os.path.exists(os.path.join(args.output, 'samples')):
         os.mkdir(os.path.join(args.output, 'samples'))
     write(
-        os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
+        os.path.join(
+            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
         cfg['audio']['sr'], wav)
-
-    #synthesis use griffin-lim
-    wav = _ljspeech_processor.inv_melspectrogram(
-        fluid.layers.transpose(
-            fluid.layers.squeeze(postnet_pred, [0]), [1, 0]).numpy())
-    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])
-    write(
-        os.path.join(
-            os.path.join(args.output, 'samples'), 'griffin.wav'),
-        cfg['audio']['sr'], wav)
     print("Synthesis completed !!!")
     writer.close()


+def synthesis_with_griffinlim(mel_output, _ljspeech_processor):
+    # synthesis with griffin-lim
+    mel_output = fluid.layers.transpose(
+        fluid.layers.squeeze(mel_output, [0]), [1, 0])
+    mel_output = np.exp(mel_output.numpy())
+    basis = librosa.filters.mel(22050, 1024, 80, fmin=0, fmax=8000)
+    inv_basis = np.linalg.pinv(basis)
+    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
+
+    wav = librosa.core.griffinlim(spec**1.2, hop_length=256, win_length=1024)
+
+    return wav
+
+
+def synthesis_with_wavenet(mel_output, args):
+    with open(args.config_vocoder, 'rt') as f:
+        config = yaml.safe_load(f)
+    n_mels = config["data"]["n_mels"]
+    model_config = config["model"]
+    filter_size = model_config["filter_size"]
+    upsampling_factors = model_config["upsampling_factors"]
+    encoder = UpsampleNet(upsampling_factors)
+
+    n_loop = model_config["n_loop"]
+    n_layer = model_config["n_layer"]
+    residual_channels = model_config["residual_channels"]
+    output_dim = model_config["output_dim"]
+    loss_type = model_config["loss_type"]
+    log_scale_min = model_config["log_scale_min"]
+    decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
+                      filter_size, loss_type, log_scale_min)
+
+    model = ConditionalWavenet(encoder, decoder)
+
+    # load model parameters
+    iteration = io.load_parameters(
+        model, checkpoint_path=args.checkpoint_vocoder)
+
+    for layer in model.sublayers():
+        if isinstance(layer, WeightNormWrapper):
+            layer.remove_weight_norm()
+
+    mel_output = fluid.layers.transpose(mel_output, [0, 2, 1])
+    wav = model.synthesis(mel_output)
+    return wav.numpy()[0]
+
+
+def synthesis_with_cbhg(mel_output, _ljspeech_processor, cfg):
+    with fluid.unique_name.guard():
+        model_vocoder = Vocoder(cfg['train']['batch_size'],
+                                cfg['vocoder']['hidden_size'],
+                                cfg['audio']['num_mels'],
+                                cfg['audio']['n_fft'])
+        # Load parameters.
+        global_step = io.load_parameters(
+            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
+        model_vocoder.eval()
+
+    mag_pred = model_vocoder(mel_output)
+    # synthesis with cbhg
+    wav = _ljspeech_processor.inv_spectrogram(
+        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]), [1, 0])
+        .numpy())
+    return wav
+
+
+def synthesis_with_waveflow(mel_output, args, checkpoint, _ljspeech_processor,
+                            place):
+    mel_output = fluid.layers.transpose(
+        fluid.layers.squeeze(mel_output, [0]), [1, 0])
+    mel_output = mel_output.numpy()
+    #mel_output = (mel_output - mel_output.min())/(mel_output.max() - mel_output.min())
+    #mel_output = 5 * mel_output - 4
+    #mel_output = np.log(10) * mel_output
+
+    fluid.enable_dygraph(place)
+    args.config = args.config_vocoder
+    args.use_fp16 = False
+    config = io.add_yaml_config_to_args(args)
+
+    mel_spectrogram = dg.to_variable(mel_output)
+    mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])
+
+    # Build model.
+    waveflow = WaveFlowModule(config)
+    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
+    for layer in waveflow.sublayers():
+        if isinstance(layer, weight_norm.WeightNormWrapper):
+            layer.remove_weight_norm()
+
+    # Run model inference.
+    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
+    return wav.numpy()[0]
+
+
 if __name__ == '__main__':
@@ -169,5 +267,6 @@ if __name__ == '__main__':
     args = parser.parse_args()
     # Print the whole config setting.
     pprint(vars(args))
-    synthesis("Parakeet stands for Paddle PARAllel text-to-speech toolkit.",
-              args)
+    synthesis(
+        "Life was like a box of chocolates, you never know what you're gonna get.",
+        args)
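A pattern worth noting in both new neural-vocoder paths (wavenet and waveflow): before inference they walk the model's sublayers and strip every weight-normalization wrapper, since the checkpoints are trained with the weight = g * v / ||v|| parametrization. A small helper capturing that pattern, sketched against the Paddle 1.x dygraph API this code uses; the function name is ours:

    from parakeet.modules import weight_norm

    def strip_weight_norm(model):
        # Fuse the (g, v) weight-norm parametrization back into a plain
        # weight tensor so inference does not recompute the normalization
        # at every step.
        for layer in model.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()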
examples/transformer_tts/synthesis.sh  (+11 -3)

@@ -2,12 +2,20 @@
 # train model
 CUDA_VISIBLE_DEVICES=0 \
 python -u synthesis.py \
---max_len=300 \
---use_gpu=1 \
+--max_len=400 \
+--use_gpu=0 \
 --output='./synthesis' \
 --config='configs/ljspeech.yaml' \
 --checkpoint_transformer='./checkpoint/transformer/step-120000' \
---checkpoint_vocoder='./checkpoint/vocoder/step-100000' \
+--vocoder='wavenet' \
+--config_vocoder='../wavenet/config.yaml' \
+--checkpoint_vocoder='../wavenet/step-2450000' \
+#--vocoder='waveflow' \
+#--config_vocoder='../waveflow/checkpoint/waveflow_res64_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml' \
+#--checkpoint_vocoder='../waveflow/checkpoint/waveflow_res64_ljspeech_ckpt_1.0/step-3020000' \
+#--vocoder='cbhg' \
+#--config_vocoder='configs/ljspeech.yaml' \
+#--checkpoint_vocoder='checkpoint/cbhg/step-100000' \

 if [ $? -ne 0 ]; then
     echo "Failed in training!"
 ...
examples/transformer_tts/train_vocoder.py  (+1 -1)

@@ -98,7 +98,7 @@ def main(args):
                               local_rank,
                               is_vocoder=True).reader()

-    for epoch in range(cfg['train']['max_epochs']):
+    for epoch in range(cfg['train']['max_iteration']):
         pbar = tqdm(reader)
         for i, data in enumerate(pbar):
             pbar.set_description('Processing at epoch %d' % epoch)
 ...