Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Parakeet
提交
7938a5f6
P
Parakeet
项目概览
PaddlePaddle
/
Parakeet
通知
8
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
19
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Parakeet
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
19
Issue
19
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7938a5f6
编写于
7月 13, 2020
作者:
C
chenfeiyu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add griffin lim as an alternative vocoder
上级
282c36c2
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
34 addition
and
10 deletion
+34
-10
examples/deepvoice3/synthesize.py
examples/deepvoice3/synthesize.py
+23
-7
examples/deepvoice3/vocoder.py
examples/deepvoice3/vocoder.py
+11
-3
未找到文件。
examples/deepvoice3/synthesize.py
浏览文件 @
7938a5f6
...
@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
...
@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
from
parakeet.utils.io
import
save_parameters
,
load_parameters
,
add_yaml_config_to_args
from
parakeet.utils.io
import
save_parameters
,
load_parameters
,
add_yaml_config_to_args
from
parakeet.g2p
import
en
from
parakeet.g2p
import
en
from
vocoder
import
WaveflowVocoder
from
vocoder
import
WaveflowVocoder
,
GriffinLimVocoder
from
train
import
create_model
from
train
import
create_model
...
@@ -26,8 +26,18 @@ def main(args, config):
...
@@ -26,8 +26,18 @@ def main(args, config):
model
=
create_model
(
config
)
model
=
create_model
(
config
)
loaded_step
=
load_parameters
(
model
,
checkpoint_path
=
args
.
checkpoint
)
loaded_step
=
load_parameters
(
model
,
checkpoint_path
=
args
.
checkpoint
)
model
.
eval
()
model
.
eval
()
if
args
.
vocoder
==
"waveflow"
:
vocoder
=
WaveflowVocoder
()
vocoder
=
WaveflowVocoder
()
vocoder
.
model
.
eval
()
vocoder
.
model
.
eval
()
elif
args
.
vocoder
==
"griffin-lim"
:
vocoder
=
GriffinLimVocoder
(
sharpening_factor
=
config
[
"sharpening_factor"
],
sample_rate
=
config
[
"sample_rate"
],
n_fft
=
config
[
"n_fft"
],
win_length
=
config
[
"win_length"
],
hop_length
=
config
[
"hop_length"
])
else
:
raise
ValueError
(
"Other vocoders are not supported."
)
if
not
os
.
path
.
exists
(
args
.
output
):
if
not
os
.
path
.
exists
(
args
.
output
):
os
.
makedirs
(
args
.
output
)
os
.
makedirs
(
args
.
output
)
...
@@ -35,12 +45,12 @@ def main(args, config):
...
@@ -35,12 +45,12 @@ def main(args, config):
with
open
(
args
.
input
,
'rt'
)
as
f
:
with
open
(
args
.
input
,
'rt'
)
as
f
:
sentences
=
[
line
.
strip
()
for
line
in
f
.
readlines
()]
sentences
=
[
line
.
strip
()
for
line
in
f
.
readlines
()]
for
i
,
sentence
in
enumerate
(
sentences
):
for
i
,
sentence
in
enumerate
(
sentences
):
wav
=
synthesize
(
config
,
model
,
vocoder
,
sentence
,
monotonic_layers
)
wav
=
synthesize
(
args
,
config
,
model
,
vocoder
,
sentence
,
monotonic_layers
)
sf
.
write
(
os
.
path
.
join
(
args
.
output
,
"sentence{}.wav"
.
format
(
i
)),
sf
.
write
(
os
.
path
.
join
(
args
.
output
,
"sentence{}.wav"
.
format
(
i
)),
wav
,
samplerate
=
config
[
"sample_rate"
])
wav
,
samplerate
=
config
[
"sample_rate"
])
def
synthesize
(
config
,
model
,
vocoder
,
sentence
,
monotonic_layers
):
def
synthesize
(
args
,
config
,
model
,
vocoder
,
sentence
,
monotonic_layers
):
print
(
"[synthesize] {}"
.
format
(
sentence
))
print
(
"[synthesize] {}"
.
format
(
sentence
))
text
=
en
.
text_to_sequence
(
sentence
,
p
=
1.0
)
text
=
en
.
text_to_sequence
(
sentence
,
p
=
1.0
)
text
=
np
.
expand_dims
(
np
.
array
(
text
,
dtype
=
"int64"
),
0
)
text
=
np
.
expand_dims
(
np
.
array
(
text
,
dtype
=
"int64"
),
0
)
...
@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
...
@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
force_monotonic_attention
=
force_monotonic_attention
,
force_monotonic_attention
=
force_monotonic_attention
,
window
=
(
config
[
"backward_step"
],
config
[
"forward_step"
]))
window
=
(
config
[
"backward_step"
],
config
[
"forward_step"
]))
decoded
,
refined
,
attentions
=
outputs
decoded
,
refined
,
attentions
=
outputs
wav
=
vocoder
(
F
.
transpose
(
decoded
,
(
0
,
2
,
1
)))
if
args
.
vocoder
==
"griffin-lim"
:
wav_np
=
vocoder
(
refined
.
numpy
()[
0
].
T
)
else
:
wav
=
vocoder
(
F
.
transpose
(
refined
,
(
0
,
2
,
1
)))
wav_np
=
wav
.
numpy
()[
0
]
wav_np
=
wav
.
numpy
()[
0
]
return
wav_np
return
wav_np
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
import
argparse
import
argparse
from
ruamel
import
yaml
from
ruamel
import
yaml
...
@@ -72,6 +87,7 @@ if __name__ == "__main__":
...
@@ -72,6 +87,7 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--output"
,
type
=
str
,
required
=
True
,
help
=
"path to save audio"
)
parser
.
add_argument
(
"--output"
,
type
=
str
,
required
=
True
,
help
=
"path to save audio"
)
parser
.
add_argument
(
"--checkpoint"
,
type
=
str
,
required
=
True
,
help
=
"data path of the checkpoint"
)
parser
.
add_argument
(
"--checkpoint"
,
type
=
str
,
required
=
True
,
help
=
"data path of the checkpoint"
)
parser
.
add_argument
(
"--monotonic_layers"
,
type
=
str
,
required
=
True
,
help
=
"monotonic decoder layer, index starts friom 1"
)
parser
.
add_argument
(
"--monotonic_layers"
,
type
=
str
,
required
=
True
,
help
=
"monotonic decoder layer, index starts friom 1"
)
parser
.
add_argument
(
"--vocoder"
,
type
=
str
,
default
=
"waveflow"
,
choices
=
[
'griffin-lim'
,
'waveflow'
],
help
=
"vocoder to use"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
with
open
(
args
.
config
,
'rt'
)
as
f
:
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
yaml
.
safe_load
(
f
)
config
=
yaml
.
safe_load
(
f
)
...
...
examples/deepvoice3/vocoder.py
浏览文件 @
7938a5f6
...
@@ -31,13 +31,21 @@ class WaveflowVocoder(object):
...
@@ -31,13 +31,21 @@ class WaveflowVocoder(object):
return
audio
return
audio
class
GriffinLimVocoder
(
object
):
class
GriffinLimVocoder
(
object
):
def
__init__
(
self
,
sharpening_factor
=
1.4
,
win_length
=
1024
,
hop_length
=
256
):
def
__init__
(
self
,
sharpening_factor
=
1.4
,
sample_rate
=
22050
,
n_fft
=
1024
,
win_length
=
1024
,
hop_length
=
256
):
self
.
sample_rate
=
sample_rate
self
.
n_fft
=
n_fft
self
.
sharpening_factor
=
sharpening_factor
self
.
sharpening_factor
=
sharpening_factor
self
.
win_length
=
win_length
self
.
win_length
=
win_length
self
.
hop_length
=
hop_length
self
.
hop_length
=
hop_length
def
__call__
(
self
,
spec
):
def
__call__
(
self
,
mel
):
audio
=
librosa
.
core
.
griffinlim
(
np
.
exp
(
spec
*
self
.
sharpening_factor
),
spec
=
librosa
.
feature
.
inverse
.
mel_to_stft
(
np
.
exp
(
mel
),
sr
=
self
.
sample_rate
,
n_fft
=
self
.
n_fft
,
fmin
=
0
,
fmax
=
8000.0
,
power
=
1.0
)
audio
=
librosa
.
core
.
griffinlim
(
spec
**
self
.
sharpening_factor
,
win_length
=
self
.
win_length
,
hop_length
=
self
.
hop_length
)
win_length
=
self
.
win_length
,
hop_length
=
self
.
hop_length
)
return
audio
return
audio
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录