Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
mrywhh
Real-Time-Voice-Cloning
提交
b7008db2
R
Real-Time-Voice-Cloning
项目概览
mrywhh
/
Real-Time-Voice-Cloning
落后 Fork 源项目 12 个版本
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
Real-Time-Voice-Cloning
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b7008db2
编写于
2月 22, 2019
作者:
C
Corentin Jemine
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Inference demo for Tacotron
上级
0e8571db
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
76 addition
and
9 deletion
+76
-9
tacotron2/__init__.py
tacotron2/__init__.py
+0
-0
tacotron2/inference_demo.py
tacotron2/inference_demo.py
+32
-0
tacotron2/sentences.txt
tacotron2/sentences.txt
+2
-2
tacotron2/tacotron/synthesize.py
tacotron2/tacotron/synthesize.py
+1
-1
tacotron2/tacotron/synthesizer.py
tacotron2/tacotron/synthesizer.py
+41
-6
未找到文件。
tacotron2/__init__.py
已删除
100644 → 0
浏览文件 @
0e8571db
tacotron2/inference_demo.py
0 → 100644
浏览文件 @
b7008db2
from
datasets.audio
import
inv_mel_spectrogram
from
tacotron
import
synthesizer
from
hparams
import
hparams
from
vlibs
import
fileio
import
sounddevice
as
sd
import
tensorflow
as
tf
import
numpy
as
np
import
os
os
.
environ
[
'TF_CPP_MIN_LOG_LEVEL'
]
=
'3'
def get_speaker_embed(speaker_id, embed_root=r"E:\Datasets\Synthesizer\embed"):
    """
    Builds a single embedding for a speaker by averaging all of its stored embeddings and
    scaling the result to unit L2 norm.

    :param speaker_id: integer ID of the speaker. It is formatted into the pattern
    "embed-<speaker_id>-" that is handed to fileio.get_files.
    :param embed_root: directory that is searched for the embedding files. Defaults to the
    location that used to be hard-coded in this function.
    :return: the averaged, L2-normalized embedding with a new leading axis of size 1
    :raises ValueError: if no embedding file is found for this speaker
    """
    # NOTE(review): fileio.get_files comes from vlibs and is not visible here; presumably it
    # returns the paths under embed_root whose name matches the given pattern - confirm.
    embed_fpaths = fileio.get_files(embed_root, "embed-%d-" % speaker_id)
    embeds = [np.load(embed_fpath) for embed_fpath in embed_fpaths]

    # Averaging an empty list yields NaN (with only a runtime warning), which would then be
    # passed on silently to the synthesizer. Fail early with an explicit message instead.
    if not embeds:
        raise ValueError("No embedding found for speaker %d in %s" % (speaker_id, embed_root))

    speaker_embed = np.mean(embeds, axis=0)
    speaker_embed /= np.linalg.norm(speaker_embed, 2)

    # Prepend an axis of size 1 to the result
    return speaker_embed[None, ...]
if __name__ == '__main__':
    # Interactive demo: load the pretrained synthesizer once, then repeatedly ask for a
    # speaker ID and a text, synthesize the mel spectrogram, invert it and play the result.
    checkpoint_dir = os.path.join('logs-conditioned', 'taco_pretrained')
    checkpoint_state = tf.train.get_checkpoint_state(checkpoint_dir)
    if checkpoint_state is None:
        # get_checkpoint_state returns None when no checkpoint state is available; report
        # that explicitly rather than failing on the attribute access below.
        raise FileNotFoundError("No checkpoint found in %s" % checkpoint_dir)
    checkpoint_fpath = checkpoint_state.model_checkpoint_path

    synth = synthesizer.Synthesizer()
    synth.load(checkpoint_fpath, hparams)

    while True:
        # A typo in the speaker ID should not end the session, given that the model has
        # already been loaded: ask again instead of letting int() raise.
        try:
            speaker_id = int(input("Speaker ID: "))
        except ValueError:
            print("The speaker ID must be an integer.")
            continue
        speaker_embed = get_speaker_embed(speaker_id)

        text = input("Text: ")
        mel = synth.my_synthesize(speaker_embed, text)

        # The spectrogram is transposed before being inverted to a waveform
        wav = inv_mel_spectrogram(mel.T, hparams)

        # NOTE(review): the playback rate is hard-coded to 16000 - presumably it has to match
        # the sampling rate configured in hparams, confirm.
        sd.play(wav, 16000)
        sd.wait()
tacotron2/sentences.txt
浏览文件 @
b7008db2
Synthesizing new speech with a new voice!
I hope my thesis will work out nicely.
Can you pass me the butter?
Automatic multispeaker voice cloning.
Can you pass me the butter?
There was no world in which Icarus felt not the need to strike his blade.
This sentence should be the last one.
\ No newline at end of file
tacotron2/tacotron/synthesize.py
浏览文件 @
b7008db2
...
...
@@ -14,7 +14,6 @@ from tqdm import tqdm
def generate_fast(model, text):
    """
    Runs the synthesis of the model on a text without providing any of the other arguments.

    :param model: object exposing a synthesize method that takes five positional arguments
    :param text: passed as is as the first argument of model.synthesize
    :return: None, the result of model.synthesize is discarded
    """
    # NOTE(review): the names below follow the signature of Synthesizer.synthesize, assuming
    # that model is a Synthesizer - confirm. All four are deliberately left unset.
    basenames = out_dir = log_dir = mel_filenames = None
    model.synthesize(text, basenames, out_dir, log_dir, mel_filenames)
def
run_live
(
args
,
checkpoint_path
,
hparams
):
#Log to Terminal without keeping any records in files
log
(
hparams_debug_string
())
...
...
@@ -114,6 +113,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
log
(
'synthesized mel spectrograms at {}'
.
format
(
synth_dir
))
return
os
.
path
.
join
(
synth_dir
,
'map.txt'
)
def
tacotron_synthesize
(
args
,
hparams
,
checkpoint
,
sentences
=
None
):
output_dir
=
'tacotron_'
+
args
.
output_dir
...
...
tacotron2/tacotron/synthesizer.py
浏览文件 @
b7008db2
import
os
import
wave
from
datetime
import
datetime
import
numpy
as
np
import
pyaudio
import
sounddevice
as
sd
import
tensorflow
as
tf
from
datasets
import
audio
from
infolog
import
log
from
librosa
import
effects
from
tacotron.models
import
create_model
from
tacotron.utils
import
plot
from
tacotron.utils.text
import
text_to_sequence
...
...
@@ -19,7 +16,7 @@ class Synthesizer:
log
(
'Constructing model: %s'
%
model_name
)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs
=
tf
.
placeholder
(
tf
.
int32
,
(
None
,
None
),
name
=
'inputs'
)
input_lengths
=
tf
.
placeholder
(
tf
.
int32
,
(
None
),
name
=
'input_lengths'
)
input_lengths
=
tf
.
placeholder
(
tf
.
int32
,
(
None
,
),
name
=
'input_lengths'
)
speaker_embeddings
=
tf
.
placeholder
(
tf
.
float32
,
(
None
,
hparams
.
speaker_embedding_size
),
name
=
'speaker_embeddings'
)
targets
=
tf
.
placeholder
(
tf
.
float32
,
(
None
,
None
,
hparams
.
num_mels
),
name
=
'mel_targets'
)
...
...
@@ -68,6 +65,45 @@ class Synthesizer:
saver
=
tf
.
train
.
Saver
()
saver
.
restore
(
self
.
session
,
checkpoint_path
)
def my_synthesize(self, speaker_embed, text, raise_exception=False):
    """
    Lighter synthesis function that directly returns the mel spectrogram.

    :param speaker_embed: the value fed to the speaker_embeddings placeholder of the model
    :param text: the text to synthesize
    :param raise_exception: if True, an exception is raised when the model did not predict
    any stop token. If False, the untrimmed spectrogram is returned in that case.
    :return: the predicted mel spectrogram of the text, cut at the first predicted stop token
    when there is one
    :raises Exception: if raise_exception is True and no stop token was predicted
    """
    # Prepare the input
    cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, max_seq_len = self._prepare_inputs(seqs)
    split_infos = [[max_seq_len, 0, 0, 0]]
    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.split_infos: np.asarray(split_infos, dtype=np.int32),
        self.speaker_embeddings: speaker_embed
    }

    # Forward it. Only the tensors that are used below are fetched.
    mels, stop_tokens = self.session.run(
        [self.mel_outputs, self.stop_token_prediction],
        feed_dict=feed_dict)
    # Keep the first element of the first entry of each output: a single text was fed
    mel, stop_token = mels[0][0], stop_tokens[0][0]

    # Trim the output at the first position where the rounded stop token equals 1. The
    # rounded prediction is a numpy array, which has no index() method like lists do, so the
    # position is looked up with flatnonzero instead.
    # NOTE(review): assumes stop_token holds one value per frame of mel - TODO confirm.
    stop_positions = np.flatnonzero(np.round(stop_token) == 1)
    if stop_positions.size > 0:
        mel = mel[:stop_positions[0], :]
    elif raise_exception:
        raise Exception("Tacotron could not generate a stop token.")

    return mel
def
synthesize
(
self
,
texts
,
basenames
,
out_dir
,
log_dir
,
mel_filenames
):
hparams
=
self
.
_hparams
...
...
@@ -116,8 +152,7 @@ class Synthesizer:
assert
len
(
np_targets
)
==
len
(
texts
)
feed_dict
[
self
.
split_infos
]
=
np
.
asarray
(
split_infos
,
dtype
=
np
.
int32
)
embed_fpath
=
r
"E:\Datasets\Synthesizer\embed\embed-85-121551-0036.npy"
feed_dict
[
self
.
speaker_embeddings
]
=
np
.
load
(
embed_fpath
)[
None
,
...]
feed_dict
[
self
.
speaker_embeddings
]
=
np
.
zeros
((
len
(
texts
),
256
))
if
self
.
gta
or
not
hparams
.
predict_linear
:
mels
,
alignments
,
stop_tokens
=
self
.
session
.
run
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录