Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
a97c7b52
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a97c7b52
编写于
11月 18, 2021
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rename spembs
上级
8d025451
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
92 addition
and
92 deletion
+92
-92
examples/aishell3/vc1/local/preprocess.sh
examples/aishell3/vc1/local/preprocess.sh
+1
-1
examples/csmsc/voc1/run.sh
examples/csmsc/voc1/run.sh
+1
-1
paddlespeech/t2s/datasets/am_batch_fn.py
paddlespeech/t2s/datasets/am_batch_fn.py
+8
-8
paddlespeech/t2s/exps/fastspeech2/normalize.py
paddlespeech/t2s/exps/fastspeech2/normalize.py
+3
-3
paddlespeech/t2s/exps/fastspeech2/preprocess.py
paddlespeech/t2s/exps/fastspeech2/preprocess.py
+15
-15
paddlespeech/t2s/exps/fastspeech2/synthesize.py
paddlespeech/t2s/exps/fastspeech2/synthesize.py
+5
-5
paddlespeech/t2s/exps/fastspeech2/train.py
paddlespeech/t2s/exps/fastspeech2/train.py
+2
-2
paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
+9
-8
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+23
-23
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
+6
-6
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+19
-20
未找到文件。
examples/aishell3/vc1/local/preprocess.sh
浏览文件 @
a97c7b52
...
...
@@ -35,7 +35,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--config
=
${
config_path
}
\
--num-cpu
=
20
\
--cut-sil
=
True
\
--
embed-
dir
=
dump/embed
--
spk_emb_
dir
=
dump/embed
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
...
...
examples/csmsc/voc1/run.sh
浏览文件 @
a97c7b52
...
...
@@ -3,7 +3,7 @@
set
-e
source
path.sh
gpus
=
0,1
gpus
=
4,5
stage
=
0
stop_stage
=
100
...
...
paddlespeech/t2s/datasets/am_batch_fn.py
浏览文件 @
a97c7b52
...
...
@@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def
fastspeech2_multi_spk_batch_fn
(
examples
):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"sp
embs
"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"sp
k_emb
"]
text
=
[
np
.
array
(
item
[
"text"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
speech
=
[
np
.
array
(
item
[
"speech"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
pitch
=
[
np
.
array
(
item
[
"pitch"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
...
...
@@ -139,14 +139,14 @@ def fastspeech2_multi_spk_batch_fn(examples):
"pitch"
:
pitch
,
"energy"
:
energy
}
# sp
embs
has a higher priority than spk_id
if
"sp
embs
"
in
examples
[
0
]:
sp
embs
=
[
np
.
array
(
item
[
"sp
embs
"
],
dtype
=
np
.
float32
)
for
item
in
examples
# sp
k_emb
has a higher priority than spk_id
if
"sp
k_emb
"
in
examples
[
0
]:
sp
k_emb
=
[
np
.
array
(
item
[
"sp
k_emb
"
],
dtype
=
np
.
float32
)
for
item
in
examples
]
sp
embs
=
batch_sequences
(
spembs
)
sp
embs
=
paddle
.
to_tensor
(
spembs
)
batch
[
"sp
embs"
]
=
spembs
sp
k_emb
=
batch_sequences
(
spk_emb
)
sp
k_emb
=
paddle
.
to_tensor
(
spk_emb
)
batch
[
"sp
k_emb"
]
=
spk_emb
elif
"spk_id"
in
examples
[
0
]:
spk_id
=
[
np
.
array
(
item
[
"spk_id"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
spk_id
=
paddle
.
to_tensor
(
spk_id
)
...
...
paddlespeech/t2s/exps/fastspeech2/normalize.py
浏览文件 @
a97c7b52
...
...
@@ -167,9 +167,9 @@ def main():
"pitch"
:
str
(
pitch_path
),
"energy"
:
str
(
energy_path
)
}
# add sp
embs
for voice cloning
if
"sp
embs
"
in
item
:
record
[
"sp
embs"
]
=
str
(
item
[
"spembs
"
])
# add sp
k_emb
for voice cloning
if
"sp
k_emb
"
in
item
:
record
[
"sp
k_emb"
]
=
str
(
item
[
"spk_emb
"
])
output_metadata
.
append
(
record
)
output_metadata
.
sort
(
key
=
itemgetter
(
'utt_id'
))
...
...
paddlespeech/t2s/exps/fastspeech2/preprocess.py
浏览文件 @
a97c7b52
...
...
@@ -45,7 +45,7 @@ def process_sentence(config: Dict[str, Any],
pitch_extractor
=
None
,
energy_extractor
=
None
,
cut_sil
:
bool
=
True
,
embed
_dir
:
Path
=
None
):
spk_emb
_dir
:
Path
=
None
):
utt_id
=
fp
.
stem
# for vctk
if
utt_id
.
endswith
(
"_mic2"
):
...
...
@@ -117,12 +117,12 @@ def process_sentence(config: Dict[str, Any],
"energy"
:
str
(
energy_path
),
"speaker"
:
speaker
}
if
embed
_dir
:
if
speaker
in
os
.
listdir
(
embed
_dir
):
if
spk_emb
_dir
:
if
speaker
in
os
.
listdir
(
spk_emb
_dir
):
embed_name
=
utt_id
+
".npy"
embed_path
=
embed
_dir
/
speaker
/
embed_name
embed_path
=
spk_emb
_dir
/
speaker
/
embed_name
if
embed_path
.
is_file
():
record
[
"sp
embs
"
]
=
str
(
embed_path
)
record
[
"sp
k_emb
"
]
=
str
(
embed_path
)
else
:
return
None
return
record
...
...
@@ -137,13 +137,13 @@ def process_sentences(config,
energy_extractor
=
None
,
nprocs
:
int
=
1
,
cut_sil
:
bool
=
True
,
embed
_dir
:
Path
=
None
):
spk_emb
_dir
:
Path
=
None
):
if
nprocs
==
1
:
results
=
[]
for
fp
in
fps
:
record
=
process_sentence
(
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
,
embed
_dir
)
energy_extractor
,
cut_sil
,
spk_emb
_dir
)
if
record
:
results
.
append
(
record
)
else
:
...
...
@@ -154,7 +154,7 @@ def process_sentences(config,
future
=
pool
.
submit
(
process_sentence
,
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
,
embed
_dir
)
cut_sil
,
spk_emb
_dir
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
(
future
)
...
...
@@ -213,7 +213,7 @@ def main():
help
=
"whether cut sil in the edge of audio"
)
parser
.
add_argument
(
"--
embed-
dir"
,
"--
spk_emb_
dir"
,
default
=
None
,
type
=
str
,
help
=
"directory to speaker embedding files."
)
...
...
@@ -226,10 +226,10 @@ def main():
dumpdir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
dur_file
=
Path
(
args
.
dur_file
).
expanduser
()
if
args
.
embed
_dir
:
embed_dir
=
Path
(
args
.
embed
_dir
).
expanduser
().
resolve
()
if
args
.
spk_emb
_dir
:
spk_emb_dir
=
Path
(
args
.
spk_emb
_dir
).
expanduser
().
resolve
()
else
:
embed
_dir
=
None
spk_emb
_dir
=
None
assert
rootdir
.
is_dir
()
assert
dur_file
.
is_file
()
...
...
@@ -339,7 +339,7 @@ def main():
energy_extractor
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
,
embed_dir
=
embed
_dir
)
spk_emb_dir
=
spk_emb
_dir
)
if
dev_wav_files
:
process_sentences
(
config
,
...
...
@@ -350,7 +350,7 @@ def main():
pitch_extractor
,
energy_extractor
,
cut_sil
=
args
.
cut_sil
,
embed_dir
=
embed
_dir
)
spk_emb_dir
=
spk_emb
_dir
)
if
test_wav_files
:
process_sentences
(
config
,
...
...
@@ -362,7 +362,7 @@ def main():
energy_extractor
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
,
embed_dir
=
embed
_dir
)
spk_emb_dir
=
spk_emb
_dir
)
if
__name__
==
"__main__"
:
...
...
paddlespeech/t2s/exps/fastspeech2/synthesize.py
浏览文件 @
a97c7b52
...
...
@@ -49,7 +49,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields
+=
[
"spk_id"
]
elif
args
.
voice_cloning
:
print
(
"voice cloning!"
)
fields
+=
[
"sp
embs
"
]
fields
+=
[
"sp
k_emb
"
]
else
:
print
(
"single speaker fastspeech2!"
)
print
(
"num_speakers:"
,
num_speakers
)
...
...
@@ -99,15 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for
datum
in
test_dataset
:
utt_id
=
datum
[
"utt_id"
]
text
=
paddle
.
to_tensor
(
datum
[
"text"
])
sp
embs
=
None
sp
k_emb
=
None
spk_id
=
None
if
args
.
voice_cloning
and
"sp
embs
"
in
datum
:
sp
embs
=
paddle
.
to_tensor
(
np
.
load
(
datum
[
"spembs
"
]))
if
args
.
voice_cloning
and
"sp
k_emb
"
in
datum
:
sp
k_emb
=
paddle
.
to_tensor
(
np
.
load
(
datum
[
"spk_emb
"
]))
elif
"spk_id"
in
datum
:
spk_id
=
paddle
.
to_tensor
(
datum
[
"spk_id"
])
with
paddle
.
no_grad
():
wav
=
pwg_inference
(
fastspeech2_inference
(
text
,
spk_id
=
spk_id
,
sp
embs
=
spembs
))
fastspeech2_inference
(
text
,
spk_id
=
spk_id
,
sp
k_emb
=
spk_emb
))
sf
.
write
(
str
(
output_dir
/
(
utt_id
+
".wav"
)),
wav
.
numpy
(),
...
...
paddlespeech/t2s/exps/fastspeech2/train.py
浏览文件 @
a97c7b52
...
...
@@ -73,8 +73,8 @@ def train_sp(args, config):
elif
args
.
voice_cloning
:
print
(
"Training voice cloning!"
)
collate_fn
=
fastspeech2_multi_spk_batch_fn
fields
+=
[
"sp
embs
"
]
converters
[
"sp
embs
"
]
=
np
.
load
fields
+=
[
"sp
k_emb
"
]
converters
[
"sp
k_emb
"
]
=
np
.
load
else
:
print
(
"single speaker fastspeech2!"
)
collate_fn
=
fastspeech2_single_spk_batch_fn
...
...
paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
浏览文件 @
a97c7b52
...
...
@@ -107,24 +107,25 @@ def voice_cloning(args, fastspeech2_config, pwg_config):
mel_sequences
=
p
.
extract_mel_partials
(
p
.
preprocess_wav
(
ref_audio_path
))
# print("mel_sequences: ", mel_sequences.shape)
with
paddle
.
no_grad
():
sp
embs
=
speaker_encoder
.
embed_utterance
(
sp
k_emb
=
speaker_encoder
.
embed_utterance
(
paddle
.
to_tensor
(
mel_sequences
))
# print("sp
embs shape: ", spembs
.shape)
# print("sp
k_emb shape: ", spk_emb
.shape)
with
paddle
.
no_grad
():
wav
=
pwg_inference
(
fastspeech2_inference
(
phone_ids
,
spembs
=
spembs
))
wav
=
pwg_inference
(
fastspeech2_inference
(
phone_ids
,
spk_emb
=
spk_emb
))
sf
.
write
(
str
(
output_dir
/
(
utt_id
+
".wav"
)),
wav
.
numpy
(),
samplerate
=
fastspeech2_config
.
fs
)
print
(
f
"
{
utt_id
}
done!"
)
# Randomly generate numbers of 0 ~ 0.2, 256 is the dim of sp
embs
random_sp
embs
=
np
.
random
.
rand
(
256
)
*
0.2
random_sp
embs
=
paddle
.
to_tensor
(
random_spembs
)
utt_id
=
"random_sp
embs
"
# Randomly generate numbers of 0 ~ 0.2, 256 is the dim of sp
k_emb
random_sp
k_emb
=
np
.
random
.
rand
(
256
)
*
0.2
random_sp
k_emb
=
paddle
.
to_tensor
(
random_spk_emb
)
utt_id
=
"random_sp
k_emb
"
with
paddle
.
no_grad
():
wav
=
pwg_inference
(
fastspeech2_inference
(
phone_ids
,
sp
embs
=
spembs
))
wav
=
pwg_inference
(
fastspeech2_inference
(
phone_ids
,
sp
k_emb
=
spk_emb
))
sf
.
write
(
str
(
output_dir
/
(
utt_id
+
".wav"
)),
wav
.
numpy
(),
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
浏览文件 @
a97c7b52
...
...
@@ -297,7 +297,7 @@ class FastSpeech2(nn.Layer):
pitch
:
paddle
.
Tensor
,
energy
:
paddle
.
Tensor
,
tone_id
:
paddle
.
Tensor
=
None
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
spk_id
:
paddle
.
Tensor
=
None
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
"""Calculate forward propagation.
...
...
@@ -320,7 +320,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tensor, optional(int64)
Batch of speaker ids (B,)
...
...
@@ -364,7 +364,7 @@ class FastSpeech2(nn.Layer):
ps
,
es
,
is_inference
=
False
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
tone_id
=
tone_id
)
# modify mod part of groundtruth
...
...
@@ -385,7 +385,7 @@ class FastSpeech2(nn.Layer):
es
:
paddle
.
Tensor
=
None
,
is_inference
:
bool
=
False
,
alpha
:
float
=
1.0
,
sp
embs
=
None
,
sp
k_emb
=
None
,
spk_id
=
None
,
tone_id
=
None
)
->
Sequence
[
paddle
.
Tensor
]:
# forward encoder
...
...
@@ -395,12 +395,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding
if
self
.
spk_embed_dim
is
not
None
:
# sp
embs
has a higher priority than spk_id
if
sp
embs
is
not
None
:
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
# sp
k_emb
has a higher priority than spk_id
if
sp
k_emb
is
not
None
:
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
elif
spk_id
is
not
None
:
sp
embs
=
self
.
spk_embedding_table
(
spk_id
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
sp
k_emb
=
self
.
spk_embedding_table
(
spk_id
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# integrate tone embedding
if
self
.
tone_embed_dim
is
not
None
:
...
...
@@ -488,7 +488,7 @@ class FastSpeech2(nn.Layer):
energy
:
paddle
.
Tensor
=
None
,
alpha
:
float
=
1.0
,
use_teacher_forcing
:
bool
=
False
,
sp
embs
=
None
,
sp
k_emb
=
None
,
spk_id
=
None
,
tone_id
=
None
,
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
...
...
@@ -511,7 +511,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,).
...
...
@@ -535,8 +535,8 @@ class FastSpeech2(nn.Layer):
if
y
is
not
None
:
ys
=
y
.
unsqueeze
(
0
)
if
sp
embs
is
not
None
:
sp
embs
=
spembs
.
unsqueeze
(
0
)
if
sp
k_emb
is
not
None
:
sp
k_emb
=
spk_emb
.
unsqueeze
(
0
)
if
tone_id
is
not
None
:
tone_id
=
tone_id
.
unsqueeze
(
0
)
...
...
@@ -555,7 +555,7 @@ class FastSpeech2(nn.Layer):
ds
=
ds
,
ps
=
ps
,
es
=
es
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
tone_id
=
tone_id
,
is_inference
=
True
)
...
...
@@ -567,19 +567,19 @@ class FastSpeech2(nn.Layer):
ys
,
is_inference
=
True
,
alpha
=
alpha
,
sp
embs
=
spembs
,
sp
k_emb
=
spk_emb
,
spk_id
=
spk_id
,
tone_id
=
tone_id
)
return
outs
[
0
],
d_outs
[
0
],
p_outs
[
0
],
e_outs
[
0
]
def
_integrate_with_spk_embed
(
self
,
hs
,
sp
embs
):
def
_integrate_with_spk_embed
(
self
,
hs
,
sp
k_emb
):
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
sp
embs
: Tensor
sp
k_emb
: Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
...
...
@@ -589,13 +589,13 @@ class FastSpeech2(nn.Layer):
"""
if
self
.
spk_embed_integration_type
==
"add"
:
# apply projection and then add to hidden states
sp
embs
=
self
.
spk_projection
(
F
.
normalize
(
spembs
))
hs
=
hs
+
sp
embs
.
unsqueeze
(
1
)
sp
k_emb
=
self
.
spk_projection
(
F
.
normalize
(
spk_emb
))
hs
=
hs
+
sp
k_emb
.
unsqueeze
(
1
)
elif
self
.
spk_embed_integration_type
==
"concat"
:
# concat hidden states with spk embeds and then apply projection
sp
embs
=
F
.
normalize
(
spembs
).
unsqueeze
(
1
).
expand
(
sp
k_emb
=
F
.
normalize
(
spk_emb
).
unsqueeze
(
1
).
expand
(
shape
=
[
-
1
,
hs
.
shape
[
1
],
-
1
])
hs
=
self
.
spk_projection
(
paddle
.
concat
([
hs
,
sp
embs
],
axis
=-
1
))
hs
=
self
.
spk_projection
(
paddle
.
concat
([
hs
,
sp
k_emb
],
axis
=-
1
))
else
:
raise
NotImplementedError
(
"support only add or concat."
)
...
...
@@ -680,9 +680,9 @@ class FastSpeech2Inference(nn.Layer):
self
.
normalizer
=
normalizer
self
.
acoustic_model
=
model
def
forward
(
self
,
text
,
spk_id
=
None
,
sp
embs
=
None
):
def
forward
(
self
,
text
,
spk_id
=
None
,
sp
k_emb
=
None
):
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
spk_id
=
spk_id
,
sp
embs
=
spembs
)
text
,
spk_id
=
spk_id
,
sp
k_emb
=
spk_emb
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
return
logmel
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
浏览文件 @
a97c7b52
...
...
@@ -54,9 +54,9 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict
=
{}
# spk_id!=None in multiple spk fastspeech2
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
sp
embs
=
batch
[
"spembs"
]
if
"spembs
"
in
batch
else
None
sp
k_emb
=
batch
[
"spk_emb"
]
if
"spk_emb
"
in
batch
else
None
# No explicit speaker identifier labels are used during voice cloning training.
if
sp
embs
is
not
None
:
if
sp
k_emb
is
not
None
:
spk_id
=
None
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
...
...
@@ -68,7 +68,7 @@ class FastSpeech2Updater(StandardUpdater):
pitch
=
batch
[
"pitch"
],
energy
=
batch
[
"energy"
],
spk_id
=
spk_id
,
sp
embs
=
spembs
)
sp
k_emb
=
spk_emb
)
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
after_outs
=
after_outs
,
...
...
@@ -131,8 +131,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict
=
{}
# spk_id!=None in multiple spk fastspeech2
spk_id
=
batch
[
"spk_id"
]
if
"spk_id"
in
batch
else
None
sp
embs
=
batch
[
"spembs"
]
if
"spembs
"
in
batch
else
None
if
sp
embs
is
not
None
:
sp
k_emb
=
batch
[
"spk_emb"
]
if
"spk_emb
"
in
batch
else
None
if
sp
k_emb
is
not
None
:
spk_id
=
None
before_outs
,
after_outs
,
d_outs
,
p_outs
,
e_outs
,
ys
,
olens
=
self
.
model
(
...
...
@@ -144,7 +144,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
pitch
=
batch
[
"pitch"
],
energy
=
batch
[
"energy"
],
spk_id
=
spk_id
,
sp
embs
=
spembs
)
sp
k_emb
=
spk_emb
)
l1_loss
,
duration_loss
,
pitch_loss
,
energy_loss
=
self
.
criterion
(
after_outs
=
after_outs
,
...
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
浏览文件 @
a97c7b52
...
...
@@ -391,7 +391,7 @@ class TransformerTTS(nn.Layer):
text_lengths
:
paddle
.
Tensor
,
speech
:
paddle
.
Tensor
,
speech_lengths
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
)
->
Tuple
[
paddle
.
Tensor
,
Dict
[
str
,
paddle
.
Tensor
],
paddle
.
Tensor
]:
"""Calculate forward propagation.
...
...
@@ -405,7 +405,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
...
...
@@ -439,7 +439,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
after_outs
,
before_outs
,
logits
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
embs
)
sp
k_emb
)
# modify mod part of groundtruth
...
...
@@ -467,7 +467,7 @@ class TransformerTTS(nn.Layer):
ilens
:
paddle
.
Tensor
,
ys
:
paddle
.
Tensor
,
olens
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
,
sp
k_emb
:
paddle
.
Tensor
,
)
->
Tuple
[
paddle
.
Tensor
,
paddle
.
Tensor
,
paddle
.
Tensor
]:
# forward encoder
x_masks
=
self
.
_source_mask
(
ilens
)
...
...
@@ -480,7 +480,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
if
self
.
spk_embed_dim
is
not
None
:
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if
self
.
reduction_factor
>
1
:
...
...
@@ -514,7 +514,7 @@ class TransformerTTS(nn.Layer):
self
,
text
:
paddle
.
Tensor
,
speech
:
paddle
.
Tensor
=
None
,
sp
embs
:
paddle
.
Tensor
=
None
,
sp
k_emb
:
paddle
.
Tensor
=
None
,
threshold
:
float
=
0.5
,
minlenratio
:
float
=
0.0
,
maxlenratio
:
float
=
10.0
,
...
...
@@ -528,7 +528,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
sp
embs
: Tensor, optional
sp
k_emb
: Tensor, optional
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
Threshold in inference.
...
...
@@ -551,7 +551,6 @@ class TransformerTTS(nn.Layer):
"""
# input of embedding must be int64
y
=
speech
spemb
=
spembs
# add eos at the last of sequence
text
=
numpy
.
pad
(
...
...
@@ -564,12 +563,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
xs
,
ys
=
x
.
unsqueeze
(
0
),
y
.
unsqueeze
(
0
)
sp
embs
=
None
if
spemb
is
None
else
sp
emb
.
unsqueeze
(
0
)
sp
k_emb
=
None
if
spk_emb
is
None
else
spk_
emb
.
unsqueeze
(
0
)
ilens
=
paddle
.
to_tensor
(
[
xs
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
xs
.
place
)
olens
=
paddle
.
to_tensor
(
[
ys
.
shape
[
1
]],
dtype
=
paddle
.
int64
,
place
=
ys
.
place
)
outs
,
*
_
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
embs
)
outs
,
*
_
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
sp
k_emb
)
# get attention weights
att_ws
=
[]
...
...
@@ -590,9 +589,9 @@ class TransformerTTS(nn.Layer):
hs
=
hs
+
style_embs
.
unsqueeze
(
1
)
# integrate speaker embedding
if
s
elf
.
spk_embed_dim
is
not
None
:
sp
embs
=
sp
emb
.
unsqueeze
(
0
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
embs
)
if
s
pk_emb
is
not
None
:
sp
k_emb
=
spk_
emb
.
unsqueeze
(
0
)
hs
=
self
.
_integrate_with_spk_embed
(
hs
,
sp
k_emb
)
# set limits of length
maxlen
=
int
(
hs
.
shape
[
1
]
*
maxlenratio
/
self
.
reduction_factor
)
...
...
@@ -726,14 +725,14 @@ class TransformerTTS(nn.Layer):
def
_integrate_with_spk_embed
(
self
,
hs
:
paddle
.
Tensor
,
sp
embs
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
sp
k_emb
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
sp
embs
: Tensor
sp
k_emb
: Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
...
...
@@ -744,13 +743,13 @@ class TransformerTTS(nn.Layer):
"""
if
self
.
spk_embed_integration_type
==
"add"
:
# apply projection and then add to hidden states
sp
embs
=
self
.
projection
(
F
.
normalize
(
spembs
))
hs
=
hs
+
sp
embs
.
unsqueeze
(
1
)
sp
k_emb
=
self
.
projection
(
F
.
normalize
(
spk_emb
))
hs
=
hs
+
sp
k_emb
.
unsqueeze
(
1
)
elif
self
.
spk_embed_integration_type
==
"concat"
:
# concat hidden states with spk embeds and then apply projection
sp
embs
=
F
.
normalize
(
spembs
).
unsqueeze
(
1
).
expand
(
-
1
,
hs
.
shape
[
1
],
-
1
)
hs
=
self
.
projection
(
paddle
.
concat
([
hs
,
sp
embs
],
axis
=-
1
))
sp
k_emb
=
F
.
normalize
(
spk_emb
).
unsqueeze
(
1
).
expand
(
-
1
,
hs
.
shape
[
1
],
-
1
)
hs
=
self
.
projection
(
paddle
.
concat
([
hs
,
sp
k_emb
],
axis
=-
1
))
else
:
raise
NotImplementedError
(
"support only add or concat."
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录