Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
96323816
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
96323816
编写于
1月 18, 2022
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix yamls, change labels to stop_labels, test=tts
上级
1bf1a876
变更
14
隐藏空白更改
内联
并排
Showing 14 changed files with 53 additions and 73 deletions
+53
-73
examples/aishell3/tts3/conf/default.yaml
examples/aishell3/tts3/conf/default.yaml
+2
-2
examples/aishell3/vc1/conf/default.yaml
examples/aishell3/vc1/conf/default.yaml
+2
-2
examples/csmsc/tts0/conf/default.yaml
examples/csmsc/tts0/conf/default.yaml
+0
-4
examples/csmsc/tts3/conf/conformer.yaml
examples/csmsc/tts3/conf/conformer.yaml
+2
-2
examples/csmsc/tts3/conf/default.yaml
examples/csmsc/tts3/conf/default.yaml
+2
-2
examples/ljspeech/tts3/conf/default.yaml
examples/ljspeech/tts3/conf/default.yaml
+2
-2
examples/vctk/tts3/conf/default.yaml
examples/vctk/tts3/conf/default.yaml
+2
-2
paddlespeech/t2s/exps/new_tacotron2/preprocess.py
paddlespeech/t2s/exps/new_tacotron2/preprocess.py
+1
-26
paddlespeech/t2s/models/new_tacotron2/tacotron2.py
paddlespeech/t2s/models/new_tacotron2/tacotron2.py
+7
-6
paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
+16
-6
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+7
-9
paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
+4
-4
paddlespeech/t2s/modules/losses.py
paddlespeech/t2s/modules/losses.py
+5
-5
paddlespeech/t2s/modules/tacotron2/attentions.py
paddlespeech/t2s/modules/tacotron2/attentions.py
+1
-1
未找到文件。
examples/aishell3/tts3/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
examples/aishell3/vc1/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
examples/csmsc/tts0/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -21,10 +21,6 @@ fmin: 80 # Minimum frequency of Mel basis.
fmax
:
7600
# Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min
:
80
# Maximum f0 for pitch extraction.
f0max
:
400
# Minimum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
...
...
examples/csmsc/tts3/conf/conformer.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
examples/csmsc/tts3/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
examples/ljspeech/tts3/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
examples/vctk/tts3/conf/default.yaml
浏览文件 @
96323816
...
...
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
...
...
paddlespeech/t2s/exps/new_tacotron2/preprocess.py
浏览文件 @
96323816
...
...
@@ -27,9 +27,7 @@ import tqdm
import
yaml
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.data.get_feats
import
Energy
from
paddlespeech.t2s.data.get_feats
import
LogMelFBank
from
paddlespeech.t2s.data.get_feats
import
Pitch
from
paddlespeech.t2s.datasets.preprocess_utils
import
compare_duration_and_mel_length
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_input_token
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_phn_dur
...
...
@@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any],
sentences
:
Dict
,
output_dir
:
Path
,
mel_extractor
=
None
,
pitch_extractor
=
None
,
energy_extractor
=
None
,
cut_sil
:
bool
=
True
,
spk_emb_dir
:
Path
=
None
):
utt_id
=
fp
.
stem
...
...
@@ -117,8 +113,6 @@ def process_sentences(config,
sentences
:
Dict
,
output_dir
:
Path
,
mel_extractor
=
None
,
pitch_extractor
=
None
,
energy_extractor
=
None
,
nprocs
:
int
=
1
,
cut_sil
:
bool
=
True
,
spk_emb_dir
:
Path
=
None
):
...
...
@@ -126,8 +120,7 @@ def process_sentences(config,
results
=
[]
for
fp
in
fps
:
record
=
process_sentence
(
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
,
spk_emb_dir
)
mel_extractor
,
cut_sil
,
spk_emb_dir
)
if
record
:
results
.
append
(
record
)
else
:
...
...
@@ -137,7 +130,6 @@ def process_sentences(config,
for
fp
in
fps
:
future
=
pool
.
submit
(
process_sentence
,
config
,
fp
,
sentences
,
output_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
,
spk_emb_dir
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
(
future
)
...
...
@@ -299,17 +291,6 @@ def main():
n_mels
=
config
.
n_mels
,
fmin
=
config
.
fmin
,
fmax
=
config
.
fmax
)
pitch_extractor
=
Pitch
(
sr
=
config
.
fs
,
hop_length
=
config
.
n_shift
,
f0min
=
config
.
f0min
,
f0max
=
config
.
f0max
)
energy_extractor
=
Energy
(
sr
=
config
.
fs
,
n_fft
=
config
.
n_fft
,
hop_length
=
config
.
n_shift
,
win_length
=
config
.
win_length
,
window
=
config
.
window
)
# process for the 3 sections
if
train_wav_files
:
...
...
@@ -319,8 +300,6 @@ def main():
sentences
,
train_dump_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
...
...
@@ -331,8 +310,6 @@ def main():
sentences
,
dev_dump_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
if
test_wav_files
:
...
...
@@ -342,8 +319,6 @@ def main():
sentences
,
test_dump_dir
,
mel_extractor
,
pitch_extractor
,
energy_extractor
,
nprocs
=
args
.
num_cpu
,
cut_sil
=
args
.
cut_sil
,
spk_emb_dir
=
spk_emb_dir
)
...
...
paddlespeech/t2s/models/new_tacotron2/tacotron2.py
浏览文件 @
96323816
...
...
@@ -300,10 +300,10 @@ class Tacotron2(nn.Layer):
olens
=
speech_lengths
# make labels for stop prediction
labels
=
make_pad_mask
(
olens
-
1
)
stop_
labels
=
make_pad_mask
(
olens
-
1
)
# bool 类型无法切片
labels
=
paddle
.
cast
(
labels
,
dtype
=
'float32'
)
labels
=
F
.
pad
(
labels
,
[
0
,
0
,
0
,
1
],
"constant"
,
1.0
)
stop_labels
=
paddle
.
cast
(
stop_
labels
,
dtype
=
'float32'
)
stop_labels
=
F
.
pad
(
stop_
labels
,
[
0
,
0
,
0
,
1
],
"constant"
,
1.0
)
# calculate tacotron2 outputs
after_outs
,
before_outs
,
logits
,
att_ws
=
self
.
_forward
(
...
...
@@ -322,12 +322,13 @@ class Tacotron2(nn.Layer):
olens
=
olens
-
olens
%
self
.
reduction_factor
max_out
=
max
(
olens
)
ys
=
ys
[:,
:
max_out
]
labels
=
labels
[:,
:
max_out
]
labels
=
paddle
.
scatter
(
labels
,
1
,
(
olens
-
1
).
unsqueeze
(
1
),
1.0
)
stop_labels
=
stop_labels
[:,
:
max_out
]
stop_labels
=
paddle
.
scatter
(
stop_labels
,
1
,
(
olens
-
1
).
unsqueeze
(
1
),
1.0
)
olens_in
=
olens
//
self
.
reduction_factor
else
:
olens_in
=
olens
return
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
att_ws
,
olens_in
return
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
att_ws
,
olens_in
def
_forward
(
self
,
...
...
paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
浏览文件 @
96323816
...
...
@@ -74,7 +74,7 @@ class Tacotron2Updater(StandardUpdater):
if
spk_emb
is
not
None
:
spk_id
=
None
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
att_ws
,
olens_in
=
self
.
model
(
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
att_ws
,
olens_in
=
self
.
model
(
text
=
batch
[
"text"
],
text_lengths
=
batch
[
"text_lengths"
],
speech
=
batch
[
"speech"
],
...
...
@@ -83,8 +83,13 @@ class Tacotron2Updater(StandardUpdater):
spk_emb
=
spk_emb
)
# calculate taco2 loss
l1_loss
,
mse_loss
,
bce_loss
=
self
.
taco2_loss
(
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
)
l1_loss
,
mse_loss
,
bce_loss
=
self
.
taco2_loss
(
after_outs
=
after_outs
,
before_outs
=
before_outs
,
logits
=
logits
,
ys
=
ys
,
stop_labels
=
stop_labels
,
olens
=
olens
)
if
self
.
loss_type
==
"L1+L2"
:
loss
=
l1_loss
+
mse_loss
+
bce_loss
...
...
@@ -164,7 +169,7 @@ class Tacotron2Evaluator(StandardEvaluator):
if
spk_emb
is
not
None
:
spk_id
=
None
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
att_ws
,
olens_in
=
self
.
model
(
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
att_ws
,
olens_in
=
self
.
model
(
text
=
batch
[
"text"
],
text_lengths
=
batch
[
"text_lengths"
],
speech
=
batch
[
"speech"
],
...
...
@@ -173,8 +178,13 @@ class Tacotron2Evaluator(StandardEvaluator):
spk_emb
=
spk_emb
)
# calculate taco2 loss
l1_loss
,
mse_loss
,
bce_loss
=
self
.
taco2_loss
(
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
)
l1_loss
,
mse_loss
,
bce_loss
=
self
.
taco2_loss
(
after_outs
=
after_outs
,
before_outs
=
before_outs
,
logits
=
logits
,
ys
=
ys
,
stop_labels
=
stop_labels
,
olens
=
olens
)
if
self
.
loss_type
==
"L1+L2"
:
loss
=
l1_loss
+
mse_loss
+
bce_loss
...
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py
浏览文件 @
96323816
...
...
@@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer):
olens
=
paddle
.
cast
(
speech_lengths
,
'int64'
)
# make labels for stop prediction
labels
=
make_pad_mask
(
olens
-
1
)
labels
=
numpy
.
pad
(
labels
.
numpy
(),
((
0
,
0
),
(
0
,
1
)),
'constant'
,
constant_values
=
1.0
)
labels
=
paddle
.
to_tensor
(
labels
)
labels
=
paddle
.
cast
(
labels
,
dtype
=
"float32"
)
# labels = F.pad(labels, [0, 1], "constant", 1.0)
stop_labels
=
make_pad_mask
(
olens
-
1
)
# bool 类型无法切片
stop_labels
=
paddle
.
cast
(
stop_labels
,
dtype
=
'float32'
)
stop_labels
=
F
.
pad
(
stop_labels
,
[
0
,
0
,
0
,
1
],
"constant"
,
1.0
)
# calculate transformer outputs
after_outs
,
before_outs
,
logits
=
self
.
_forward
(
xs
,
ilens
,
ys
,
olens
,
...
...
@@ -450,8 +448,8 @@ class TransformerTTS(nn.Layer):
olens
=
olens
-
olens
%
self
.
reduction_factor
max_olen
=
max
(
olens
)
ys
=
ys
[:,
:
max_olen
]
labels
=
labels
[:,
:
max_olen
]
labels
[:,
-
1
]
=
1.0
# make sure at least one frame has 1
stop_labels
=
stop_
labels
[:,
:
max_olen
]
stop_
labels
[:,
-
1
]
=
1.0
# make sure at least one frame has 1
olens_in
=
olens
//
self
.
reduction_factor
else
:
olens_in
=
olens
...
...
@@ -465,7 +463,7 @@ class TransformerTTS(nn.Layer):
'num_layers_applied_guided_attn'
]
=
self
.
num_layers_applied_guided_attn
need_dict
[
'use_scaled_pos_enc'
]
=
self
.
use_scaled_pos_enc
return
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
olens_in
,
need_dict
return
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
olens_in
,
need_dict
def
_forward
(
self
,
...
...
paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
浏览文件 @
96323816
...
...
@@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
self
.
msg
=
"Rank: {}, "
.
format
(
dist
.
get_rank
())
losses_dict
=
{}
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
olens_in
,
need_dict
=
self
.
model
(
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
olens_in
,
need_dict
=
self
.
model
(
text
=
batch
[
"text"
],
text_lengths
=
batch
[
"text_lengths"
],
speech
=
batch
[
"speech"
],
...
...
@@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
before_outs
=
before_outs
,
logits
=
logits
,
ys
=
ys
,
labels
=
labels
,
stop_labels
=
stop_
labels
,
olens
=
olens
)
report
(
"train/bce_loss"
,
float
(
bce_loss
))
...
...
@@ -226,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
def
evaluate_core
(
self
,
batch
):
self
.
msg
=
"Evaluate: "
losses_dict
=
{}
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
,
olens_in
,
need_dict
=
self
.
model
(
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
,
olens_in
,
need_dict
=
self
.
model
(
text
=
batch
[
"text"
],
text_lengths
=
batch
[
"text_lengths"
],
speech
=
batch
[
"speech"
],
...
...
@@ -237,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
before_outs
=
before_outs
,
logits
=
logits
,
ys
=
ys
,
labels
=
labels
,
stop_labels
=
stop_
labels
,
olens
=
olens
)
report
(
"eval/bce_loss"
,
float
(
bce_loss
))
...
...
paddlespeech/t2s/modules/losses.py
浏览文件 @
96323816
...
...
@@ -263,7 +263,7 @@ class Tacotron2Loss(nn.Layer):
self
.
bce_criterion
=
nn
.
BCEWithLogitsLoss
(
reduction
=
reduction
,
pos_weight
=
paddle
.
to_tensor
(
bce_pos_weight
))
def
forward
(
self
,
after_outs
,
before_outs
,
logits
,
ys
,
labels
,
olens
):
def
forward
(
self
,
after_outs
,
before_outs
,
logits
,
ys
,
stop_
labels
,
olens
):
"""Calculate forward propagation.
Parameters
----------
...
...
@@ -275,7 +275,7 @@ class Tacotron2Loss(nn.Layer):
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
-labels : Tensor(int64)
+stop_labels : Tensor(int64)
Batch of the sequences of stop token labels (B, Lmax).
olens : Tensor(int64)
Batch of the lengths of each target (B,).
...
...
@@ -296,8 +296,8 @@ class Tacotron2Loss(nn.Layer):
masks
.
broadcast_to
(
after_outs
.
shape
))
before_outs
=
before_outs
.
masked_select
(
masks
.
broadcast_to
(
before_outs
.
shape
))
labels
=
labels
.
masked_select
(
masks
[:,
:,
0
].
broadcast_to
(
labels
.
shape
))
stop_labels
=
stop_
labels
.
masked_select
(
masks
[:,
:,
0
].
broadcast_to
(
stop_
labels
.
shape
))
logits
=
logits
.
masked_select
(
masks
[:,
:,
0
].
broadcast_to
(
logits
.
shape
))
...
...
@@ -306,7 +306,7 @@ class Tacotron2Loss(nn.Layer):
before_outs
,
ys
)
mse_loss
=
self
.
mse_criterion
(
after_outs
,
ys
)
+
self
.
mse_criterion
(
before_outs
,
ys
)
bce_loss
=
self
.
bce_criterion
(
logits
,
labels
)
bce_loss
=
self
.
bce_criterion
(
logits
,
stop_
labels
)
# make weighted mask and apply it
if
self
.
use_weighted_masking
:
...
...
paddlespeech/t2s/modules/tacotron2/attentions.py
浏览文件 @
96323816
...
...
@@ -207,7 +207,7 @@ class AttLoc(nn.Layer):
w
=
F
.
softmax
(
scaling
*
e
,
axis
=
1
)
-# weighted sum over flames
+# weighted sum over frames
# utt x hdim
c
=
paddle
.
sum
(
self
.
enc_h
*
w
.
reshape
([
batch
,
self
.
h_length
,
1
]),
axis
=
1
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录