Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
f191d0b0
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f191d0b0
编写于
1月 04, 2022
作者:
J
Jerryuhoo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change speaker embedding position
Change speaker embedding position into the encoder.
上级
11991b6d
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
18 addition
and
122 deletion
+18
-122
examples/aishell3/tts2/default_multi.yaml
examples/aishell3/tts2/default_multi.yaml
+0
-52
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+18
-70
未找到文件。
examples/aishell3/tts2/default_multi.yaml
已删除
100644 → 0
浏览文件 @
11991b6d
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# Sampling rate.
n_fft
:
2048
# FFT size (samples).
n_shift
:
300
# Hop size (samples). 12.5ms
win_length
:
1200
# Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
n_mels
:
80
# Number of mel basis.
fmin
:
80
# Minimum freq in mel basis calculation.
fmax
:
7600
# Maximum frequency in mel basis calculation.
###########################################################
# DATA SETTING #
###########################################################
batch_size
:
32
num_workers
:
4
###########################################################
# MODEL SETTING #
###########################################################
model
:
encoder_hidden_size
:
128
encoder_kernel_size
:
3
encoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
duration_predictor_hidden_size
:
128
decoder_hidden_size
:
128
decoder_output_size
:
80
decoder_kernel_size
:
3
decoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
spk_embed_dim
:
256
spk_embed_integration_type
:
add
# speaker embedding integration type
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.002
# learning rate
max_grad_norm
:
1
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch
:
100
num_snapshots
:
5
###########################################################
# OTHER SETTING #
###########################################################
seed
:
10086
\ No newline at end of file
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
浏览文件 @
f191d0b0
...
@@ -96,7 +96,7 @@ class TextEmbedding(nn.Layer):
...
@@ -96,7 +96,7 @@ class TextEmbedding(nn.Layer):
class
SpeedySpeechEncoder
(
nn
.
Layer
):
class
SpeedySpeechEncoder
(
nn
.
Layer
):
def
__init__
(
self
,
vocab_size
,
tone_size
,
hidden_size
,
kernel_size
,
def
__init__
(
self
,
vocab_size
,
tone_size
,
hidden_size
,
kernel_size
,
dilations
):
dilations
,
spk_num
=
None
):
super
().
__init__
()
super
().
__init__
()
self
.
embedding
=
TextEmbedding
(
self
.
embedding
=
TextEmbedding
(
vocab_size
,
vocab_size
,
...
@@ -104,6 +104,15 @@ class SpeedySpeechEncoder(nn.Layer):
...
@@ -104,6 +104,15 @@ class SpeedySpeechEncoder(nn.Layer):
tone_size
,
tone_size
,
padding_idx
=
0
,
padding_idx
=
0
,
tone_padding_idx
=
0
)
tone_padding_idx
=
0
)
if
spk_num
:
self
.
spk_emb
=
nn
.
Embedding
(
num_embeddings
=
spk_num
,
embedding_dim
=
hidden_size
,
padding_idx
=
0
)
else
:
self
.
spk_emb
=
None
self
.
prenet
=
nn
.
Sequential
(
self
.
prenet
=
nn
.
Sequential
(
nn
.
Linear
(
hidden_size
,
hidden_size
),
nn
.
Linear
(
hidden_size
,
hidden_size
),
nn
.
ReLU
(),
)
nn
.
ReLU
(),
)
...
@@ -118,8 +127,10 @@ class SpeedySpeechEncoder(nn.Layer):
...
@@ -118,8 +127,10 @@ class SpeedySpeechEncoder(nn.Layer):
nn
.
BatchNorm1D
(
hidden_size
,
data_format
=
"NLC"
),
nn
.
BatchNorm1D
(
hidden_size
,
data_format
=
"NLC"
),
nn
.
Linear
(
hidden_size
,
hidden_size
),
)
nn
.
Linear
(
hidden_size
,
hidden_size
),
)
def
forward
(
self
,
text
,
tones
):
def
forward
(
self
,
text
,
tones
,
spk_id
=
None
):
embedding
=
self
.
embedding
(
text
,
tones
)
embedding
=
self
.
embedding
(
text
,
tones
)
if
self
.
spk_emb
:
embedding
+=
self
.
spk_emb
(
spk_id
).
unsqueeze
(
1
)
embedding
=
self
.
prenet
(
embedding
)
embedding
=
self
.
prenet
(
embedding
)
x
=
self
.
res_blocks
(
embedding
)
x
=
self
.
res_blocks
(
embedding
)
x
=
embedding
+
self
.
postnet1
(
x
)
x
=
embedding
+
self
.
postnet1
(
x
)
...
@@ -172,14 +183,11 @@ class SpeedySpeech(nn.Layer):
...
@@ -172,14 +183,11 @@ class SpeedySpeech(nn.Layer):
decoder_kernel_size
,
decoder_kernel_size
,
decoder_dilations
,
decoder_dilations
,
tone_size
=
None
,
tone_size
=
None
,
spk_num
:
int
=
None
,
spk_num
=
None
):
spk_embed_dim
:
int
=
None
,
spk_embed_integration_type
:
str
=
"add"
,
):
super
().
__init__
()
super
().
__init__
()
encoder
=
SpeedySpeechEncoder
(
vocab_size
,
tone_size
,
encoder
=
SpeedySpeechEncoder
(
vocab_size
,
tone_size
,
encoder_hidden_size
,
encoder_kernel_size
,
encoder_hidden_size
,
encoder_kernel_size
,
encoder_dilations
)
encoder_dilations
,
spk_num
)
duration_predictor
=
DurationPredictor
(
duration_predictor_hidden_size
)
duration_predictor
=
DurationPredictor
(
duration_predictor_hidden_size
)
decoder
=
SpeedySpeechDecoder
(
decoder_hidden_size
,
decoder_output_size
,
decoder
=
SpeedySpeechDecoder
(
decoder_hidden_size
,
decoder_output_size
,
decoder_kernel_size
,
decoder_dilations
)
decoder_kernel_size
,
decoder_dilations
)
...
@@ -187,27 +195,6 @@ class SpeedySpeech(nn.Layer):
...
@@ -187,27 +195,6 @@ class SpeedySpeech(nn.Layer):
self
.
encoder
=
encoder
self
.
encoder
=
encoder
self
.
duration_predictor
=
duration_predictor
self
.
duration_predictor
=
duration_predictor
self
.
decoder
=
decoder
self
.
decoder
=
decoder
self
.
spk_embed_dim
=
spk_embed_dim
# use idx 0 as padding idx
self
.
padding_idx
=
0
if
self
.
spk_embed_dim
is
not
None
:
self
.
spk_embed_integration_type
=
spk_embed_integration_type
if
spk_num
and
self
.
spk_embed_dim
:
self
.
spk_embedding_table
=
nn
.
Embedding
(
num_embeddings
=
spk_num
,
embedding_dim
=
self
.
spk_embed_dim
,
padding_idx
=
self
.
padding_idx
)
self
.
encoder_hidden_size
=
encoder_hidden_size
# define additional projection for speaker embedding
if
self
.
spk_embed_dim
is
not
None
:
print
(
"spk_embed_integration_type------------"
,
spk_embed_integration_type
)
if
self
.
spk_embed_integration_type
==
"add"
:
self
.
spk_projection
=
nn
.
Linear
(
self
.
spk_embed_dim
,
self
.
encoder_hidden_size
)
else
:
self
.
spk_projection
=
nn
.
Linear
(
self
.
encoder_hidden_size
+
self
.
spk_embed_dim
,
self
.
encoder_hidden_size
)
def
forward
(
self
,
text
,
tones
,
durations
,
spk_id
:
paddle
.
Tensor
=
None
):
def
forward
(
self
,
text
,
tones
,
durations
,
spk_id
:
paddle
.
Tensor
=
None
):
# input of embedding must be int64
# input of embedding must be int64
...
@@ -216,13 +203,7 @@ class SpeedySpeech(nn.Layer):
...
@@ -216,13 +203,7 @@ class SpeedySpeech(nn.Layer):
if
spk_id
is
not
None
:
if
spk_id
is
not
None
:
spk_id
=
paddle
.
cast
(
spk_id
,
'int64'
)
spk_id
=
paddle
.
cast
(
spk_id
,
'int64'
)
durations
=
paddle
.
cast
(
durations
,
'int64'
)
durations
=
paddle
.
cast
(
durations
,
'int64'
)
encodings
=
self
.
encoder
(
text
,
tones
)
encodings
=
self
.
encoder
(
text
,
tones
,
spk_id
)
# (B, T)
if
self
.
spk_embed_dim
is
not
None
:
if
spk_id
is
not
None
:
spk_emb
=
self
.
spk_embedding_table
(
spk_id
)
encodings
=
self
.
_integrate_with_spk_embed
(
encodings
,
spk_emb
)
pred_durations
=
self
.
duration_predictor
(
encodings
.
detach
())
pred_durations
=
self
.
duration_predictor
(
encodings
.
detach
())
...
@@ -237,7 +218,7 @@ class SpeedySpeech(nn.Layer):
...
@@ -237,7 +218,7 @@ class SpeedySpeech(nn.Layer):
decoded
=
self
.
decoder
(
encodings
)
decoded
=
self
.
decoder
(
encodings
)
return
decoded
,
pred_durations
return
decoded
,
pred_durations
def
inference
(
self
,
text
,
tones
=
None
,
spk_id
=
None
,
):
def
inference
(
self
,
text
,
tones
=
None
,
spk_id
=
None
):
# text: [T]
# text: [T]
# tones: [T]
# tones: [T]
# input of embedding must be int64
# input of embedding must be int64
...
@@ -247,11 +228,7 @@ class SpeedySpeech(nn.Layer):
...
@@ -247,11 +228,7 @@ class SpeedySpeech(nn.Layer):
tones
=
paddle
.
cast
(
tones
,
'int64'
)
tones
=
paddle
.
cast
(
tones
,
'int64'
)
tones
=
tones
.
unsqueeze
(
0
)
tones
=
tones
.
unsqueeze
(
0
)
encodings
=
self
.
encoder
(
text
,
tones
)
encodings
=
self
.
encoder
(
text
,
tones
,
spk_id
)
if
self
.
spk_embed_dim
is
not
None
:
if
spk_id
is
not
None
:
spk_emb
=
self
.
spk_embedding_table
(
spk_id
)
encodings
=
self
.
_integrate_with_spk_embed
(
encodings
,
spk_emb
)
pred_durations
=
self
.
duration_predictor
(
encodings
)
# (1, T)
pred_durations
=
self
.
duration_predictor
(
encodings
)
# (1, T)
durations_to_expand
=
paddle
.
round
(
pred_durations
.
exp
())
durations_to_expand
=
paddle
.
round
(
pred_durations
.
exp
())
...
@@ -278,35 +255,6 @@ class SpeedySpeech(nn.Layer):
...
@@ -278,35 +255,6 @@ class SpeedySpeech(nn.Layer):
decoded
=
self
.
decoder
(
encodings
)
decoded
=
self
.
decoder
(
encodings
)
return
decoded
[
0
]
return
decoded
[
0
]
def
_integrate_with_spk_embed
(
self
,
hs
,
spk_emb
):
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
----------
Tensor
Batch of integrated hidden state sequences (B, Tmax, adim)
"""
if
self
.
spk_embed_integration_type
==
"add"
:
# apply projection and then add to hidden states
spk_emb
=
self
.
spk_projection
(
F
.
normalize
(
spk_emb
))
hs
=
hs
+
spk_emb
.
unsqueeze
(
1
)
elif
self
.
spk_embed_integration_type
==
"concat"
:
# concat hidden states with spk embeds and then apply projection
spk_emb
=
F
.
normalize
(
spk_emb
).
unsqueeze
(
1
).
expand
(
shape
=
[
-
1
,
hs
.
shape
[
1
],
-
1
])
hs
=
self
.
spk_projection
(
paddle
.
concat
([
hs
,
spk_emb
],
axis
=-
1
))
else
:
raise
NotImplementedError
(
"support only add or concat."
)
return
hs
class
SpeedySpeechInference
(
nn
.
Layer
):
class
SpeedySpeechInference
(
nn
.
Layer
):
def
__init__
(
self
,
normalizer
,
speedyspeech_model
):
def
__init__
(
self
,
normalizer
,
speedyspeech_model
):
super
().
__init__
()
super
().
__init__
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录