Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
94688264
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
94688264
编写于
7月 04, 2022
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add ernie sat model file and config
上级
0ea9def0
变更
46
展开全部
隐藏空白更改
内联
并排
Showing
46 changed file
with
3604 addition
and
81 deletion
+3604
-81
examples/aishell3/ernie_sat/conf/default.yaml
examples/aishell3/ernie_sat/conf/default.yaml
+282
-0
examples/aishell3/ernie_sat/local/preprocess.sh
examples/aishell3/ernie_sat/local/preprocess.sh
+61
-0
examples/aishell3/ernie_sat/local/synthesize.sh
examples/aishell3/ernie_sat/local/synthesize.sh
+1
-0
examples/aishell3/ernie_sat/local/train.sh
examples/aishell3/ernie_sat/local/train.sh
+12
-0
examples/aishell3/ernie_sat/path.sh
examples/aishell3/ernie_sat/path.sh
+13
-0
examples/aishell3/ernie_sat/run.sh
examples/aishell3/ernie_sat/run.sh
+32
-0
examples/aishell3/tts3/conf/conformer.yaml
examples/aishell3/tts3/conf/conformer.yaml
+2
-2
examples/aishell3/tts3/conf/default.yaml
examples/aishell3/tts3/conf/default.yaml
+2
-2
examples/aishell3_vctk/ernie_sat/conf/default.yaml
examples/aishell3_vctk/ernie_sat/conf/default.yaml
+351
-0
examples/aishell3_vctk/ernie_sat/local/preprocess.sh
examples/aishell3_vctk/ernie_sat/local/preprocess.sh
+67
-0
examples/aishell3_vctk/ernie_sat/local/synthesize.sh
examples/aishell3_vctk/ernie_sat/local/synthesize.sh
+1
-0
examples/aishell3_vctk/ernie_sat/local/train.sh
examples/aishell3_vctk/ernie_sat/local/train.sh
+12
-0
examples/aishell3_vctk/ernie_sat/path.sh
examples/aishell3_vctk/ernie_sat/path.sh
+13
-0
examples/aishell3_vctk/ernie_sat/run.sh
examples/aishell3_vctk/ernie_sat/run.sh
+32
-0
examples/csmsc/tts2/conf/default.yaml
examples/csmsc/tts2/conf/default.yaml
+11
-11
examples/csmsc/voc3/conf/default.yaml
examples/csmsc/voc3/conf/default.yaml
+1
-1
examples/csmsc/voc3/conf/finetune.yaml
examples/csmsc/voc3/conf/finetune.yaml
+1
-1
examples/ernie_sat/local/align.py
examples/ernie_sat/local/align.py
+13
-0
examples/ernie_sat/local/inference.py
examples/ernie_sat/local/inference.py
+14
-6
examples/ernie_sat/local/inference_new.py
examples/ernie_sat/local/inference_new.py
+622
-0
examples/ernie_sat/local/sedit_arg_parser.py
examples/ernie_sat/local/sedit_arg_parser.py
+13
-0
examples/ernie_sat/local/utils.py
examples/ernie_sat/local/utils.py
+13
-0
examples/ernie_sat/run_clone_en_to_zh_new.sh
examples/ernie_sat/run_clone_en_to_zh_new.sh
+27
-0
examples/ernie_sat/run_gen_en_new.sh
examples/ernie_sat/run_gen_en_new.sh
+26
-0
examples/ernie_sat/run_sedit_en_new.sh
examples/ernie_sat/run_sedit_en_new.sh
+27
-0
examples/ernie_sat/test_run_new.sh
examples/ernie_sat/test_run_new.sh
+6
-0
examples/vctk/ernie_sat/conf/default.yaml
examples/vctk/ernie_sat/conf/default.yaml
+162
-0
examples/vctk/ernie_sat/local/preprocess.sh
examples/vctk/ernie_sat/local/preprocess.sh
+61
-0
examples/vctk/ernie_sat/local/synthesize.sh
examples/vctk/ernie_sat/local/synthesize.sh
+1
-0
examples/vctk/ernie_sat/local/train.sh
examples/vctk/ernie_sat/local/train.sh
+12
-0
examples/vctk/ernie_sat/path.sh
examples/vctk/ernie_sat/path.sh
+13
-0
examples/vctk/ernie_sat/run.sh
examples/vctk/ernie_sat/run.sh
+32
-0
examples/vctk/tts3/conf/default.yaml
examples/vctk/tts3/conf/default.yaml
+3
-3
paddlespeech/t2s/datasets/am_batch_fn.py
paddlespeech/t2s/datasets/am_batch_fn.py
+145
-2
paddlespeech/t2s/exps/ernie_sat/__init__.py
paddlespeech/t2s/exps/ernie_sat/__init__.py
+0
-0
paddlespeech/t2s/exps/ernie_sat/normalize.py
paddlespeech/t2s/exps/ernie_sat/normalize.py
+130
-0
paddlespeech/t2s/exps/ernie_sat/preprocess.py
paddlespeech/t2s/exps/ernie_sat/preprocess.py
+341
-0
paddlespeech/t2s/exps/ernie_sat/synthesize.py
paddlespeech/t2s/exps/ernie_sat/synthesize.py
+0
-0
paddlespeech/t2s/exps/ernie_sat/train.py
paddlespeech/t2s/exps/ernie_sat/train.py
+194
-0
paddlespeech/t2s/models/ernie_sat/__init__.py
paddlespeech/t2s/models/ernie_sat/__init__.py
+3
-1
paddlespeech/t2s/models/ernie_sat/ernie_sat.py
paddlespeech/t2s/models/ernie_sat/ernie_sat.py
+670
-0
paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py
paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py
+148
-0
paddlespeech/t2s/models/ernie_sat/mlm.py
paddlespeech/t2s/models/ernie_sat/mlm.py
+14
-36
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+0
-2
paddlespeech/t2s/modules/losses.py
paddlespeech/t2s/modules/losses.py
+20
-13
paddlespeech/t2s/modules/nets_utils.py
paddlespeech/t2s/modules/nets_utils.py
+0
-1
未找到文件。
examples/aishell3/ernie_sat/conf/default.yaml
0 → 100644
浏览文件 @
94688264
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# Sampling rate (Hz).
n_fft
:
2048
# FFT size (samples).
n_shift
:
300
# Hop size (samples). 12.5ms
win_length
:
1200
# Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
# Only used for feats_type != raw
fmin
:
80
# Minimum frequency of Mel basis.
fmax
:
7600
# Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
mean_phn_span
:
8
mlm_prob
:
0.8
###########################################################
# DATA SETTING #
###########################################################
batch_size
:
64
num_workers
:
2
###########################################################
# MODEL SETTING #
###########################################################
model
:
text_masking
:
false
postnet_layers
:
5
postnet_filts
:
5
postnet_chans
:
256
encoder_type
:
conformer
decoder_type
:
conformer
enc_input_layer
:
sega_mlm
enc_pre_speech_layer
:
0
enc_cnn_module_kernel
:
7
enc_attention_dim
:
384
enc_attention_heads
:
2
enc_linear_units
:
1536
enc_num_blocks
:
4
enc_dropout_rate
:
0.2
enc_positional_dropout_rate
:
0.2
enc_attention_dropout_rate
:
0.2
enc_normalize_before
:
true
enc_macaron_style
:
true
enc_use_cnn_module
:
true
enc_selfattention_layer_type
:
legacy_rel_selfattn
enc_activation_type
:
swish
enc_pos_enc_layer_type
:
legacy_rel_pos
enc_positionwise_layer_type
:
conv1d
enc_positionwise_conv_kernel_size
:
3
dec_cnn_module_kernel
:
31
dec_attention_dim
:
384
dec_attention_heads
:
2
dec_linear_units
:
1536
dec_num_blocks
:
4
dec_dropout_rate
:
0.2
dec_positional_dropout_rate
:
0.2
dec_attention_dropout_rate
:
0.2
dec_macaron_style
:
true
dec_use_cnn_module
:
true
dec_selfattention_layer_type
:
legacy_rel_selfattn
dec_activation_type
:
swish
dec_pos_enc_layer_type
:
legacy_rel_pos
dec_positionwise_layer_type
:
conv1d
dec_positionwise_conv_kernel_size
:
3
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch
:
200
num_snapshots
:
5
###########################################################
# OTHER SETTING #
###########################################################
seed
:
10086
token_list
:
-
<blank>
-
<unk>
-
d
-
sp
-
sh
-
ii
-
j
-
zh
-
l
-
x
-
b
-
g
-
uu
-
e5
-
h
-
q
-
m
-
i1
-
t
-
z
-
ch
-
f
-
s
-
u4
-
ix4
-
i4
-
n
-
i3
-
iu3
-
vv
-
ian4
-
ix2
-
r
-
e4
-
ai4
-
k
-
ing2
-
a1
-
en2
-
ui4
-
ong1
-
uo3
-
u2
-
u3
-
ao4
-
ee
-
p
-
an1
-
eng2
-
i2
-
in1
-
c
-
ai2
-
ian2
-
e2
-
an4
-
ing4
-
v4
-
ai3
-
a5
-
ian3
-
eng1
-
ong4
-
ang4
-
ian1
-
ing1
-
iy4
-
ao3
-
ang1
-
uo4
-
u1
-
iao4
-
iu4
-
a4
-
van2
-
ie4
-
ang2
-
ou4
-
iang4
-
ix1
-
er4
-
iy1
-
e1
-
en1
-
ui2
-
an3
-
ei4
-
ong2
-
uo1
-
ou3
-
uo2
-
iao1
-
ou1
-
an2
-
uan4
-
ia4
-
ia1
-
ang3
-
v3
-
iu2
-
iao3
-
in4
-
a3
-
ei3
-
iang3
-
v2
-
eng4
-
en3
-
aa
-
uan1
-
v1
-
ao1
-
ve4
-
ie3
-
ai1
-
ing3
-
iang1
-
a2
-
ui1
-
en4
-
en5
-
in3
-
uan3
-
e3
-
ie1
-
ve2
-
ei2
-
in2
-
ix3
-
uan2
-
iang2
-
ie2
-
ua4
-
ou2
-
uai4
-
er2
-
eng3
-
uang3
-
un1
-
ong3
-
uang4
-
vn4
-
un2
-
iy3
-
iz4
-
ui3
-
iao2
-
iong4
-
un4
-
van4
-
ao2
-
uang1
-
iy5
-
o2
-
ei1
-
ua1
-
iu1
-
uang2
-
er5
-
o1
-
un3
-
vn1
-
vn2
-
o4
-
ve1
-
van3
-
ua2
-
er3
-
iong3
-
van1
-
ia2
-
iy2
-
ia3
-
iong1
-
uo5
-
oo
-
ve3
-
ou5
-
uai3
-
ian5
-
iong2
-
uai2
-
uai1
-
ua3
-
vn3
-
ia5
-
ie5
-
ueng1
-
o5
-
o3
-
iang5
-
ei5
-
<sos/eos>
\ No newline at end of file
examples/aishell3/ernie_sat/local/preprocess.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
stage
=
0
stop_stage
=
100
config_path
=
$1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# get durations from MFA's result
echo
"Generate durations.txt from MFA results ..."
python3
${
MAIN_ROOT
}
/utils/gen_duration_from_textgrid.py
\
--inputdir
=
./aishell3_alignment_tone
\
--output
durations.txt
\
--config
=
${
config_path
}
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# extract features
echo
"Extract features ..."
python3
${
BIN_DIR
}
/preprocess.py
\
--dataset
=
aishell3
\
--rootdir
=
~/datasets/data_aishell3/
\
--dumpdir
=
dump
\
--dur-file
=
durations.txt
\
--config
=
${
config_path
}
\
--num-cpu
=
20
\
--cut-sil
=
True
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# get features' stats(mean and std)
echo
"Get features' stats ..."
python3
${
MAIN_ROOT
}
/utils/compute_statistics.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--field-name
=
"speech"
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo
"Normalize ..."
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--dumpdir
=
dump/train/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/dev/raw/metadata.jsonl
\
--dumpdir
=
dump/dev/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/test/raw/metadata.jsonl
\
--dumpdir
=
dump/test/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
fi
examples/aishell3/ernie_sat/local/synthesize.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
\ No newline at end of file
examples/aishell3/ernie_sat/local/train.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
config_path
=
$1
train_output_path
=
$2
python3
${
BIN_DIR
}
/train.py
\
--train-metadata
=
dump/train/norm/metadata.jsonl
\
--dev-metadata
=
dump/dev/norm/metadata.jsonl
\
--config
=
${
config_path
}
\
--output-dir
=
${
train_output_path
}
\
--ngpu
=
1
\
--phones-dict
=
dump/phone_id_map.txt
\ No newline at end of file
examples/aishell3/ernie_sat/path.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
export
MAIN_ROOT
=
`
realpath
${
PWD
}
/../../../
`
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
LC_ALL
=
C
export
PYTHONDONTWRITEBYTECODE
=
1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
MODEL
=
ernie_sat
export
BIN_DIR
=
${
MAIN_ROOT
}
/paddlespeech/t2s/exps/
${
MODEL
}
\ No newline at end of file
examples/aishell3/ernie_sat/run.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
gpus
=
0,1
stage
=
0
stop_stage
=
100
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments such as `$1`, `$2` ...
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
./local/preprocess.sh
${
conf_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/train.sh
${
conf_path
}
${
train_output_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/synthesize.sh
${
conf_path
}
${
train_output_path
}
${
ckpt_name
}
||
exit
-1
fi
examples/aishell3/tts3/conf/conformer.yaml
浏览文件 @
94688264
...
...
@@ -94,8 +94,8 @@ updater:
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
...
...
examples/aishell3/tts3/conf/default.yaml
浏览文件 @
94688264
...
...
@@ -88,8 +88,8 @@ updater:
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
...
...
examples/aishell3_vctk/ernie_sat/conf/default.yaml
0 → 100644
浏览文件 @
94688264
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# Sampling rate (Hz).
n_fft
:
2048
# FFT size (samples).
n_shift
:
300
# Hop size (samples). 12.5ms
win_length
:
1200
# Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
# Only used for feats_type != raw
fmin
:
80
# Minimum frequency of Mel basis.
fmax
:
7600
# Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
mean_phn_span
:
8
mlm_prob
:
0.8
###########################################################
# DATA SETTING #
###########################################################
batch_size
:
64
num_workers
:
2
###########################################################
# MODEL SETTING #
###########################################################
model
:
text_masking
:
true
postnet_layers
:
5
postnet_filts
:
5
postnet_chans
:
256
encoder_type
:
conformer
decoder_type
:
conformer
enc_input_layer
:
sega_mlm
enc_pre_speech_layer
:
0
enc_cnn_module_kernel
:
7
enc_attention_dim
:
384
enc_attention_heads
:
2
enc_linear_units
:
1536
enc_num_blocks
:
4
enc_dropout_rate
:
0.2
enc_positional_dropout_rate
:
0.2
enc_attention_dropout_rate
:
0.2
enc_normalize_before
:
true
enc_macaron_style
:
true
enc_use_cnn_module
:
true
enc_selfattention_layer_type
:
legacy_rel_selfattn
enc_activation_type
:
swish
enc_pos_enc_layer_type
:
legacy_rel_pos
enc_positionwise_layer_type
:
conv1d
enc_positionwise_conv_kernel_size
:
3
dec_cnn_module_kernel
:
31
dec_attention_dim
:
384
dec_attention_heads
:
2
dec_linear_units
:
1536
dec_num_blocks
:
4
dec_dropout_rate
:
0.2
dec_positional_dropout_rate
:
0.2
dec_attention_dropout_rate
:
0.2
dec_macaron_style
:
true
dec_use_cnn_module
:
true
dec_selfattention_layer_type
:
legacy_rel_selfattn
dec_activation_type
:
swish
dec_pos_enc_layer_type
:
legacy_rel_pos
dec_positionwise_layer_type
:
conv1d
dec_positionwise_conv_kernel_size
:
3
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch
:
100
num_snapshots
:
5
###########################################################
# OTHER SETTING #
###########################################################
seed
:
10086
token_list
:
-
<blank>
-
<unk>
-
AH0
-
T
-
N
-
sp
-
S
-
R
-
D
-
L
-
Z
-
DH
-
IH1
-
K
-
W
-
M
-
EH1
-
AE1
-
ER0
-
B
-
IY1
-
P
-
V
-
IY0
-
F
-
HH
-
AA1
-
AY1
-
AH1
-
EY1
-
IH0
-
AO1
-
OW1
-
UW1
-
G
-
NG
-
SH
-
Y
-
TH
-
ER1
-
JH
-
UH1
-
AW1
-
CH
-
IH2
-
OW0
-
OW2
-
EY2
-
EH2
-
UW0
-
OY1
-
ZH
-
EH0
-
AY2
-
AW2
-
AA2
-
AE2
-
IY2
-
AH2
-
AE0
-
AO2
-
AY0
-
AO0
-
UW2
-
UH2
-
AA0
-
EY0
-
AW0
-
UH0
-
ER2
-
OY2
-
OY0
-
d
-
sh
-
ii
-
j
-
zh
-
l
-
x
-
b
-
g
-
uu
-
e5
-
h
-
q
-
m
-
i1
-
t
-
z
-
ch
-
f
-
s
-
u4
-
ix4
-
i4
-
n
-
i3
-
iu3
-
vv
-
ian4
-
ix2
-
r
-
e4
-
ai4
-
k
-
ing2
-
a1
-
en2
-
ui4
-
ong1
-
uo3
-
u2
-
u3
-
ao4
-
ee
-
p
-
an1
-
eng2
-
i2
-
in1
-
c
-
ai2
-
ian2
-
e2
-
an4
-
ing4
-
v4
-
ai3
-
a5
-
ian3
-
eng1
-
ong4
-
ang4
-
ian1
-
ing1
-
iy4
-
ao3
-
ang1
-
uo4
-
u1
-
iao4
-
iu4
-
a4
-
van2
-
ie4
-
ang2
-
ou4
-
iang4
-
ix1
-
er4
-
iy1
-
e1
-
en1
-
ui2
-
an3
-
ei4
-
ong2
-
uo1
-
ou3
-
uo2
-
iao1
-
ou1
-
an2
-
uan4
-
ia4
-
ia1
-
ang3
-
v3
-
iu2
-
iao3
-
in4
-
a3
-
ei3
-
iang3
-
v2
-
eng4
-
en3
-
aa
-
uan1
-
v1
-
ao1
-
ve4
-
ie3
-
ai1
-
ing3
-
iang1
-
a2
-
ui1
-
en4
-
en5
-
in3
-
uan3
-
e3
-
ie1
-
ve2
-
ei2
-
in2
-
ix3
-
uan2
-
iang2
-
ie2
-
ua4
-
ou2
-
uai4
-
er2
-
eng3
-
uang3
-
un1
-
ong3
-
uang4
-
vn4
-
un2
-
iy3
-
iz4
-
ui3
-
iao2
-
iong4
-
un4
-
van4
-
ao2
-
uang1
-
iy5
-
o2
-
ei1
-
ua1
-
iu1
-
uang2
-
er5
-
o1
-
un3
-
vn1
-
vn2
-
o4
-
ve1
-
van3
-
ua2
-
er3
-
iong3
-
van1
-
ia2
-
iy2
-
ia3
-
iong1
-
uo5
-
oo
-
ve3
-
ou5
-
uai3
-
ian5
-
iong2
-
uai2
-
uai1
-
ua3
-
vn3
-
ia5
-
ie5
-
ueng1
-
o5
-
o3
-
iang5
-
ei5
-
<sos/eos>
examples/aishell3_vctk/ernie_sat/local/preprocess.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
stage
=
0
stop_stage
=
100
config_path
=
$1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# get durations from MFA's result
echo
"Generate durations.txt from MFA results ..."
python3
${
MAIN_ROOT
}
/utils/gen_duration_from_textgrid.py
\
--inputdir
=
./aishell3_alignment_tone
\
--output
durations.txt
\
--config
=
${
config_path
}
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# extract features
echo
"Extract features ..."
python3
${
BIN_DIR
}
/preprocess.py
\
--dataset
=
aishell3
\
--rootdir
=
~/datasets/data_aishell3/
\
--dumpdir
=
dump
\
--dur-file
=
durations.txt
\
--config
=
${
config_path
}
\
--num-cpu
=
20
\
--cut-sil
=
True
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# get features' stats(mean and std)
echo
"Get features' stats ..."
python3
${
MAIN_ROOT
}
/utils/compute_statistics.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--field-name
=
"speech"
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo
"Normalize ..."
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--dumpdir
=
dump/train/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--pitch-stats
=
dump/train/pitch_stats.npy
\
--energy-stats
=
dump/train/energy_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/dev/raw/metadata.jsonl
\
--dumpdir
=
dump/dev/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--pitch-stats
=
dump/train/pitch_stats.npy
\
--energy-stats
=
dump/train/energy_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/test/raw/metadata.jsonl
\
--dumpdir
=
dump/test/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--pitch-stats
=
dump/train/pitch_stats.npy
\
--energy-stats
=
dump/train/energy_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
fi
examples/aishell3_vctk/ernie_sat/local/synthesize.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/local/train.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
config_path
=
$1
train_output_path
=
$2
python3
${
BIN_DIR
}
/train.py
\
--train-metadata
=
dump/train/norm/metadata.jsonl
\
--dev-metadata
=
dump/dev/norm/metadata.jsonl
\
--config
=
${
config_path
}
\
--output-dir
=
${
train_output_path
}
\
--ngpu
=
1
\
--phones-dict
=
dump/phone_id_map.txt
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/path.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
export
MAIN_ROOT
=
`
realpath
${
PWD
}
/../../
`
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
LC_ALL
=
C
export
PYTHONDONTWRITEBYTECODE
=
1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
MODEL
=
ernie_sat
export
BIN_DIR
=
${
MAIN_ROOT
}
/paddlespeech/t2s/exps/
${
MODEL
}
\ No newline at end of file
examples/aishell3_vctk/ernie_sat/run.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
gpus
=
0,1
stage
=
0
stop_stage
=
100
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments such as `$1`, `$2` ...
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
./local/preprocess.sh
${
conf_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/train.sh
${
conf_path
}
${
train_output_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/synthesize.sh
${
conf_path
}
${
train_output_path
}
${
ckpt_name
}
||
exit
-1
fi
examples/csmsc/tts2/conf/default.yaml
浏览文件 @
94688264
...
...
@@ -21,22 +21,22 @@ num_workers: 4
# MODEL SETTING #
###########################################################
model
:
encoder_hidden_size
:
128
encoder_kernel_size
:
3
encoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
duration_predictor_hidden_size
:
128
decoder_hidden_size
:
128
decoder_output_size
:
80
decoder_kernel_size
:
3
decoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
encoder_hidden_size
:
128
encoder_kernel_size
:
3
encoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
duration_predictor_hidden_size
:
128
decoder_hidden_size
:
128
decoder_output_size
:
80
decoder_kernel_size
:
3
decoder_dilations
:
[
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
3
,
9
,
27
,
1
,
1
]
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.002
# learning rate
max_grad_norm
:
1
optim
:
adam
# optimizer type
learning_rate
:
0.002
# learning rate
max_grad_norm
:
1
###########################################################
# TRAINING SETTING #
...
...
examples/csmsc/voc3/conf/default.yaml
浏览文件 @
94688264
...
...
@@ -29,7 +29,7 @@ generator_params:
out_channels
:
4
# Number of output channels.
kernel_size
:
7
# Kernel size of initial and final conv layers.
channels
:
384
# Initial number of channels for conv layers.
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales. prod(upsample_scales) == n_shift
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales. prod(upsample_scales) x out_channels == n_shift
stack_kernel_size
:
3
# Kernel size of dilated conv layers in residual stack.
stacks
:
4
# Number of stacks in a single residual stack module.
use_weight_norm
:
True
# Whether to use weight normalization.
...
...
examples/csmsc/voc3/conf/finetune.yaml
浏览文件 @
94688264
...
...
@@ -29,7 +29,7 @@ generator_params:
out_channels
:
4
# Number of output channels.
kernel_size
:
7
# Kernel size of initial and final conv layers.
channels
:
384
# Initial number of channels for conv layers.
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales. prod(upsample_scales) == n_shift
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales. prod(upsample_scales) x out_channels == n_shift
stack_kernel_size
:
3
# Kernel size of dilated conv layers in residual stack.
stacks
:
4
# Number of stacks in a single residual stack module.
use_weight_norm
:
True
# Whether to use weight normalization.
...
...
examples/ernie_sat/local/align.py
浏览文件 @
94688264
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Usage:
align.py wavfile trsfile outwordfile outphonefile
"""
...
...
examples/ernie_sat/local/inference.py
浏览文件 @
94688264
#!/usr/bin/env python3
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
random
from
typing
import
Dict
...
...
@@ -305,7 +317,6 @@ def get_dur_adj_factor(orig_dur: List[int],
def
prep_feats_with_dur
(
wav_path
:
str
,
mlm_model
:
nn
.
Layer
,
source_lang
:
str
=
"English"
,
target_lang
:
str
=
"English"
,
old_str
:
str
=
""
,
...
...
@@ -425,8 +436,7 @@ def prep_feats_with_dur(wav_path: str,
return
new_wav
,
new_phns
,
new_mfa_start
,
new_mfa_end
,
old_span_bdy
,
new_span_bdy
def
prep_feats
(
mlm_model
:
nn
.
Layer
,
wav_path
:
str
,
def
prep_feats
(
wav_path
:
str
,
source_lang
:
str
=
"english"
,
target_lang
:
str
=
"english"
,
old_str
:
str
=
""
,
...
...
@@ -440,7 +450,6 @@ def prep_feats(mlm_model: nn.Layer,
wav
,
phns
,
mfa_start
,
mfa_end
,
old_span_bdy
,
new_span_bdy
=
prep_feats_with_dur
(
source_lang
=
source_lang
,
target_lang
=
target_lang
,
mlm_model
=
mlm_model
,
old_str
=
old_str
,
new_str
=
new_str
,
wav_path
=
wav_path
,
...
...
@@ -482,7 +491,6 @@ def decode_with_model(mlm_model: nn.Layer,
batch
,
old_span_bdy
,
new_span_bdy
=
prep_feats
(
source_lang
=
source_lang
,
target_lang
=
target_lang
,
mlm_model
=
mlm_model
,
wav_path
=
wav_path
,
old_str
=
old_str
,
new_str
=
new_str
,
...
...
examples/ernie_sat/local/inference_new.py
0 → 100644
浏览文件 @
94688264
此差异已折叠。
点击以展开。
examples/ernie_sat/local/sedit_arg_parser.py
浏览文件 @
94688264
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
...
...
examples/ernie_sat/local/utils.py
浏览文件 @
94688264
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
pathlib
import
Path
from
typing
import
Dict
from
typing
import
List
...
...
examples/ernie_sat/run_clone_en_to_zh_new.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
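# Cross-lingual (en --> zh) speech synthesis.
# Uses Prompt_003_new ("This was not the show for me.") as the prompt speech to synthesize: '今天天气很好.'
# NOTE: new_str must consist of Chinese characters; otherwise preprocessing keeps only the Chinese characters and the speech for that filtered text is synthesized.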
python
local
/inference_new.py
\
--task_name
=
cross-lingual_clone
\
--model_name
=
paddle_checkpoint_dual_mask_enzh
\
--uid
=
Prompt_003_new
\
--new_str
=
'今天天气很好.'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
=
english
\
--target_lang
=
chinese
\
--output_name
=
pred_clone.wav
\
--voc
=
pwgan_aishell3
\
--voc_config
=
download/pwg_aishell3_ckpt_0.5/default.yaml
\
--voc_ckpt
=
download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz
\
--voc_stat
=
download/pwg_aishell3_ckpt_0.5/feats_stats.npy
\
--am
=
fastspeech2_csmsc
\
--am_config
=
download/fastspeech2_conformer_baker_ckpt_0.5/conformer.yaml
\
--am_ckpt
=
download/fastspeech2_conformer_baker_ckpt_0.5/snapshot_iter_76000.pdz
\
--am_stat
=
download/fastspeech2_conformer_baker_ckpt_0.5/speech_stats.npy
\
--phones_dict
=
download/fastspeech2_conformer_baker_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/run_gen_en_new.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
# English-only speech synthesis.
# Example: uses the speech of p299_096 ("This was not the show for me.") as the prompt speech to synthesize: 'I enjoy my life, do you?'
python
local
/inference_new.py
\
--task_name
=
synthesize
\
--model_name
=
paddle_checkpoint_en
\
--uid
=
p299_096
\
--new_str
=
'I enjoy my life, do you?'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
=
english
\
--target_lang
=
english
\
--output_name
=
pred_gen.wav
\
--voc
=
pwgan_aishell3
\
--voc_config
=
download/pwg_aishell3_ckpt_0.5/default.yaml
\
--voc_ckpt
=
download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz
\
--voc_stat
=
download/pwg_aishell3_ckpt_0.5/feats_stats.npy
\
--am
=
fastspeech2_ljspeech
\
--am_config
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml
\
--am_ckpt
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz
\
--am_stat
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy
\
--phones_dict
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/run_sedit_en_new.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
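# English-only speech editing.
# Example: edits the original speech of p243_new ("For that reason cover should not be given.") into the speech for 'for that reason cover is impossible to be given.'
# NOTE: the speech editing task currently supports a replacement or insertion at only one position in the sentence.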
python
local
/inference_new.py
\
--task_name
=
edit
\
--model_name
=
paddle_checkpoint_en
\
--uid
=
p243_new
\
--new_str
=
'for that reason cover is impossible to be given.'
\
--prefix
=
'./prompt/dev/'
\
--source_lang
=
english
\
--target_lang
=
english
\
--output_name
=
pred_edit.wav
\
--voc
=
pwgan_aishell3
\
--voc_config
=
download/pwg_aishell3_ckpt_0.5/default.yaml
\
--voc_ckpt
=
download/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz
\
--voc_stat
=
download/pwg_aishell3_ckpt_0.5/feats_stats.npy
\
--am
=
fastspeech2_ljspeech
\
--am_config
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml
\
--am_ckpt
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz
\
--am_stat
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy
\
--phones_dict
=
download/fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
examples/ernie_sat/test_run_new.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
rm
-rf
*
.wav
./run_sedit_en_new.sh
# speech editing task (English)
./run_gen_en_new.sh
# personalized speech synthesis task (English)
./run_clone_en_to_zh_new.sh
# cross-lingual speech synthesis task (voice cloning from English to Chinese)
\ No newline at end of file
examples/vctk/ernie_sat/conf/default.yaml
0 → 100644
浏览文件 @
94688264
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# sr
n_fft
:
2048
# FFT size (samples).
n_shift
:
300
# Hop size (samples). 12.5ms
win_length
:
1200
# Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
# Only used for feats_type != raw
fmin
:
80
# Minimum frequency of Mel basis.
fmax
:
7600
# Maximum frequency of Mel basis.
n_mels
:
80
# The number of mel basis.
mean_phn_span
:
8
mlm_prob
:
0.8
###########################################################
# DATA SETTING #
###########################################################
batch_size
:
20
num_workers
:
2
###########################################################
# MODEL SETTING #
###########################################################
model
:
text_masking
:
false
postnet_layers
:
5
postnet_filts
:
5
postnet_chans
:
256
encoder_type
:
conformer
decoder_type
:
conformer
enc_input_layer
:
sega_mlm
enc_pre_speech_layer
:
0
enc_cnn_module_kernel
:
7
enc_attention_dim
:
384
enc_attention_heads
:
2
enc_linear_units
:
1536
enc_num_blocks
:
4
enc_dropout_rate
:
0.2
enc_positional_dropout_rate
:
0.2
enc_attention_dropout_rate
:
0.2
enc_normalize_before
:
true
enc_macaron_style
:
true
enc_use_cnn_module
:
true
enc_selfattention_layer_type
:
legacy_rel_selfattn
enc_activation_type
:
swish
enc_pos_enc_layer_type
:
legacy_rel_pos
enc_positionwise_layer_type
:
conv1d
enc_positionwise_conv_kernel_size
:
3
dec_cnn_module_kernel
:
31
dec_attention_dim
:
384
dec_attention_heads
:
2
dec_linear_units
:
1536
dec_num_blocks
:
4
dec_dropout_rate
:
0.2
dec_positional_dropout_rate
:
0.2
dec_attention_dropout_rate
:
0.2
dec_macaron_style
:
true
dec_use_cnn_module
:
true
dec_selfattention_layer_type
:
legacy_rel_selfattn
dec_activation_type
:
swish
dec_pos_enc_layer_type
:
legacy_rel_pos
dec_positionwise_layer_type
:
conv1d
dec_positionwise_conv_kernel_size
:
3
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch
:
200
num_snapshots
:
5
###########################################################
# OTHER SETTING #
###########################################################
seed
:
10086
token_list
:
-
<blank>
-
<unk>
-
AH0
-
T
-
N
-
sp
-
D
-
S
-
R
-
L
-
IH1
-
DH
-
AE1
-
M
-
EH1
-
K
-
Z
-
W
-
HH
-
ER0
-
AH1
-
IY1
-
P
-
V
-
F
-
B
-
AY1
-
IY0
-
EY1
-
AA1
-
AO1
-
UW1
-
IH0
-
OW1
-
NG
-
G
-
SH
-
ER1
-
Y
-
TH
-
AW1
-
CH
-
UH1
-
IH2
-
JH
-
OW0
-
EH2
-
OY1
-
AY2
-
EH0
-
EY2
-
UW0
-
AE2
-
AA2
-
OW2
-
AH2
-
ZH
-
AO2
-
IY2
-
AE0
-
UW2
-
AY0
-
AA0
-
AO0
-
AW2
-
EY0
-
UH2
-
ER2
-
OY2
-
UH0
-
AW0
-
OY0
-
<sos/eos>
\ No newline at end of file
examples/vctk/ernie_sat/local/preprocess.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
stage
=
0
stop_stage
=
100
config_path
=
$1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# get durations from MFA's result
echo
"Generate durations.txt from MFA results ..."
python3
${
MAIN_ROOT
}
/utils/gen_duration_from_textgrid.py
\
--inputdir
=
./vctk_alignment
\
--output
durations.txt
\
--config
=
${
config_path
}
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# extract features
echo
"Extract features ..."
python3
${
BIN_DIR
}
/preprocess.py
\
--dataset
=
vctk
\
--rootdir
=
~/datasets/VCTK-Corpus-0.92/
\
--dumpdir
=
dump
\
--dur-file
=
durations.txt
\
--config
=
${
config_path
}
\
--num-cpu
=
20
\
--cut-sil
=
True
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# get features' stats(mean and std)
echo
"Get features' stats ..."
python3
${
MAIN_ROOT
}
/utils/compute_statistics.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--field-name
=
"speech"
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo
"Normalize ..."
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--dumpdir
=
dump/train/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/dev/raw/metadata.jsonl
\
--dumpdir
=
dump/dev/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/test/raw/metadata.jsonl
\
--dumpdir
=
dump/test/norm
\
--speech-stats
=
dump/train/speech_stats.npy
\
--phones-dict
=
dump/phone_id_map.txt
\
--speaker-dict
=
dump/speaker_id_map.txt
fi
examples/vctk/ernie_sat/local/synthesize.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
\ No newline at end of file
examples/vctk/ernie_sat/local/train.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
config_path
=
$1
train_output_path
=
$2
python3
${
BIN_DIR
}
/train.py
\
--train-metadata
=
dump/train/norm/metadata.jsonl
\
--dev-metadata
=
dump/dev/norm/metadata.jsonl
\
--config
=
${
config_path
}
\
--output-dir
=
${
train_output_path
}
\
--ngpu
=
1
\
--phones-dict
=
dump/phone_id_map.txt
\ No newline at end of file
examples/vctk/ernie_sat/path.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
export
MAIN_ROOT
=
`
realpath
${
PWD
}
/../../../
`
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
LC_ALL
=
C
export
PYTHONDONTWRITEBYTECODE
=
1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
MODEL
=
ernie_sat
export
BIN_DIR
=
${
MAIN_ROOT
}
/paddlespeech/t2s/exps/
${
MODEL
}
\ No newline at end of file
examples/vctk/ernie_sat/run.sh
0 → 100755
浏览文件 @
94688264
#!/bin/bash
set
-e
source
path.sh
gpus
=
0,1
stage
=
0
stop_stage
=
100
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments such as `$1`, `$2` ...
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
./local/preprocess.sh
${
conf_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/train.sh
${
conf_path
}
${
train_output_path
}
||
exit
-1
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/synthesize.sh
${
conf_path
}
${
train_output_path
}
${
ckpt_name
}
||
exit
-1
fi
examples/vctk/tts3/conf/default.yaml
浏览文件 @
94688264
...
...
@@ -24,7 +24,7 @@ f0max: 400 # Maximum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size
:
64
num_workers
:
4
num_workers
:
2
###########################################################
...
...
@@ -88,8 +88,8 @@ updater:
# OPTIMIZER SETTING #
###########################################################
optimizer
:
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
optim
:
adam
# optimizer type
learning_rate
:
0.001
# learning rate
###########################################################
# TRAINING SETTING #
...
...
paddlespeech/t2s/datasets/am_batch_fn.py
浏览文件 @
94688264
...
...
@@ -28,6 +28,149 @@ from paddlespeech.t2s.modules.nets_utils import phones_masking
from
paddlespeech.t2s.modules.nets_utils
import
phones_text_masking
# A separate builder is needed because the collate function takes parameters.
def build_erniesat_collate_fn(
        mlm_prob: float=0.8,
        mean_phn_span: int=8,
        seg_emb: bool=False,
        text_masking: bool=False,
        epoch: int=-1, ):
    """Create an ``ErnieSATCollateFn`` with an epoch-dependent mask probability.

    Args:
        mlm_prob: Base masking probability.
        mean_phn_span: Forwarded unchanged to ``ErnieSATCollateFn``.
        seg_emb: Forwarded unchanged to ``ErnieSATCollateFn``.
        text_masking: Forwarded unchanged to ``ErnieSATCollateFn``.
        epoch: When left at ``-1`` the base probability is used as is;
            for any other value it is multiplied by 0.8.

    Returns:
        The configured ``ErnieSATCollateFn`` instance.
    """
    # Only the sentinel -1 keeps the probability untouched.
    scale = 1 if epoch == -1 else 0.8
    return ErnieSATCollateFn(
        mlm_prob=mlm_prob * scale,
        mean_phn_span=mean_phn_span,
        seg_emb=seg_emb,
        text_masking=text_masking)
class ErnieSATCollateFn:
    """Callable wrapper that binds masking settings to ``erniesat_batch_fn``.

    An instance stores the four settings given at construction time and,
    when called with a list of examples, forwards them together with the
    examples to ``erniesat_batch_fn``.
    """

    def __init__(self,
                 mlm_prob: float=0.8,
                 mean_phn_span: int=8,
                 seg_emb: bool=False,
                 text_masking: bool=False):
        # Settings are kept verbatim; no validation is done here.
        self.mlm_prob, self.mean_phn_span = mlm_prob, mean_phn_span
        self.seg_emb, self.text_masking = seg_emb, text_masking

    def __call__(self, exmaples):
        # Collect the stored settings once, then delegate to the batch function.
        settings = dict(
            mlm_prob=self.mlm_prob,
            mean_phn_span=self.mean_phn_span,
            seg_emb=self.seg_emb,
            text_masking=self.text_masking)
        return erniesat_batch_fn(exmaples, **settings)
def erniesat_batch_fn(examples,
                      mlm_prob: float=0.8,
                      mean_phn_span: int=8,
                      seg_emb: bool=False,
                      text_masking: bool=False):
    """Collate a list of examples into one ERNIE-SAT training batch.

    Every example must provide the keys ``text``, ``speech``,
    ``text_lengths``, ``speech_lengths``, ``align_start`` and ``align_end``.

    Args:
        examples: Iterable of per-utterance mappings (see keys above).
        mlm_prob: Passed to the masking function as ``mlm_prob``.
        mean_phn_span: Passed to the masking function as ``mean_phn_span``.
        seg_emb: Passed to ``get_seg_pos``.
        text_masking: If true, ``phones_text_masking`` is used and produces
            both speech and text mask positions; otherwise ``phones_masking``
            is used and the text mask positions are all zeros.

    Returns:
        A dict with the keys ``text``, ``speech``, ``masked_pos``,
        ``speech_mask``, ``text_mask``, ``speech_seg_pos``, ``text_seg_pos``
        and ``text_masked_pos``.
    """

    def _column(key, dtype):
        # One numpy array per example for the requested field.
        return [np.array(ex[key], dtype=dtype) for ex in examples]

    text = _column("text", np.int64)
    speech = _column("speech", np.float32)
    text_lengths = _column("text_lengths", np.int64)
    speech_lengths = _column("speech_lengths", np.int64)
    align_start = _column("align_start", np.int64)
    align_end = _column("align_end", np.int64)
    align_start_lengths = [
        np.array(len(ex["align_start"]), dtype=np.int64) for ex in examples
    ]

    # Pad the variable-length fields; the alignment arrays stay as returned
    # by batch_sequences (they are not converted to tensors here).
    align_start = batch_sequences(align_start)
    align_end = batch_sequences(align_end)

    # Padded text/speech and the length vectors become paddle tensors.
    text = paddle.to_tensor(batch_sequences(text))
    speech = paddle.to_tensor(batch_sequences(speech))
    text_lengths = paddle.to_tensor(text_lengths)
    speech_lengths = paddle.to_tensor(speech_lengths)
    align_start_lengths = paddle.to_tensor(align_start_lengths)

    text_mask = make_non_pad_mask(
        text_lengths, text, length_dim=1).unsqueeze(-2)
    speech_mask = make_non_pad_mask(
        speech_lengths, speech[:, :, 0], length_dim=1).unsqueeze(-2)

    # Arguments shared by both masking functions; no span boundary is given.
    masking_kwargs = dict(
        xs_pad=speech,
        src_mask=speech_mask,
        align_start=align_start,
        align_end=align_end,
        align_start_lens=align_start_lengths,
        mlm_prob=mlm_prob,
        mean_phn_span=mean_phn_span,
        span_bdy=None)

    if text_masking:
        # dual mask: for mixed Chinese/English data both speech and text are
        # masked; ERNIE-SAT masks both for its cross-lingual setup.
        masked_pos, text_masked_pos = phones_text_masking(
            text_pad=text, text_mask=text_mask, **masking_kwargs)
    else:
        # Pure-Chinese / pure-English training: A3T masks only the speech,
        # not the phonemes. The masking step is the main difference between
        # A3T and ERNIE-SAT.
        masked_pos = phones_masking(**masking_kwargs)
        text_masked_pos = paddle.zeros(paddle.shape(text))

    speech_seg_pos, text_seg_pos = get_seg_pos(
        speech_pad=speech,
        text_pad=text,
        align_start=align_start,
        align_end=align_end,
        align_start_lens=align_start_lengths,
        seg_emb=seg_emb)

    return {
        "text": text,
        "speech": speech,
        # need to generate
        "masked_pos": masked_pos,
        "speech_mask": speech_mask,
        "text_mask": text_mask,
        "speech_seg_pos": speech_seg_pos,
        "text_seg_pos": text_seg_pos,
        "text_masked_pos": text_masked_pos
    }
def
tacotron2_single_spk_batch_fn
(
examples
):
# fields = ["text", "text_lengths", "speech", "speech_lengths"]
text
=
[
np
.
array
(
item
[
"text"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
...
...
@@ -378,7 +521,6 @@ class MLMCollateFn:
mean_phn_span
=
self
.
mean_phn_span
,
seg_emb
=
self
.
seg_emb
,
text_masking
=
self
.
text_masking
,
attention_window
=
self
.
attention_window
,
not_sequence
=
self
.
not_sequence
)
...
...
@@ -389,7 +531,6 @@ def mlm_collate_fn(
mean_phn_span
:
int
=
8
,
seg_emb
:
bool
=
False
,
text_masking
:
bool
=
False
,
attention_window
:
int
=
0
,
pad_value
:
int
=
0
,
not_sequence
:
Collection
[
str
]
=
(),
)
->
Tuple
[
List
[
str
],
Dict
[
str
,
paddle
.
Tensor
]]:
...
...
@@ -420,6 +561,7 @@ def mlm_collate_fn(
feats
=
feats_extract
.
get_log_mel_fbank
(
np
.
array
(
output
[
"speech"
][
0
]))
feats
=
paddle
.
to_tensor
(
feats
)
print
(
"feats.shape:"
,
feats
.
shape
)
feats_lens
=
paddle
.
shape
(
feats
)[
0
]
feats
=
paddle
.
unsqueeze
(
feats
,
0
)
...
...
@@ -439,6 +581,7 @@ def mlm_collate_fn(
text_lens
,
text_pad
,
length_dim
=
1
).
unsqueeze
(
-
2
)
speech_mask
=
make_non_pad_mask
(
feats_lens
,
speech_pad
[:,
:,
0
],
length_dim
=
1
).
unsqueeze
(
-
2
)
span_bdy
=
None
if
'span_bdy'
in
output
.
keys
():
span_bdy
=
output
[
'span_bdy'
]
...
...
paddlespeech/t2s/exps/ernie_sat/__init__.py
0 → 100644
浏览文件 @
94688264
paddlespeech/t2s/exps/ernie_sat/normalize.py
0 → 100644
浏览文件 @
94688264
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import
argparse
import
logging
from
operator
import
itemgetter
from
pathlib
import
Path
import
jsonlines
import
numpy
as
np
from
sklearn.preprocessing
import
StandardScaler
from
tqdm
import
tqdm
from
paddlespeech.t2s.datasets.data_table
import
DataTable
def main():
    """Normalize dumped speech features and map phones/speakers to ids.

    Reads the raw ``metadata.jsonl`` given by ``--metadata``, standardizes
    every ``speech`` feature with the mean/scale stored in ``--speech-stats``,
    saves the result as float32 ``.npy`` files under
    ``<dumpdir>/data_speech`` and writes a new ``metadata.jsonl`` into
    ``<dumpdir>`` whose ``text`` / ``spk_id`` fields hold integer ids.

    Exits through ``parser.error`` when ``--phones-dict`` or
    ``--speaker-dict`` is missing, since both id maps are needed for every
    record.
    """
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="directory including feature files to be normalized. "
        "you need to specify either *-scp or rootdir.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--speech-stats",
        type=str,
        required=True,
        help="speech statistics file.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")
    args = parser.parse_args()

    # Both maps are opened unconditionally below; fail with a clear message
    # instead of a TypeError from open(None).
    if args.phones_dict is None:
        parser.error("--phones-dict is required")
    if args.speaker_dict is None:
        parser.error("--speaker-dict is required")

    # use absolute path
    dumpdir = Path(args.dumpdir).expanduser().resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata, converters={
            "speech": np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler; row 0 of the stats file is the mean, row 1 the scale.
    # The file is read once instead of once per row.
    speech_stats = np.load(args.speech_stats)
    speech_scaler = StandardScaler()
    speech_scaler.mean_ = speech_stats[0]
    speech_scaler.scale_ = speech_stats[1]
    speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]

    vocab_phones = _load_id_map(args.phones_dict)
    vocab_speaker = _load_id_map(args.speaker_dict)

    # The output directory is the same for every utterance; create it once.
    speech_dir = dumpdir / "data_speech"
    speech_dir.mkdir(parents=True, exist_ok=True)

    # process each file
    output_metadata = []
    for item in tqdm(dataset):
        utt_id = item['utt_id']
        # normalize
        speech = speech_scaler.transform(item['speech'])
        speech_path = speech_dir / f"{utt_id}_speech.npy"
        np.save(speech_path, speech.astype(np.float32), allow_pickle=False)

        record = {
            "utt_id": utt_id,
            "spk_id": vocab_speaker[item["speaker"]],
            "text": [vocab_phones[p] for p in item['phones']],
            "text_lengths": item['text_lengths'],
            "speech_lengths": item['speech_lengths'],
            "durations": item['durations'],
            "speech": str(speech_path),
            "align_start": item['align_start'],
            "align_end": item['align_end'],
        }
        # add spk_emb for voice cloning
        if "spk_emb" in item:
            record["spk_emb"] = str(item["spk_emb"])

        output_metadata.append(record)
    output_metadata.sort(key=itemgetter('utt_id'))

    # Write next to the features: use the expanded/resolved dumpdir so a
    # "~" in --dumpdir does not send the metadata to a different location.
    output_metadata_path = dumpdir / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


def _load_id_map(path):
    """Read a whitespace-separated ``<symbol> <id>`` file into a dict."""
    id_map = {}
    with open(path, 'rt') as f:
        for line in f:
            symbol, index = line.strip().split()
            id_map[symbol] = int(index)
    return id_map
if
__name__
==
"__main__"
:
main
()
paddlespeech/t2s/exps/ernie_sat/preprocess.py
0 → 100644
浏览文件 @
94688264
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
from
concurrent.futures
import
ThreadPoolExecutor
from
operator
import
itemgetter
from
pathlib
import
Path
from
typing
import
Any
from
typing
import
Dict
from
typing
import
List
import
jsonlines
import
librosa
import
numpy
as
np
import
tqdm
import
yaml
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.datasets.get_feats
import
LogMelFBank
from
paddlespeech.t2s.datasets.preprocess_utils
import
compare_duration_and_mel_length
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_input_token
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_phn_dur
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_spk_id_map
from
paddlespeech.t2s.datasets.preprocess_utils
import
merge_silence
from
paddlespeech.t2s.utils
import
str2bool
def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     sentences: Dict,
                     output_dir: Path,
                     mel_extractor=None,
                     cut_sil: bool=True,
                     spk_emb_dir: Path=None):
    """Extract the log-mel feature and alignment record for one audio file.

    Args:
        config: Configuration object; ``fs`` and ``n_shift`` are read as
            attributes.
        fp: Path of the audio file; its stem (minus a trailing ``_mic2``)
            is the utterance id.
        sentences: Mapping ``utt_id -> [phones, durations, speaker, ...]``.
            It is updated in place when silence is cut, and
            ``compare_duration_and_mel_length`` may modify or drop the entry.
        output_dir: Directory in which ``data_speech/<utt_id>_speech.npy``
            is written.
        mel_extractor: Object providing ``get_log_mel_fbank(wav)``.
        cut_sil: Whether to trim a leading/trailing ``sil`` phone.
        spk_emb_dir: Optional directory of per-speaker embedding folders.

    Returns:
        The metadata record (dict), or ``None`` when the utterance has no
        alignment, is not mono, was dropped during duration matching, or a
        speaker folder exists but the utterance's embedding file does not.
    """
    stem = fp.stem
    # for vctk: file names carry a "_mic2" suffix
    utt_id = stem[:-5] if stem.endswith("_mic2") else stem
    if utt_id not in sentences:
        return None

    # reading, resampling may occur
    wav, _ = librosa.load(str(fp), sr=config.fs)
    if wav.ndim != 1:
        return None
    peak = np.abs(wav).max()
    if peak > 1.0:
        wav = wav / peak
    assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
    assert np.abs(wav).max(
    ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

    entry = sentences[utt_id]
    phones, durations, speaker = entry[0], entry[1], entry[2]
    frame_bounds = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
    # little imprecise than use *.TextGrid directly
    times = librosa.frames_to_time(
        frame_bounds, sr=config.fs, hop_length=config.n_shift)

    if cut_sil:
        # NOTE(review): the default `end` is a frame count, yet it is passed
        # to time_to_samples as if it were seconds. The resulting index looks
        # like it overshoots the waveform so the slice keeps the tail --
        # confirm this is intended before changing it.
        start, end = 0, frame_bounds[-1]
        if phones[0] == "sil" and len(durations) > 1:
            start = times[1]
            durations, phones = durations[1:], phones[1:]
        if phones[-1] == 'sil' and len(durations) > 1:
            end = times[-2]
            durations, phones = durations[:-1], phones[:-1]
        sentences[utt_id][0] = phones
        sentences[utt_id][1] = durations
        start, end = librosa.time_to_samples([start, end], sr=config.fs)
        wav = wav[start:end]

    # extract mel feats
    logmel = mel_extractor.get_log_mel_fbank(wav)
    # change duration according to mel_length
    compare_duration_and_mel_length(sentences, utt_id, logmel)
    # utt_id may be popped in compare_duration_and_mel_length
    if utt_id not in sentences:
        return None
    phones = sentences[utt_id][0]
    durations = sentences[utt_id][1]
    num_frames = logmel.shape[0]
    assert sum(durations) == num_frames

    # Phone i spans frames [align_start[i], align_end[i]).
    new_bounds = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
    align_start, align_end = new_bounds[:-1], new_bounds[1:]
    assert len(align_start) == len(align_end) == len(durations)

    mel_dir = output_dir / "data_speech"
    mel_dir.mkdir(parents=True, exist_ok=True)
    mel_path = mel_dir / (utt_id + "_speech.npy")
    np.save(mel_path, logmel)

    # align_start_lengths == text_lengths
    record = {
        "utt_id": utt_id,
        "phones": phones,
        "text_lengths": len(phones),
        "speech_lengths": num_frames,
        "durations": durations,
        "speech": str(mel_path),
        "speaker": speaker,
        "align_start": align_start.tolist(),
        "align_end": align_end.tolist(),
    }

    if spk_emb_dir and speaker in os.listdir(spk_emb_dir):
        embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
        # A known speaker without an embedding for this utterance is skipped.
        if not embed_path.is_file():
            return None
        record["spk_emb"] = str(embed_path)
    return record
def process_sentences(config,
                      fps: List[Path],
                      sentences: Dict,
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1,
                      cut_sil: bool=True,
                      spk_emb_dir: Path=None):
    """Run ``process_sentence`` over many files and write ``metadata.jsonl``.

    Args:
        config, sentences, output_dir, mel_extractor, cut_sil, spk_emb_dir:
            Forwarded to ``process_sentence`` for every file.
        fps: Audio files to process.
        nprocs: ``1`` processes the files sequentially; any other value uses
            a ``ThreadPoolExecutor`` with that many workers.

    Falsy results of ``process_sentence`` are dropped; the remaining records
    are sorted by ``utt_id`` and written to ``output_dir / "metadata.jsonl"``.
    """
    if nprocs == 1:
        produced = (process_sentence(
            config=config,
            fp=path,
            sentences=sentences,
            output_dir=output_dir,
            mel_extractor=mel_extractor,
            cut_sil=cut_sil,
            spk_emb_dir=spk_emb_dir) for path in tqdm.tqdm(
                fps, total=len(fps)))
        results = [rec for rec in produced if rec]
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            with tqdm.tqdm(total=len(fps)) as progress:
                pending = []
                for path in fps:
                    job = pool.submit(process_sentence, config, path,
                                      sentences, output_dir, mel_extractor,
                                      cut_sil, spk_emb_dir)
                    # Advance the progress bar as each job completes.
                    job.add_done_callback(lambda p: progress.update())
                    pending.append(job)

                # Collect in submission order; result() waits for each job.
                finished = (job.result() for job in pending)
                results = [rec for rec in finished if rec]

    results.sort(key=itemgetter("utt_id"))
    with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
        for rec in results:
            writer.write(rec)
    print("Done")
def main():
    """Extract log-mel features and alignment metadata for a dataset.

    Parses the command line, loads the YAML config and the duration file,
    writes ``phone_id_map.txt`` / ``speaker_id_map.txt`` into the dump
    directory, splits the audio files into train/dev/test and runs
    ``process_sentences`` on every non-empty split, writing to
    ``<dumpdir>/{train,dev,test}/raw``.

    Invalid or missing command-line values are reported through
    ``parser.error`` (exit status 2) before any file is written.
    """
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")

    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory to dataset.")

    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")

    parser.add_argument("--config", type=str, help="fastspeech2 config file.")

    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of process.")

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether cut sil in the edge of audio")

    parser.add_argument(
        "--spk_emb_dir",
        default=None,
        type=str,
        help="directory to speaker embedding files.")
    args = parser.parse_args()

    # Validate up front: an unknown dataset used to fall through to a
    # NameError, and missing paths crashed with a TypeError in Path()/open().
    if args.dataset not in ("baker", "aishell3", "ljspeech", "vctk"):
        parser.error("dataset should in {baker, aishell3, ljspeech, vctk} now!")
    for flag, value in (("--rootdir", args.rootdir),
                        ("--dur-file", args.dur_file),
                        ("--config", args.config)):
        if value is None:
            parser.error(f"{flag} is required")

    rootdir = Path(args.rootdir).expanduser()
    dur_file = Path(args.dur_file).expanduser()
    # Explicit checks instead of assert, which is stripped under -O.
    if not rootdir.is_dir():
        parser.error(f"--rootdir is not a directory: {rootdir}")
    if not dur_file.is_file():
        parser.error(f"--dur-file is not a file: {dur_file}")

    # use absolute path
    dumpdir = Path(args.dumpdir).expanduser().resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    if args.spk_emb_dir:
        spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
    else:
        spk_emb_dir = None

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
    phone_id_map_path = dumpdir / "phone_id_map.txt"
    speaker_id_map_path = dumpdir / "speaker_id_map.txt"
    get_input_token(sentences, phone_id_map_path, args.dataset)
    get_spk_id_map(speaker_set, speaker_id_map_path)

    # split data into 3 sections
    if args.dataset == "baker":
        splits = _split_by_count(
            sorted((rootdir / "Wave").rglob("*.wav")),
            num_train=9800,
            num_dev=100)
    elif args.dataset == "aishell3":
        splits = _split_per_speaker(rootdir / "train" / "wav", "*.wav")
    elif args.dataset == "ljspeech":
        splits = _split_by_count(
            sorted((rootdir / "wavs").rglob("*.wav")),
            num_train=12900,
            num_dev=100)
    else:  # "vctk" (dataset name was validated above)
        splits = _split_per_speaker(rootdir / "wav48_silence_trimmed",
                                    "*_mic2.flac")
    train_wav_files, dev_wav_files, test_wav_files = splits

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    # Extractor
    mel_extractor = LogMelFBank(
        sr=config.fs,
        n_fft=config.n_fft,
        hop_length=config.n_shift,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    # process for the 3 sections; every split now honours --num-cpu
    # (the dev split previously always ran with the default of one worker).
    for wav_files, dump_dir in ((train_wav_files, train_dump_dir),
                                (dev_wav_files, dev_dump_dir),
                                (test_wav_files, test_dump_dir)):
        if wav_files:
            process_sentences(
                config=config,
                fps=wav_files,
                sentences=sentences,
                output_dir=dump_dir,
                mel_extractor=mel_extractor,
                nprocs=args.num_cpu,
                cut_sil=args.cut_sil,
                spk_emb_dir=spk_emb_dir)


def _split_by_count(wav_files, num_train, num_dev):
    """Split a sorted file list into (train, dev, test) by fixed counts."""
    return (wav_files[:num_train], wav_files[num_train:num_train + num_dev],
            wav_files[num_train + num_dev:])


def _split_per_speaker(wav_dir, pattern, sub_num_dev=5, min_files=100):
    """Split per speaker folder: the last files of large speakers go to dev/test.

    For a speaker with more than ``min_files`` files, the final
    ``sub_num_dev`` files go to test and the ``sub_num_dev`` before them to
    dev; speakers with fewer files contribute to train only.
    """
    train_files, dev_files, test_files = [], [], []
    for speaker in os.listdir(wav_dir):
        wav_files = sorted((wav_dir / speaker).rglob(pattern))
        if len(wav_files) > min_files:
            train_files += wav_files[:-sub_num_dev * 2]
            dev_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
            test_files += wav_files[-sub_num_dev:]
        else:
            train_files += wav_files
    return train_files, dev_files, test_files
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
paddlespeech/t2s/exps/ernie_sat/synthesize.py
0 → 100644
浏览文件 @
94688264
paddlespeech/t2s/exps/ernie_sat/train.py
0 → 100644
浏览文件 @
94688264
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
logging
import
os
import
shutil
from
pathlib
import
Path
import
jsonlines
import
numpy
as
np
import
paddle
import
yaml
from
paddle
import
DataParallel
from
paddle
import
distributed
as
dist
from
paddle.io
import
DataLoader
from
paddle.io
import
DistributedBatchSampler
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.datasets.am_batch_fn
import
build_erniesat_collate_fn
from
paddlespeech.t2s.datasets.data_table
import
DataTable
from
paddlespeech.t2s.models.ernie_sat
import
ErnieSAT
from
paddlespeech.t2s.models.ernie_sat
import
ErnieSATEvaluator
from
paddlespeech.t2s.models.ernie_sat
import
ErnieSATUpdater
from
paddlespeech.t2s.training.extensions.snapshot
import
Snapshot
from
paddlespeech.t2s.training.extensions.visualizer
import
VisualDL
from
paddlespeech.t2s.training.optimizer
import
build_optimizers
from
paddlespeech.t2s.training.seeding
import
seed_everything
from
paddlespeech.t2s.training.trainer
import
Trainer
def train_sp(args, config):
    """Build the data pipeline, model, optimizer and trainer, then train.

    This is the per-process training entry; `main()` either calls it directly
    or hands it to `dist.spawn` when more than one GPU is requested.

    Args:
        args: Parsed command line namespace. The attributes read here are
            `ngpu`, `train_metadata`, `dev_metadata`, `phones_dict`,
            `output_dir` and `config` (path of the config file).
        config: Config node. The entries read here are `seed`, `mlm_prob`,
            `mean_phn_span`, `model` (including `enc_input_layer` and
            `text_masking`), `max_epoch`, `batch_size`, `num_workers`,
            `n_mels`, `optimizer` and `num_snapshots`.
    """
    # decides device type and whether to run in parallel
    # setup running environment correctly
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
    # `world_size` is read again further down (DataParallel wrapping), so it
    # is computed on every path; otherwise the CPU path would hit a NameError.
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    fields = [
        "text", "text_lengths", "speech", "speech_lengths", "align_start",
        "align_end"
    ]
    # the "speech" field is passed through np.load by the DataTable
    converters = {"speech": np.load}

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=fields,
        converters=converters, )

    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=fields,
        converters=converters, )

    # collate function and dataloader
    collate_fn = build_erniesat_collate_fn(
        mlm_prob=config.mlm_prob,
        mean_phn_span=config.mean_phn_span,
        # segment embedding is enabled only for the 'sega_mlm' input layer
        seg_emb=config.model['enc_input_layer'] == 'sega_mlm',
        text_masking=config["model"]["text_masking"],
        epoch=config["max_epoch"])

    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)

    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        shuffle=False,
        drop_last=False,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    # the vocabulary size is the number of lines in the phone dictionary
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    odim = config.n_mels
    model = ErnieSAT(idim=vocab_size, odim=odim, **config["model"])

    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    optimizer = build_optimizers(model, **config["optimizer"])
    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        # Path.name gives the file name independent of the path separator
        config_name = Path(args.config).name
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = ErnieSATUpdater(
        model=model,
        optimizer=optimizer,
        dataloader=train_dataloader,
        text_masking=config["model"]["text_masking"],
        odim=odim,
        output_dir=output_dir)

    trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)

    evaluator = ErnieSATEvaluator(
        model=model,
        dataloader=dev_dataloader,
        text_masking=config["model"]["text_masking"],
        odim=odim,
        output_dir=output_dir, )

    # evaluation and visualization are attached on rank 0 only
    if dist.get_rank() == 0:
        trainer.extend(evaluator, trigger=(1, "epoch"))
        trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
    # NOTE(review): the snapshot extension is registered on every rank here;
    # the scraped source lost its indentation - confirm against upstream.
    trainer.extend(
        Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
    trainer.run()
def main():
    """Command line entry: read arguments and config, then start training.

    The YAML file given by `--config` is loaded into a `CfgNode`. Arguments
    and config are echoed to stdout, and `train_sp` is run either in this
    process or through `dist.spawn` when `--ngpu` is greater than 1.
    """
    # parse args and config and redirect to train_sp
    cli_options = (
        ("--config", dict(type=str, help="ErnieSAT config file.")),
        ("--train-metadata", dict(type=str, help="training data.")),
        ("--dev-metadata", dict(type=str, help="dev data.")),
        ("--output-dir", dict(type=str, help="output dir.")),
        ("--ngpu", dict(type=int, default=1, help="if ngpu=0, use cpu.")),
        ("--phones-dict",
         dict(type=str, default=None, help="phone vocabulary file.")),
    )
    parser = argparse.ArgumentParser(description="Train an ErnieSAT model.")
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    with open(args.config) as f:
        config = CfgNode(yaml.safe_load(f))

    # echo the effective settings before training starts
    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.ngpu <= 1:
        # single process: train in place
        train_sp(args, config)
        return
    # one spawned process per requested GPU
    dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
paddlespeech/t2s/models/ernie_sat/__init__.py
浏览文件 @
94688264
# Copyright (c) 202
0
PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -11,4 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.ernie_sat
import
*
from
.ernie_sat_updater
import
*
from
.mlm
import
*
paddlespeech/t2s/models/ernie_sat/ernie_sat.py
0 → 100644
浏览文件 @
94688264
此差异已折叠。
点击以展开。
paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py
0 → 100644
浏览文件 @
94688264
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
from
pathlib
import
Path
from
paddle
import
distributed
as
dist
from
paddle.io
import
DataLoader
from
paddle.nn
import
Layer
from
paddle.optimizer
import
Optimizer
from
paddlespeech.t2s.modules.losses
import
MLMLoss
from
paddlespeech.t2s.training.extensions.evaluator
import
StandardEvaluator
from
paddlespeech.t2s.training.reporter
import
report
from
paddlespeech.t2s.training.updaters.standard_updater
import
StandardUpdater
# Set the record format and timestamp format for the root logger.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
# Module-level logger; the updater and evaluator classes in this module
# attach their file handlers to it.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class ErnieSATUpdater(StandardUpdater):
    """Updater that performs one ErnieSAT optimization step per batch.

    The loss is computed with `MLMLoss`; when the criterion returns a text
    loss as well, it is added to the speech loss before back-propagation.
    """

    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
                 dataloader: DataLoader,
                 init_state=None,
                 text_masking: bool=False,
                 odim: int=80,
                 output_dir: Path=None):
        """Set up the criterion and a per-rank log file.

        Args:
            model: Network to train.
            optimizer: Optimizer stepping the model parameters.
            dataloader: Source of training batches.
            init_state: Initial updater state, forwarded to the base class.
            text_masking: Forwarded to `MLMLoss`.
            odim: Forwarded to `MLMLoss`.
            output_dir: Directory that receives `worker_<rank>.log`. Although
                the default is None, a path is required: the `/` join below
                fails on None.
        """
        # Forward the caller's state; it used to be replaced by a hard-coded
        # None, which silently discarded any state passed in.
        super().__init__(model, optimizer, dataloader, init_state=init_state)
        self.criterion = MLMLoss(text_masking=text_masking, odim=odim)

        # one log file per distributed rank
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        """Run forward, loss, backward and optimizer step for one batch.

        Args:
            batch: Mapping that provides "speech", "text", "masked_pos",
                "speech_mask", "text_mask", "speech_seg_pos", "text_seg_pos"
                and "text_masked_pos".
        """
        # The message is rebuilt on every step and is not logged here;
        # presumably the trainer reads `self.msg` - TODO confirm.
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        before_outs, after_outs, text_outs = self.model(
            speech=batch["speech"],
            text=batch["text"],
            masked_pos=batch["masked_pos"],
            speech_mask=batch["speech_mask"],
            text_mask=batch["text_mask"],
            speech_seg_pos=batch["speech_seg_pos"],
            text_seg_pos=batch["text_seg_pos"])

        mlm_loss, text_mlm_loss = self.criterion(
            speech=batch["speech"],
            before_outs=before_outs,
            after_outs=after_outs,
            masked_pos=batch["masked_pos"],
            text=batch["text"],
            # maybe None
            text_outs=text_outs,
            # maybe None
            text_masked_pos=batch["text_masked_pos"])

        # the text loss is optional; add it only when the criterion returns it
        loss = mlm_loss + text_mlm_loss if text_mlm_loss is not None else mlm_loss

        optimizer = self.optimizer
        optimizer.clear_grad()
        loss.backward()
        optimizer.step()

        report("train/loss", float(loss))
        report("train/mlm_loss", float(mlm_loss))

        if text_mlm_loss is not None:
            report("train/text_mlm_loss", float(text_mlm_loss))
            losses_dict["text_mlm_loss"] = float(text_mlm_loss)

        losses_dict["mlm_loss"] = float(mlm_loss)
        losses_dict["loss"] = float(loss)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
class ErnieSATEvaluator(StandardEvaluator):
    """Evaluator that reports ErnieSAT losses for each validation batch."""

    def __init__(self,
                 model: Layer,
                 dataloader: DataLoader,
                 text_masking: bool=False,
                 odim: int=80,
                 output_dir: Path=None):
        """Attach a per-rank log file and create the loss criterion.

        Args:
            model: Network to evaluate.
            dataloader: Source of validation batches.
            text_masking: Forwarded to `MLMLoss`.
            odim: Forwarded to `MLMLoss`.
            output_dir: Directory that receives `worker_<rank>.log`.
        """
        super().__init__(model, dataloader)

        # one log file per distributed rank
        rank_log_name = 'worker_{}.log'.format(dist.get_rank())
        log_file = output_dir / rank_log_name
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

        self.criterion = MLMLoss(text_masking=text_masking, odim=odim)

    def evaluate_core(self, batch):
        """Compute and report the losses of one batch, then log a summary.

        Args:
            batch: Mapping that provides "speech", "text", "masked_pos",
                "speech_mask", "text_mask", "speech_seg_pos", "text_seg_pos"
                and "text_masked_pos".
        """
        self.msg = "Evaluate: "
        summary = {}

        # forward pass
        model_inputs = {
            "speech": batch["speech"],
            "text": batch["text"],
            "masked_pos": batch["masked_pos"],
            "speech_mask": batch["speech_mask"],
            "text_mask": batch["text_mask"],
            "speech_seg_pos": batch["speech_seg_pos"],
            "text_seg_pos": batch["text_seg_pos"],
        }
        before_outs, after_outs, text_outs = self.model(**model_inputs)

        # loss computation; text_outs may be None
        criterion_inputs = {
            "speech": batch["speech"],
            "before_outs": before_outs,
            "after_outs": after_outs,
            "masked_pos": batch["masked_pos"],
            "text": batch["text"],
            "text_outs": text_outs,
            "text_masked_pos": batch["text_masked_pos"],
        }
        mlm_loss, text_mlm_loss = self.criterion(**criterion_inputs)

        # the text loss contributes only when the criterion returns one
        has_text_loss = text_mlm_loss is not None
        if has_text_loss:
            loss = mlm_loss + text_mlm_loss
        else:
            loss = mlm_loss

        report("eval/loss", float(loss))
        report("eval/mlm_loss", float(mlm_loss))

        if has_text_loss:
            text_value = float(text_mlm_loss)
            report("eval/text_mlm_loss", text_value)
            summary["text_mlm_loss"] = text_value

        summary["mlm_loss"] = float(mlm_loss)
        summary["loss"] = float(loss)

        parts = ['{}: {:>.6f}'.format(key, value) for key, value in summary.items()]
        self.msg += ', '.join(parts)
        self.logger.info(self.msg)
paddlespeech/t2s/models/ernie_sat/mlm.py
浏览文件 @
94688264
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
from
typing
import
Dict
from
typing
import
List
from
typing
import
Optional
from
typing
import
Tuple
from
typing
import
Union
import
paddle
import
yaml
...
...
@@ -109,7 +120,6 @@ class MLMEncoder(nn.Layer):
positionwise_conv_kernel_size
:
int
=
1
,
macaron_style
:
bool
=
False
,
pos_enc_layer_type
:
str
=
"abs_pos"
,
pos_enc_class
=
None
,
selfattention_layer_type
:
str
=
"selfattn"
,
activation_type
:
str
=
"swish"
,
use_cnn_module
:
bool
=
False
,
...
...
@@ -334,7 +344,6 @@ class MLMDecoder(MLMEncoder):
# encoder and decoder is nn.Layer, not str
class
MLM
(
nn
.
Layer
):
def
__init__
(
self
,
token_list
:
Union
[
Tuple
[
str
,
...],
List
[
str
]],
odim
:
int
,
encoder
:
nn
.
Layer
,
decoder
:
Optional
[
nn
.
Layer
],
...
...
@@ -345,7 +354,6 @@ class MLM(nn.Layer):
super
().
__init__
()
self
.
odim
=
odim
self
.
token_list
=
token_list
.
copy
()
self
.
encoder
=
encoder
self
.
decoder
=
decoder
self
.
vocab_size
=
encoder
.
text_embed
[
0
].
_num_embeddings
...
...
@@ -535,32 +543,6 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
vocab_size
=
len
(
token_list
)
odim
=
80
pos_enc_class
=
ScaledPositionalEncoding
if
args
.
use_scaled_pos_enc
else
PositionalEncoding
if
"conformer"
==
args
.
encoder
:
conformer_self_attn_layer_type
=
args
.
encoder_conf
[
'selfattention_layer_type'
]
conformer_pos_enc_layer_type
=
args
.
encoder_conf
[
'pos_enc_layer_type'
]
conformer_rel_pos_type
=
"legacy"
if
conformer_rel_pos_type
==
"legacy"
:
if
conformer_pos_enc_layer_type
==
"rel_pos"
:
conformer_pos_enc_layer_type
=
"legacy_rel_pos"
if
conformer_self_attn_layer_type
==
"rel_selfattn"
:
conformer_self_attn_layer_type
=
"legacy_rel_selfattn"
elif
conformer_rel_pos_type
==
"latest"
:
assert
conformer_pos_enc_layer_type
!=
"legacy_rel_pos"
assert
conformer_self_attn_layer_type
!=
"legacy_rel_selfattn"
else
:
raise
ValueError
(
f
"Unknown rel_pos_type:
{
conformer_rel_pos_type
}
"
)
args
.
encoder_conf
[
'selfattention_layer_type'
]
=
conformer_self_attn_layer_type
args
.
encoder_conf
[
'pos_enc_layer_type'
]
=
conformer_pos_enc_layer_type
if
"conformer"
==
args
.
decoder
:
args
.
decoder_conf
[
'selfattention_layer_type'
]
=
conformer_self_attn_layer_type
args
.
decoder_conf
[
'pos_enc_layer_type'
]
=
conformer_pos_enc_layer_type
# Encoder
encoder_class
=
MLMEncoder
...
...
@@ -571,10 +553,7 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
args
.
encoder_conf
[
'text_masking'
]
=
False
encoder
=
encoder_class
(
args
.
input_size
,
vocab_size
=
vocab_size
,
pos_enc_class
=
pos_enc_class
,
**
args
.
encoder_conf
)
args
.
input_size
,
vocab_size
=
vocab_size
,
**
args
.
encoder_conf
)
# Decoder
if
args
.
decoder
!=
'no_decoder'
:
...
...
@@ -591,7 +570,6 @@ def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
odim
=
odim
,
encoder
=
encoder
,
decoder
=
decoder
,
token_list
=
token_list
,
**
args
.
model_conf
,
)
# Initialize
...
...
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
浏览文件 @
94688264
...
...
@@ -212,9 +212,7 @@ class FastSpeech2(nn.Layer):
super
().
__init__
()
# store hyperparameters
self
.
idim
=
idim
self
.
odim
=
odim
self
.
eos
=
idim
-
1
self
.
reduction_factor
=
reduction_factor
self
.
encoder_type
=
encoder_type
self
.
decoder_type
=
decoder_type
...
...
paddlespeech/t2s/modules/losses.py
浏览文件 @
94688264
...
...
@@ -1012,6 +1012,7 @@ class KLDivergenceLoss(nn.Layer):
# loss for ERNIE SAT
class
MLMLoss
(
nn
.
Layer
):
def
__init__
(
self
,
odim
:
int
,
lsm_weight
:
float
=
0.1
,
ignore_id
:
int
=-
1
,
text_masking
:
bool
=
False
):
...
...
@@ -1023,15 +1024,18 @@ class MLMLoss(nn.Layer):
else
:
self
.
l1_loss_func
=
nn
.
L1Loss
(
reduction
=
'none'
)
self
.
text_masking
=
text_masking
self
.
odim
=
odim
def
forward
(
self
,
speech
:
paddle
.
Tensor
,
before_outs
:
paddle
.
Tensor
,
after_outs
:
paddle
.
Tensor
,
masked_pos
:
paddle
.
Tensor
,
text
:
paddle
.
Tensor
=
None
,
text_outs
:
paddle
.
Tensor
=
None
,
text_masked_pos
:
paddle
.
Tensor
=
None
):
def
forward
(
self
,
speech
:
paddle
.
Tensor
,
before_outs
:
paddle
.
Tensor
,
after_outs
:
paddle
.
Tensor
,
masked_pos
:
paddle
.
Tensor
,
# for text_loss when text_masking == True
text
:
paddle
.
Tensor
=
None
,
text_outs
:
paddle
.
Tensor
=
None
,
text_masked_pos
:
paddle
.
Tensor
=
None
):
xs_pad
=
speech
mlm_loss_pos
=
masked_pos
>
0
...
...
@@ -1046,16 +1050,19 @@ class MLMLoss(nn.Layer):
paddle
.
reshape
(
after_outs
,
(
-
1
,
self
.
odim
)),
paddle
.
reshape
(
xs_pad
,
(
-
1
,
self
.
odim
))),
axis
=-
1
)
loss_mlm
=
paddle
.
sum
((
loss
*
paddle
.
reshape
(
mlm_loss
=
paddle
.
sum
((
loss
*
paddle
.
reshape
(
mlm_loss_pos
,
[
-
1
])))
/
paddle
.
sum
((
mlm_loss_pos
)
+
1e-10
)
text_mlm_loss
=
None
if
self
.
text_masking
:
loss_text
=
paddle
.
sum
((
self
.
text_mlm_loss
(
assert
text
is
not
None
assert
text_outs
is
not
None
assert
text_masked_pos
is
not
None
text_mlm_loss
=
paddle
.
sum
((
self
.
text_mlm_loss
(
paddle
.
reshape
(
text_outs
,
(
-
1
,
self
.
vocab_size
)),
paddle
.
reshape
(
text
,
(
-
1
)))
*
paddle
.
reshape
(
text_masked_pos
,
(
-
1
))))
/
paddle
.
sum
((
text_masked_pos
)
+
1e-10
)
return
loss_mlm
,
loss_text
return
loss_mlm
return
mlm_loss
,
text_mlm_loss
paddlespeech/t2s/modules/nets_utils.py
浏览文件 @
94688264
...
...
@@ -393,7 +393,6 @@ def phones_masking(xs_pad: paddle.Tensor,
mean_phn_span
=
mean_phn_span
).
nonzero
()
masked_start
=
align_start
[
idx
][
masked_phn_idxs
].
tolist
()
masked_end
=
align_end
[
idx
][
masked_phn_idxs
].
tolist
()
for
s
,
e
in
zip
(
masked_start
,
masked_end
):
masked_pos
[
idx
,
s
:
e
]
=
1
non_eos_mask
=
paddle
.
reshape
(
src_mask
,
paddle
.
shape
(
xs_pad
)[:
2
])
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录