PaddlePaddle / DeepSpeech · Commit 9f054e5a (unverified)
Authored on Nov 16, 2021 by xiegegege; committed by GitHub on Nov 16, 2021.
Parent: 26258949

Revert "[TTS]add multi-band melgan finetune scripts"

Showing 7 changed files with 123 additions and 584 deletions (+123 -584)
demos/style_fs2/style_syn.py                          +119    -1
examples/csmsc/voc3/conf/finetune.yaml                  +0  -139
examples/csmsc/voc3/finetune.sh                         +0   -63
examples/csmsc/voc3/local/link_wav.py                   +0   -85
paddlespeech/t2s/datasets/vocoder_batch_fn.py           +4    -4
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py        +0  -167
paddlespeech/t2s/models/fastspeech2/fastspeech2.py      +0  -125
demos/style_fs2/style_syn.py (modified, +119 -1)

@@ -13,6 +13,7 @@
 # limitations under the License.
 import argparse
 from pathlib import Path
+from typing import Union
 
 import numpy as np
 import paddle

@@ -22,12 +23,129 @@ from yacs.config import CfgNode
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
-from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
 from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
 from paddlespeech.t2s.models.parallel_wavegan import PWGInference
 from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+class StyleFastSpeech2Inference(FastSpeech2Inference):
+    def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
+        super().__init__(normalizer, model)
+        pitch_mean, pitch_std = np.load(pitch_stats_path)
+        self.pitch_mean = paddle.to_tensor(pitch_mean)
+        self.pitch_std = paddle.to_tensor(pitch_std)
+        energy_mean, energy_std = np.load(energy_stats_path)
+        self.energy_mean = paddle.to_tensor(energy_mean)
+        self.energy_std = paddle.to_tensor(energy_std)
+
+    def denorm(self, data, mean, std):
+        return data * std + mean
+
+    def norm(self, data, mean, std):
+        return (data - mean) / std
+
+    def forward(self,
+                text: paddle.Tensor,
+                durations: Union[paddle.Tensor, np.ndarray]=None,
+                durations_scale: Union[int, float]=None,
+                durations_bias: Union[int, float]=None,
+                pitch: Union[paddle.Tensor, np.ndarray]=None,
+                pitch_scale: Union[int, float]=None,
+                pitch_bias: Union[int, float]=None,
+                energy: Union[paddle.Tensor, np.ndarray]=None,
+                energy_scale: Union[int, float]=None,
+                energy_bias: Union[int, float]=None,
+                robot: bool=False):
+        """
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T,).
+        durations : paddle.Tensor/np.ndarray, optional (int64)
+            Ground truth of duration (T,); overrides durations_scale and durations_bias.
+        durations_scale : int/float, optional
+        durations_bias : int/float, optional
+        pitch : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged pitch (T, 1); overrides pitch_scale and pitch_bias.
+        pitch_scale : int/float, optional
+            In denormed Hz domain.
+        pitch_bias : int/float, optional
+            In denormed Hz domain.
+        energy : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged energy (T, 1); overrides energy_scale and energy_bias.
+        energy_scale : int/float, optional
+            In denormed domain.
+        energy_bias : int/float, optional
+            In denormed domain.
+        robot : bool, optional
+            Whether to output robot style.
+        Returns
+        ----------
+        Tensor
+            Output sequence of features (L, odim).
+        """
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text, durations=None, pitch=None, energy=None)
+        # priority: ground truth > scale/bias > previous output
+        # set durations
+        if isinstance(durations, np.ndarray):
+            durations = paddle.to_tensor(durations)
+        elif isinstance(durations, paddle.Tensor):
+            durations = durations
+        elif durations_scale or durations_bias:
+            durations_scale = durations_scale if durations_scale is not None else 1
+            durations_bias = durations_bias if durations_bias is not None else 0
+            durations = durations_scale * d_outs + durations_bias
+        else:
+            durations = d_outs
+        if robot:
+            # setting normed pitch to zeros has the same effect as setting denormed pitch to the mean
+            pitch = paddle.zeros(p_outs.shape)
+        # set pitch; this can overwrite the robot setting
+        if isinstance(pitch, np.ndarray):
+            pitch = paddle.to_tensor(pitch)
+        elif isinstance(pitch, paddle.Tensor):
+            pitch = pitch
+        elif pitch_scale or pitch_bias:
+            pitch_scale = pitch_scale if pitch_scale is not None else 1
+            pitch_bias = pitch_bias if pitch_bias is not None else 0
+            p_Hz = paddle.exp(
+                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
+            p_Hz = pitch_scale * p_Hz + pitch_bias
+            pitch = self.norm(
+                paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
+        else:
+            pitch = p_outs
+        # set energy
+        if isinstance(energy, np.ndarray):
+            energy = paddle.to_tensor(energy)
+        elif isinstance(energy, paddle.Tensor):
+            energy = energy
+        elif energy_scale or energy_bias:
+            energy_scale = energy_scale if energy_scale is not None else 1
+            energy_bias = energy_bias if energy_bias is not None else 0
+            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
+            e_dnorm = energy_scale * e_dnorm + energy_bias
+            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
+        else:
+            energy = e_outs
+
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text,
+            durations=durations,
+            pitch=pitch,
+            energy=energy,
+            use_teacher_forcing=True)
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
+
+
 def evaluate(args, fastspeech2_config, pwg_config):
     # construct dataset for evaluation
 ...
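For orientation, here is a hypothetical usage sketch of the class re-added above. The `normalizer` and `model` objects and all file names are assumptions; the real wiring (frontend, checkpoint loading, vocoder) lives in the rest of demos/style_fs2/style_syn.py.

import numpy as np
import paddle

# `normalizer` and `model` are assumed to be built as in evaluate(); the stats
# files are hypothetical (mean, std) dumps of log-F0 and energy.
inference = StyleFastSpeech2Inference(
    normalizer, model, "pitch_stats.npy", "energy_stats.npy")

phone_ids = paddle.to_tensor(np.array([12, 35, 7, 41], dtype="int64"))  # toy input
with paddle.no_grad():
    # 1.2x longer durations (slower speech) and pitch raised by 40 Hz;
    # passing ground-truth tensors instead would override these knobs
    mel = inference(phone_ids, durations_scale=1.2, pitch_bias=40)
    robot_mel = inference(phone_ids, robot=True)  # monotone (mean) pitch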
examples/csmsc/voc3/conf/finetune.yaml (deleted, mode 100644 → 0, -139)

# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires ~8GB memory and will finish within 7 days on a Titan V.

# This configuration is based on full-band MelGAN, but the hop size and sampling
# rate differ from the paper (16kHz vs 24kHz). The number of iterations
# is not given in the paper, so we currently train for 1M iterations (possibly not
# enough to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 2048          # FFT size (in samples).
n_shift: 300         # Hop size (in samples).
win_length: 1200     # Window length (in samples).
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 80             # Minimum frequency in mel basis calculation (Hz).
fmax: 7600           # Maximum frequency in mel basis calculation (Hz).

###########################################################
#          GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80               # Number of input channels.
    out_channels: 4               # Number of output channels.
    kernel_size: 7                # Kernel size of initial and final conv layers.
    channels: 384                 # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3]    # List of upsampling scales.
    stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
    stacks: 4                     # Number of stacks in a single residual stack module.
    use_weight_norm: True         # Whether to use weight normalization.
    use_causal_conv: False        # Whether to use causal convolution.
    use_final_nonlinear_activation: True

###########################################################
#        DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        exclusive: True
    kernel_sizes: [5, 3]              # List of kernel sizes.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.

###########################################################
#                    STFT LOSS SETTING                     #
###########################################################
use_stft_loss: true
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171]    # List of FFT sizes for STFT-based loss.
    hop_sizes: [30, 60, 10]       # List of hop sizes for STFT-based loss.
    win_lengths: [150, 300, 60]   # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#                ADVERSARIAL LOSS SETTING                  #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.

###########################################################
#                   DATA LOADER SETTING                    #
###########################################################
batch_size: 64         # Batch size.
batch_max_steps: 16200 # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 2         # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-7         # Generator's epsilon.
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_grad_norm: -1     # Generator's gradient norm.
generator_scheduler_params:
    learning_rate: 1.0e-3   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_params:
    epsilon: 1.0e-7         # Discriminator's epsilon.
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
    learning_rate: 1.0e-3   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                     INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 200000 # Step at which discriminator training starts.
train_max_steps: 1200000                # Number of training steps.
save_interval_steps: 1000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                      OTHER SETTING                       #
###########################################################
num_snapshots: 10 # Max number of snapshots to keep while training.
seed: 42          # Random seed for paddle, random, and np.random.
\ No newline at end of file
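For reference, the repo's entry points load such configs via yacs, the same way gen_gta_mel.py below does; a minimal sketch (the path assumes the pre-revert file above):

import yaml
from yacs.config import CfgNode

# Load the finetune config the way the voc3 training scripts do.
with open("conf/finetune.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

print(config.fs)       # 24000
print(config.n_shift)  # 300
# Note: the generator upsamples by 5 * 5 * 3 = 75 per sub-band; with
# out_channels = 4 sub-bands this matches n_shift = 300 after synthesis.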
examples/csmsc/voc3/finetune.sh (deleted, mode 100755 → 0, -63)

#!/bin/bash

source path.sh

gpus=0
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 local/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi
\ No newline at end of file
examples/csmsc/voc3/local/link_wav.py (deleted, mode 100644 → 0, -85)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to dump finetune feature files.")
    args = parser.parse_args()
    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()

    for sub in ["train", "dev", "test"]:
        # symlink the *_wave.npy files in old_dump_dir to the corresponding
        # locations under dump_dir
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # e.g. 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)

            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()
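The records written above follow the dump metadata convention used by the voc3 recipes; a small sketch of reading one back (the path is hypothetical):

import jsonlines

with jsonlines.open("dump_finetune/train/raw/metadata.jsonl") as reader:
    for record in reader:
        # each record pairs a generated mel ("feats") with the original wave dump
        print(record["utt_id"], record["num_frames"], record["wave"])
        break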
paddlespeech/t2s/datasets/vocoder_batch_fn.py (modified, +4 -4)

@@ -110,10 +110,10 @@ class Clip(object):
         if len(x) < c.shape[0] * self.hop_size:
             x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
         elif len(x) > c.shape[0] * self.hop_size:
-            # print(
-            #     f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
-            # )
-            x = x[:c.shape[0] * self.hop_size]
+            print(
+                f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size})"
+            )
+            x = x[:c.shape[1] * self.hop_size]
 
         # check the length is valid
         assert len(x) == c.shape[ ...
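The hunk above implements the usual wave/mel alignment: the wave is edge-padded or trimmed until len(x) == num_frames * hop_size. A self-contained numpy sketch of that invariant (all values hypothetical):

import numpy as np

hop_size = 300
c = np.zeros((10, 80))     # 10 mel frames, 80 mel bins
x = np.random.randn(2980)  # wave slightly shorter than 10 * 300 samples

if len(x) < c.shape[0] * hop_size:
    # repeat the last sample until the wave covers every frame
    x = np.pad(x, (0, c.shape[0] * hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * hop_size:
    x = x[:c.shape[0] * hop_size]  # drop trailing samples

assert len(x) == c.shape[0] * hop_size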
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py (deleted, mode 100644 → 0, -167)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# what if the generated length is inconsistent with the original mel?
import argparse
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config):
    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    phone_dict = {}
    for phn, id in phn_id:
        phone_dict[phn] = int(id)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        # trim leading and trailing sil
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]
            # sentences[utt_id][0] = phones
            # sentences[utt_id][1] = durations

        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # the generated mel may differ from the real one by 1 or 2 frames,
        # but batch_fn will fix that

        # split data into 3 sections
        if args.dataset == "baker":
            num_train = 9800
            num_dev = 100
        if i in range(0, num_train):
            sub_output_dir = output_dir / ("train/raw")
        elif i in range(num_train, num_train + num_dev):
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)

        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, ljspeech, vctk} now")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    def str2bool(str):
        return True if str.lower() == 'true' else False

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut sil at the edges of the audio")
    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")

    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)

    evaluate(args, fastspeech2_config)


if __name__ == "__main__":
    main()
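gen_gta_mel.py and link_wav.py (both deleted by this revert) share an implicit naming contract: mels are saved as <utt_id>_feats.npy and matched against <utt_id>_wave.npy. A trivial check of that contract:

# Naming contract between gen_gta_mel.py and link_wav.py (illustrative only).
name = "003918_feats.npy"         # saved by gen_gta_mel.py as utt_id + "_feats.npy"
utt_id = name.split("_")[0]       # link_wav.py recovers the utterance id this way
wave_name = utt_id + "_wave.npy"  # and symlinks the matching wave dump
assert wave_name == "003918_wave.npy"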
paddlespeech/t2s/models/fastspeech2/fastspeech2.py (modified, +0 -125)

@@ -16,9 +16,7 @@
 from typing import Dict
 from typing import Sequence
 from typing import Tuple
-from typing import Union
 
-import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddle import nn

@@ -689,129 +687,6 @@ class FastSpeech2Inference(nn.Layer):
         return logmel
 
 
-class StyleFastSpeech2Inference(FastSpeech2Inference):
-    def __init__(self,
-                 normalizer,
-                 model,
-                 pitch_stats_path=None,
-                 energy_stats_path=None):
-        super().__init__(normalizer, model)
-        if pitch_stats_path:
-            pitch_mean, pitch_std = np.load(pitch_stats_path)
-            self.pitch_mean = paddle.to_tensor(pitch_mean)
-            self.pitch_std = paddle.to_tensor(pitch_std)
-        if energy_stats_path:
-            energy_mean, energy_std = np.load(energy_stats_path)
-            self.energy_mean = paddle.to_tensor(energy_mean)
-            self.energy_std = paddle.to_tensor(energy_std)
-
-    def denorm(self, data, mean, std):
-        return data * std + mean
-
-    def norm(self, data, mean, std):
-        return (data - mean) / std
-
-    def forward(self,
-                text: paddle.Tensor,
-                durations: Union[paddle.Tensor, np.ndarray]=None,
-                durations_scale: Union[int, float]=None,
-                durations_bias: Union[int, float]=None,
-                pitch: Union[paddle.Tensor, np.ndarray]=None,
-                pitch_scale: Union[int, float]=None,
-                pitch_bias: Union[int, float]=None,
-                energy: Union[paddle.Tensor, np.ndarray]=None,
-                energy_scale: Union[int, float]=None,
-                energy_bias: Union[int, float]=None,
-                robot: bool=False):
-        """
-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        durations : paddle.Tensor/np.ndarray, optional (int64)
-            Ground truth of duration (T,); overrides durations_scale and durations_bias.
-        durations_scale : int/float, optional
-        durations_bias : int/float, optional
-        pitch : paddle.Tensor/np.ndarray, optional
-            Ground truth of token-averaged pitch (T, 1); overrides pitch_scale and pitch_bias.
-        pitch_scale : int/float, optional
-            In denormed Hz domain.
-        pitch_bias : int/float, optional
-            In denormed Hz domain.
-        energy : paddle.Tensor/np.ndarray, optional
-            Ground truth of token-averaged energy (T, 1); overrides energy_scale and energy_bias.
-        energy_scale : int/float, optional
-            In denormed domain.
-        energy_bias : int/float, optional
-            In denormed domain.
-        robot : bool, optional
-            Whether to output robot style.
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        """
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None)
-        # priority: ground truth > scale/bias > previous output
-        # set durations
-        if isinstance(durations, np.ndarray):
-            durations = paddle.to_tensor(durations)
-        elif isinstance(durations, paddle.Tensor):
-            durations = durations
-        elif durations_scale or durations_bias:
-            durations_scale = durations_scale if durations_scale is not None else 1
-            durations_bias = durations_bias if durations_bias is not None else 0
-            durations = durations_scale * d_outs + durations_bias
-        else:
-            durations = d_outs
-        if robot:
-            # setting normed pitch to zeros has the same effect as setting denormed pitch to the mean
-            pitch = paddle.zeros(p_outs.shape)
-        # set pitch; this can overwrite the robot setting
-        if isinstance(pitch, np.ndarray):
-            pitch = paddle.to_tensor(pitch)
-        elif isinstance(pitch, paddle.Tensor):
-            pitch = pitch
-        elif pitch_scale or pitch_bias:
-            pitch_scale = pitch_scale if pitch_scale is not None else 1
-            pitch_bias = pitch_bias if pitch_bias is not None else 0
-            p_Hz = paddle.exp(
-                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
-            p_Hz = pitch_scale * p_Hz + pitch_bias
-            pitch = self.norm(
-                paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
-        else:
-            pitch = p_outs
-        # set energy
-        if isinstance(energy, np.ndarray):
-            energy = paddle.to_tensor(energy)
-        elif isinstance(energy, paddle.Tensor):
-            energy = energy
-        elif energy_scale or energy_bias:
-            energy_scale = energy_scale if energy_scale is not None else 1
-            energy_bias = energy_bias if energy_bias is not None else 0
-            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
-            e_dnorm = energy_scale * e_dnorm + energy_bias
-            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
-        else:
-            energy = e_outs
-
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text,
-            durations=durations,
-            pitch=pitch,
-            energy=energy,
-            use_teacher_forcing=True)
-        logmel = self.normalizer.inverse(normalized_mel)
-        return logmel
-
-
 class FastSpeech2Loss(nn.Layer):
     """Loss function module for FastSpeech2."""
 ...
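One step in the removed class (re-added verbatim in demos/style_fs2/style_syn.py above) deserves a note: pitch is stored as z-scored log-F0, so the robot branch's paddle.zeros is equivalent to pinning the denormalized log-F0 at its mean. A quick numpy check of that identity (statistics hypothetical):

import numpy as np

mean, std = 5.3, 0.25             # hypothetical log-F0 statistics
p_norm = np.zeros(4)              # what the robot branch sets
log_f0 = p_norm * std + mean      # denorm(), as defined in the class
assert np.allclose(log_f0, mean)  # pitch collapses to the mean: monotone voice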