Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
26258949
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
26258949
编写于
11月 16, 2021
作者:
H
Hui Zhang
提交者:
GitHub
11月 16, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #995 from yt605155624/mbmelgan_fine
[TTS]add multi-band melgan finetune scripts
上级
6b129661
c5c9f190
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
584 addition
and
123 deletion
+584
-123
demos/style_fs2/style_syn.py
demos/style_fs2/style_syn.py
+1
-119
examples/csmsc/voc3/conf/finetune.yaml
examples/csmsc/voc3/conf/finetune.yaml
+139
-0
examples/csmsc/voc3/finetune.sh
examples/csmsc/voc3/finetune.sh
+63
-0
examples/csmsc/voc3/local/link_wav.py
examples/csmsc/voc3/local/link_wav.py
+85
-0
paddlespeech/t2s/datasets/vocoder_batch_fn.py
paddlespeech/t2s/datasets/vocoder_batch_fn.py
+4
-4
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
+167
-0
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+125
-0
未找到文件。
demos/style_fs2/style_syn.py
浏览文件 @
26258949
...
...
@@ -13,7 +13,6 @@
# limitations under the License.
import
argparse
from
pathlib
import
Path
from
typing
import
Union
import
numpy
as
np
import
paddle
...
...
@@ -23,129 +22,12 @@ from yacs.config import CfgNode
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
paddlespeech.t2s.models.fastspeech2
import
FastSpeech2
from
paddlespeech.t2s.models.fastspeech2
import
FastSpeech2Inference
from
paddlespeech.t2s.models.fastspeech2
import
Style
FastSpeech2Inference
from
paddlespeech.t2s.models.parallel_wavegan
import
PWGGenerator
from
paddlespeech.t2s.models.parallel_wavegan
import
PWGInference
from
paddlespeech.t2s.modules.normalizer
import
ZScore
class
StyleFastSpeech2Inference
(
FastSpeech2Inference
):
def
__init__
(
self
,
normalizer
,
model
,
pitch_stats_path
,
energy_stats_path
):
super
().
__init__
(
normalizer
,
model
)
pitch_mean
,
pitch_std
=
np
.
load
(
pitch_stats_path
)
self
.
pitch_mean
=
paddle
.
to_tensor
(
pitch_mean
)
self
.
pitch_std
=
paddle
.
to_tensor
(
pitch_std
)
energy_mean
,
energy_std
=
np
.
load
(
energy_stats_path
)
self
.
energy_mean
=
paddle
.
to_tensor
(
energy_mean
)
self
.
energy_std
=
paddle
.
to_tensor
(
energy_std
)
def
denorm
(
self
,
data
,
mean
,
std
):
return
data
*
std
+
mean
def
norm
(
self
,
data
,
mean
,
std
):
return
(
data
-
mean
)
/
std
def
forward
(
self
,
text
:
paddle
.
Tensor
,
durations
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
durations_scale
:
Union
[
int
,
float
]
=
None
,
durations_bias
:
Union
[
int
,
float
]
=
None
,
pitch
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
pitch_scale
:
Union
[
int
,
float
]
=
None
,
pitch_bias
:
Union
[
int
,
float
]
=
None
,
energy
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
energy_scale
:
Union
[
int
,
float
]
=
None
,
energy_bias
:
Union
[
int
,
float
]
=
None
,
robot
:
bool
=
False
):
"""
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : paddle.Tensor/np.ndarray, optional (int64)
Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale: int/float, optional
durations_bias: int/float, optional
pitch : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale: int/float, optional
In denormed HZ domain.
pitch_bias: int/float, optional
In denormed HZ domain.
energy : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale: int/float, optional
In denormed domain.
energy_bias: int/float, optional
In denormed domain.
robot : bool, optional
Weather output robot style
Returns
----------
Tensor
Output sequence of features (L, odim).
"""
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
durations
=
None
,
pitch
=
None
,
energy
=
None
)
# priority: groundtruth > scale/bias > previous output
# set durations
if
isinstance
(
durations
,
np
.
ndarray
):
durations
=
paddle
.
to_tensor
(
durations
)
elif
isinstance
(
durations
,
paddle
.
Tensor
):
durations
=
durations
elif
durations_scale
or
durations_bias
:
durations_scale
=
durations_scale
if
durations_scale
is
not
None
else
1
durations_bias
=
durations_bias
if
durations_bias
is
not
None
else
0
durations
=
durations_scale
*
d_outs
+
durations_bias
else
:
durations
=
d_outs
if
robot
:
# set normed pitch to zeros have the same effect with set denormd ones to mean
pitch
=
paddle
.
zeros
(
p_outs
.
shape
)
# set pitch, can overwrite robot set
if
isinstance
(
pitch
,
np
.
ndarray
):
pitch
=
paddle
.
to_tensor
(
pitch
)
elif
isinstance
(
pitch
,
paddle
.
Tensor
):
pitch
=
pitch
elif
pitch_scale
or
pitch_bias
:
pitch_scale
=
pitch_scale
if
pitch_scale
is
not
None
else
1
pitch_bias
=
pitch_bias
if
pitch_bias
is
not
None
else
0
p_Hz
=
paddle
.
exp
(
self
.
denorm
(
p_outs
,
self
.
pitch_mean
,
self
.
pitch_std
))
p_HZ
=
pitch_scale
*
p_Hz
+
pitch_bias
pitch
=
self
.
norm
(
paddle
.
log
(
p_HZ
),
self
.
pitch_mean
,
self
.
pitch_std
)
else
:
pitch
=
p_outs
# set energy
if
isinstance
(
energy
,
np
.
ndarray
):
energy
=
paddle
.
to_tensor
(
energy
)
elif
isinstance
(
energy
,
paddle
.
Tensor
):
energy
=
energy
elif
energy_scale
or
energy_bias
:
energy_scale
=
energy_scale
if
energy_scale
is
not
None
else
1
energy_bias
=
energy_bias
if
energy_bias
is
not
None
else
0
e_dnorm
=
self
.
denorm
(
e_outs
,
self
.
energy_mean
,
self
.
energy_std
)
e_dnorm
=
energy_scale
*
e_dnorm
+
energy_bias
energy
=
self
.
norm
(
e_dnorm
,
self
.
energy_mean
,
self
.
energy_std
)
else
:
energy
=
e_outs
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
durations
=
durations
,
pitch
=
pitch
,
energy
=
energy
,
use_teacher_forcing
=
True
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
return
logmel
def
evaluate
(
args
,
fastspeech2_config
,
pwg_config
):
# construct dataset for evaluation
...
...
examples/csmsc/voc3/conf/finetune.yaml
0 → 100644
浏览文件 @
26258949
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
# This configuration is based on full-band MelGAN but the hop size and sampling
# rate is different from the paper (16kHz vs 24kHz). The number of iteraions
# is not shown in the paper so currently we train 1M iterations (not sure enough
# to converge). The optimizer setting is based on @dathudeptrai advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# Sampling rate.
n_fft
:
2048
# FFT size. (in samples)
n_shift
:
300
# Hop size. (in samples)
win_length
:
1200
# Window length. (in samples)
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
n_mels
:
80
# Number of mel basis.
fmin
:
80
# Minimum freq in mel basis calculation. (Hz)
fmax
:
7600
# Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params
:
in_channels
:
80
# Number of input channels.
out_channels
:
4
# Number of output channels.
kernel_size
:
7
# Kernel size of initial and final conv layers.
channels
:
384
# Initial number of channels for conv layers.
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales.
stack_kernel_size
:
3
# Kernel size of dilated conv layers in residual stack.
stacks
:
4
# Number of stacks in a single residual stack module.
use_weight_norm
:
True
# Whether to use weight normalization.
use_causal_conv
:
False
# Whether to use causal convolution.
use_final_nonlinear_activation
:
True
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params
:
in_channels
:
1
# Number of input channels.
out_channels
:
1
# Number of output channels.
scales
:
3
# Number of multi-scales.
downsample_pooling
:
"
AvgPool1D"
# Pooling type for the input downsampling.
downsample_pooling_params
:
# Parameters of the above pooling function.
kernel_size
:
4
stride
:
2
padding
:
1
exclusive
:
True
kernel_sizes
:
[
5
,
3
]
# List of kernel size.
channels
:
16
# Number of channels of the initial conv layer.
max_downsample_channels
:
512
# Maximum number of channels of downsampling layers.
downsample_scales
:
[
4
,
4
,
4
]
# List of downsampling scales.
nonlinear_activation
:
"
LeakyReLU"
# Nonlinear activation function.
nonlinear_activation_params
:
# Parameters of nonlinear activation function.
negative_slope
:
0.2
use_weight_norm
:
True
# Whether to use weight norm.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss
:
true
stft_loss_params
:
fft_sizes
:
[
1024
,
2048
,
512
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
120
,
240
,
50
]
# List of hop size for STFT-based loss
win_lengths
:
[
600
,
1200
,
240
]
# List of window length for STFT-based loss.
window
:
"
hann"
# Window function for STFT-based loss
use_subband_stft_loss
:
true
subband_stft_loss_params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
30
,
60
,
10
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann"
# Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss
:
false
# Whether to use feature matching loss.
lambda_adv
:
2.5
# Loss balancing coefficient for adversarial loss.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
64
# Batch size.
batch_max_steps
:
16200
# Length of each audio in batch. Make sure dividable by hop_size.
num_workers
:
2
# Number of workers in DataLoader.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params
:
epsilon
:
1.0e-7
# Generator's epsilon.
weight_decay
:
0.0
# Generator's weight decay coefficient.
generator_grad_norm
:
-1
# Generator's gradient norm.
generator_scheduler_params
:
learning_rate
:
1.0e-3
# Generator's learning rate.
gamma
:
0.5
# Generator's scheduler gamma.
milestones
:
# At each milestone, lr will be multiplied by gamma.
-
100000
-
200000
-
300000
-
400000
-
500000
-
600000
discriminator_optimizer_params
:
epsilon
:
1.0e-7
# Discriminator's epsilon.
weight_decay
:
0.0
# Discriminator's weight decay coefficient.
discriminator_grad_norm
:
-1
# Discriminator's gradient norm.
discriminator_scheduler_params
:
learning_rate
:
1.0e-3
# Discriminator's learning rate.
gamma
:
0.5
# Discriminator's scheduler gamma.
milestones
:
# At each milestone, lr will be multiplied by gamma.
-
100000
-
200000
-
300000
-
400000
-
500000
-
600000
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps
:
200000
# Number of steps to start to train discriminator.
train_max_steps
:
1200000
# Number of training steps.
save_interval_steps
:
1000
# Interval steps to save checkpoint.
eval_interval_steps
:
1000
# Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots
:
10
# max number of snapshots to keep while training
seed
:
42
# random seed for paddle, random, and np.random
\ No newline at end of file
examples/csmsc/voc3/finetune.sh
0 → 100755
浏览文件 @
26258949
#!/bin/bash
source
path.sh
gpus
=
0
stage
=
0
stop_stage
=
100
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
python3
${
MAIN_ROOT
}
/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
\
--fastspeech2-config
=
fastspeech2_nosil_baker_ckpt_0.4/default.yaml
\
--fastspeech2-checkpoint
=
fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz
\
--fastspeech2-stat
=
fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy
\
--dur-file
=
durations.txt
\
--output-dir
=
dump_finetune
\
--phones-dict
=
fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
python3
local
/link_wav.py
\
--old-dump-dir
=
dump
\
--dump-dir
=
dump_finetune
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
# get features' stats(mean and std)
echo
"Get features' stats ..."
cp
dump/train/feats_stats.npy dump_finetune/train/
fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# normalize, dev and test should use train's stats
echo
"Normalize ..."
python3
${
BIN_DIR
}
/../normalize.py
\
--metadata
=
dump_finetune/train/raw/metadata.jsonl
\
--dumpdir
=
dump_finetune/train/norm
\
--stats
=
dump_finetune/train/feats_stats.npy
python3
${
BIN_DIR
}
/../normalize.py
\
--metadata
=
dump_finetune/dev/raw/metadata.jsonl
\
--dumpdir
=
dump_finetune/dev/norm
\
--stats
=
dump_finetune/train/feats_stats.npy
python3
${
BIN_DIR
}
/../normalize.py
\
--metadata
=
dump_finetune/test/raw/metadata.jsonl
\
--dumpdir
=
dump_finetune/test/norm
\
--stats
=
dump_finetune/train/feats_stats.npy
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
CUDA_VISIBLE_DEVICES
=
${
gpus
}
\
FLAGS_cudnn_exhaustive_search
=
true
\
FLAGS_conv_workspace_size_limit
=
4000
\
python
${
BIN_DIR
}
/train.py
\
--train-metadata
=
dump_finetune/train/norm/metadata.jsonl
\
--dev-metadata
=
dump_finetune/dev/norm/metadata.jsonl
\
--config
=
conf/finetune.yaml
\
--output-dir
=
exp/finetune
\
--ngpu
=
1
fi
\ No newline at end of file
examples/csmsc/voc3/local/link_wav.py
0 → 100644
浏览文件 @
26258949
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
from
operator
import
itemgetter
from
pathlib
import
Path
import
jsonlines
import
numpy
as
np
def
main
():
# parse config and args
parser
=
argparse
.
ArgumentParser
(
description
=
"Preprocess audio and then extract features ."
)
parser
.
add_argument
(
"--old-dump-dir"
,
default
=
None
,
type
=
str
,
help
=
"directory to dump feature files."
)
parser
.
add_argument
(
"--dump-dir"
,
type
=
str
,
required
=
True
,
help
=
"directory to finetune dump feature files."
)
args
=
parser
.
parse_args
()
old_dump_dir
=
Path
(
args
.
old_dump_dir
).
expanduser
()
old_dump_dir
=
old_dump_dir
.
resolve
()
dump_dir
=
Path
(
args
.
dump_dir
).
expanduser
()
# use absolute path
dump_dir
=
dump_dir
.
resolve
()
dump_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
assert
old_dump_dir
.
is_dir
()
assert
dump_dir
.
is_dir
()
for
sub
in
[
"train"
,
"dev"
,
"test"
]:
# 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置
output_dir
=
dump_dir
/
sub
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
results
=
[]
for
name
in
os
.
listdir
(
output_dir
/
"raw"
):
# 003918_feats.npy
utt_id
=
name
.
split
(
"_"
)[
0
]
mel_path
=
output_dir
/
(
"raw/"
+
name
)
gen_mel
=
np
.
load
(
mel_path
)
wave_name
=
utt_id
+
"_wave.npy"
wav
=
np
.
load
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
))
os
.
symlink
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
),
output_dir
/
(
"raw/"
+
wave_name
))
num_sample
=
wav
.
shape
[
0
]
num_frames
=
gen_mel
.
shape
[
0
]
wav_path
=
output_dir
/
(
"raw/"
+
wave_name
)
record
=
{
"utt_id"
:
utt_id
,
"num_samples"
:
num_sample
,
"num_frames"
:
num_frames
,
"feats"
:
str
(
mel_path
),
"wave"
:
str
(
wav_path
),
}
results
.
append
(
record
)
results
.
sort
(
key
=
itemgetter
(
"utt_id"
))
with
jsonlines
.
open
(
output_dir
/
"raw/metadata.jsonl"
,
'w'
)
as
writer
:
for
item
in
results
:
writer
.
write
(
item
)
if
__name__
==
"__main__"
:
main
()
paddlespeech/t2s/datasets/vocoder_batch_fn.py
浏览文件 @
26258949
...
...
@@ -110,10 +110,10 @@ class Clip(object):
if
len
(
x
)
<
c
.
shape
[
0
]
*
self
.
hop_size
:
x
=
np
.
pad
(
x
,
(
0
,
c
.
shape
[
0
]
*
self
.
hop_size
-
len
(
x
)),
mode
=
"edge"
)
elif
len
(
x
)
>
c
.
shape
[
0
]
*
self
.
hop_size
:
print
(
f
"wave length: (
{
len
(
x
)
}
), mel length: (
{
c
.
shape
[
0
]
}
), hop size: (
{
self
.
hop_size
}
)"
)
x
=
x
[:
c
.
shape
[
1
]
*
self
.
hop_size
]
#
print(
#
f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
#
)
x
=
x
[:
c
.
shape
[
0
]
*
self
.
hop_size
]
# check the legnth is valid
assert
len
(
x
)
==
c
.
shape
[
...
...
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
0 → 100644
浏览文件 @
26258949
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# 长度和原本的 mel 不一致怎么办?
import
argparse
from
pathlib
import
Path
import
numpy
as
np
import
paddle
import
yaml
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_phn_dur
from
paddlespeech.t2s.datasets.preprocess_utils
import
merge_silence
from
paddlespeech.t2s.models.fastspeech2
import
FastSpeech2
from
paddlespeech.t2s.models.fastspeech2
import
StyleFastSpeech2Inference
from
paddlespeech.t2s.modules.normalizer
import
ZScore
def
evaluate
(
args
,
fastspeech2_config
):
# construct dataset for evaluation
with
open
(
args
.
phones_dict
,
"r"
)
as
f
:
phn_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
vocab_size
=
len
(
phn_id
)
print
(
"vocab_size:"
,
vocab_size
)
phone_dict
=
{}
for
phn
,
id
in
phn_id
:
phone_dict
[
phn
]
=
int
(
id
)
odim
=
fastspeech2_config
.
n_mels
model
=
FastSpeech2
(
idim
=
vocab_size
,
odim
=
odim
,
**
fastspeech2_config
[
"model"
])
model
.
set_state_dict
(
paddle
.
load
(
args
.
fastspeech2_checkpoint
)[
"main_params"
])
model
.
eval
()
stat
=
np
.
load
(
args
.
fastspeech2_stat
)
mu
,
std
=
stat
mu
=
paddle
.
to_tensor
(
mu
)
std
=
paddle
.
to_tensor
(
std
)
fastspeech2_normalizer
=
ZScore
(
mu
,
std
)
fastspeech2_inference
=
StyleFastSpeech2Inference
(
fastspeech2_normalizer
,
model
)
fastspeech2_inference
.
eval
()
output_dir
=
Path
(
args
.
output_dir
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
sentences
,
speaker_set
=
get_phn_dur
(
args
.
dur_file
)
merge_silence
(
sentences
)
for
i
,
utt_id
in
enumerate
(
sentences
):
phones
=
sentences
[
utt_id
][
0
]
durations
=
sentences
[
utt_id
][
1
]
speaker
=
sentences
[
utt_id
][
2
]
# 裁剪掉开头和结尾的 sil
if
args
.
cut_sil
:
if
phones
[
0
]
==
"sil"
and
len
(
durations
)
>
1
:
durations
=
durations
[
1
:]
phones
=
phones
[
1
:]
if
phones
[
-
1
]
==
'sil'
and
len
(
durations
)
>
1
:
durations
=
durations
[:
-
1
]
phones
=
phones
[:
-
1
]
# sentences[utt_id][0] = phones
# sentences[utt_id][1] = durations
phone_ids
=
[
phone_dict
[
phn
]
for
phn
in
phones
]
phone_ids
=
paddle
.
to_tensor
(
np
.
array
(
phone_ids
))
durations
=
paddle
.
to_tensor
(
np
.
array
(
durations
))
# 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复
# split data into 3 sections
if
args
.
dataset
==
"baker"
:
num_train
=
9800
num_dev
=
100
if
i
in
range
(
0
,
num_train
):
sub_output_dir
=
output_dir
/
(
"train/raw"
)
elif
i
in
range
(
num_train
,
num_train
+
num_dev
):
sub_output_dir
=
output_dir
/
(
"dev/raw"
)
else
:
sub_output_dir
=
output_dir
/
(
"test/raw"
)
sub_output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
with
paddle
.
no_grad
():
mel
=
fastspeech2_inference
(
phone_ids
,
durations
=
durations
)
np
.
save
(
sub_output_dir
/
(
utt_id
+
"_feats.npy"
),
mel
)
def
main
():
# parse args and config and redirect to train_sp
parser
=
argparse
.
ArgumentParser
(
description
=
"Synthesize with fastspeech2 & parallel wavegan."
)
parser
.
add_argument
(
"--dataset"
,
default
=
"baker"
,
type
=
str
,
help
=
"name of dataset, should in {baker, ljspeech, vctk} now"
)
parser
.
add_argument
(
"--fastspeech2-config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
"--fastspeech2-checkpoint"
,
type
=
str
,
help
=
"fastspeech2 checkpoint to load."
)
parser
.
add_argument
(
"--fastspeech2-stat"
,
type
=
str
,
help
=
"mean and standard deviation used to normalize spectrogram when training fastspeech2."
)
parser
.
add_argument
(
"--phones-dict"
,
type
=
str
,
default
=
"phone_id_map.txt"
,
help
=
"phone vocabulary file."
)
parser
.
add_argument
(
"--dur-file"
,
default
=
None
,
type
=
str
,
help
=
"path to durations.txt."
)
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"output dir."
)
parser
.
add_argument
(
"--ngpu"
,
type
=
int
,
default
=
1
,
help
=
"if ngpu == 0, use cpu."
)
parser
.
add_argument
(
"--verbose"
,
type
=
int
,
default
=
1
,
help
=
"verbose."
)
def
str2bool
(
str
):
return
True
if
str
.
lower
()
==
'true'
else
False
parser
.
add_argument
(
"--cut-sil"
,
type
=
str2bool
,
default
=
True
,
help
=
"whether cut sil in the edge of audio"
)
args
=
parser
.
parse_args
()
if
args
.
ngpu
==
0
:
paddle
.
set_device
(
"cpu"
)
elif
args
.
ngpu
>
0
:
paddle
.
set_device
(
"gpu"
)
else
:
print
(
"ngpu should >= 0 !"
)
with
open
(
args
.
fastspeech2_config
)
as
f
:
fastspeech2_config
=
CfgNode
(
yaml
.
safe_load
(
f
))
print
(
"========Args========"
)
print
(
yaml
.
safe_dump
(
vars
(
args
)))
print
(
"========Config========"
)
print
(
fastspeech2_config
)
evaluate
(
args
,
fastspeech2_config
)
if
__name__
==
"__main__"
:
main
()
paddlespeech/t2s/models/fastspeech2/fastspeech2.py
浏览文件 @
26258949
...
...
@@ -16,7 +16,9 @@
from
typing
import
Dict
from
typing
import
Sequence
from
typing
import
Tuple
from
typing
import
Union
import
numpy
as
np
import
paddle
import
paddle.nn.functional
as
F
from
paddle
import
nn
...
...
@@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer):
return
logmel
class
StyleFastSpeech2Inference
(
FastSpeech2Inference
):
def
__init__
(
self
,
normalizer
,
model
,
pitch_stats_path
=
None
,
energy_stats_path
=
None
):
super
().
__init__
(
normalizer
,
model
)
if
pitch_stats_path
:
pitch_mean
,
pitch_std
=
np
.
load
(
pitch_stats_path
)
self
.
pitch_mean
=
paddle
.
to_tensor
(
pitch_mean
)
self
.
pitch_std
=
paddle
.
to_tensor
(
pitch_std
)
if
energy_stats_path
:
energy_mean
,
energy_std
=
np
.
load
(
energy_stats_path
)
self
.
energy_mean
=
paddle
.
to_tensor
(
energy_mean
)
self
.
energy_std
=
paddle
.
to_tensor
(
energy_std
)
def
denorm
(
self
,
data
,
mean
,
std
):
return
data
*
std
+
mean
def
norm
(
self
,
data
,
mean
,
std
):
return
(
data
-
mean
)
/
std
def
forward
(
self
,
text
:
paddle
.
Tensor
,
durations
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
durations_scale
:
Union
[
int
,
float
]
=
None
,
durations_bias
:
Union
[
int
,
float
]
=
None
,
pitch
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
pitch_scale
:
Union
[
int
,
float
]
=
None
,
pitch_bias
:
Union
[
int
,
float
]
=
None
,
energy
:
Union
[
paddle
.
Tensor
,
np
.
ndarray
]
=
None
,
energy_scale
:
Union
[
int
,
float
]
=
None
,
energy_bias
:
Union
[
int
,
float
]
=
None
,
robot
:
bool
=
False
):
"""
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
durations : paddle.Tensor/np.ndarray, optional (int64)
Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale: int/float, optional
durations_bias: int/float, optional
pitch : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale: int/float, optional
In denormed HZ domain.
pitch_bias: int/float, optional
In denormed HZ domain.
energy : paddle.Tensor/np.ndarray, optional
Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale: int/float, optional
In denormed domain.
energy_bias: int/float, optional
In denormed domain.
robot : bool, optional
Weather output robot style
Returns
----------
Tensor
Output sequence of features (L, odim).
"""
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
durations
=
None
,
pitch
=
None
,
energy
=
None
)
# priority: groundtruth > scale/bias > previous output
# set durations
if
isinstance
(
durations
,
np
.
ndarray
):
durations
=
paddle
.
to_tensor
(
durations
)
elif
isinstance
(
durations
,
paddle
.
Tensor
):
durations
=
durations
elif
durations_scale
or
durations_bias
:
durations_scale
=
durations_scale
if
durations_scale
is
not
None
else
1
durations_bias
=
durations_bias
if
durations_bias
is
not
None
else
0
durations
=
durations_scale
*
d_outs
+
durations_bias
else
:
durations
=
d_outs
if
robot
:
# set normed pitch to zeros have the same effect with set denormd ones to mean
pitch
=
paddle
.
zeros
(
p_outs
.
shape
)
# set pitch, can overwrite robot set
if
isinstance
(
pitch
,
np
.
ndarray
):
pitch
=
paddle
.
to_tensor
(
pitch
)
elif
isinstance
(
pitch
,
paddle
.
Tensor
):
pitch
=
pitch
elif
pitch_scale
or
pitch_bias
:
pitch_scale
=
pitch_scale
if
pitch_scale
is
not
None
else
1
pitch_bias
=
pitch_bias
if
pitch_bias
is
not
None
else
0
p_Hz
=
paddle
.
exp
(
self
.
denorm
(
p_outs
,
self
.
pitch_mean
,
self
.
pitch_std
))
p_HZ
=
pitch_scale
*
p_Hz
+
pitch_bias
pitch
=
self
.
norm
(
paddle
.
log
(
p_HZ
),
self
.
pitch_mean
,
self
.
pitch_std
)
else
:
pitch
=
p_outs
# set energy
if
isinstance
(
energy
,
np
.
ndarray
):
energy
=
paddle
.
to_tensor
(
energy
)
elif
isinstance
(
energy
,
paddle
.
Tensor
):
energy
=
energy
elif
energy_scale
or
energy_bias
:
energy_scale
=
energy_scale
if
energy_scale
is
not
None
else
1
energy_bias
=
energy_bias
if
energy_bias
is
not
None
else
0
e_dnorm
=
self
.
denorm
(
e_outs
,
self
.
energy_mean
,
self
.
energy_std
)
e_dnorm
=
energy_scale
*
e_dnorm
+
energy_bias
energy
=
self
.
norm
(
e_dnorm
,
self
.
energy_mean
,
self
.
energy_std
)
else
:
energy
=
e_outs
normalized_mel
,
d_outs
,
p_outs
,
e_outs
=
self
.
acoustic_model
.
inference
(
text
,
durations
=
durations
,
pitch
=
pitch
,
energy
=
energy
,
use_teacher_forcing
=
True
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
return
logmel
class
FastSpeech2Loss
(
nn
.
Layer
):
"""Loss function module for FastSpeech2."""
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录