Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
d3d9f835
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d3d9f835
编写于
11月 01, 2021
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add global init for multi band melgan to avoid large output in the begin
上级
9125d71a
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
19 addition
and
148 deletion
+19
-148
examples/csmsc/voc3/conf/default.yaml
examples/csmsc/voc3/conf/default.yaml
+2
-2
examples/csmsc/voc3/conf/use_tanh.yaml
examples/csmsc/voc3/conf/use_tanh.yaml
+0
-139
parakeet/models/melgan/melgan.py
parakeet/models/melgan/melgan.py
+16
-3
parakeet/modules/residual_stack.py
parakeet/modules/residual_stack.py
+1
-4
未找到文件。
examples/csmsc/voc3/conf/default.yaml
浏览文件 @
d3d9f835
...
...
@@ -35,7 +35,7 @@ generator_params:
stacks
:
4
# Number of stacks in a single residual stack module.
use_weight_norm
:
True
# Whether to use weight normalization.
use_causal_conv
:
False
# Whether to use causal convolution.
use_final_nonlinear_activation
:
False
# If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (e.g. 30)
use_final_nonlinear_activation
:
True
###########################################################
...
...
@@ -129,7 +129,7 @@ discriminator_scheduler_params:
###########################################################
discriminator_train_start_steps
:
200000
# Number of steps to start to train discriminator.
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
50000
# Interval steps to save checkpoint.
save_interval_steps
:
5000
# Interval steps to save checkpoint.
eval_interval_steps
:
1000
# Interval steps to evaluate the network.
###########################################################
...
...
examples/csmsc/voc3/conf/use_tanh.yaml
已删除
100644 → 0
浏览文件 @
9125d71a
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply to the other dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
# This configuration is based on full-band MelGAN, but the hop size and sampling
# rate are different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper, so currently we train 1M iterations (not sure this is
# enough to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs
:
24000
# Sampling rate.
n_fft
:
2048
# FFT size. (in samples)
n_shift
:
300
# Hop size. (in samples)
win_length
:
1200
# Window length. (in samples)
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
n_mels
:
80
# Number of mel basis.
fmin
:
80
# Minimum freq in mel basis calculation. (Hz)
fmax
:
7600
# Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params
:
in_channels
:
80
# Number of input channels.
out_channels
:
4
# Number of output channels.
kernel_size
:
7
# Kernel size of initial and final conv layers.
channels
:
384
# Initial number of channels for conv layers.
upsample_scales
:
[
5
,
5
,
3
]
# List of Upsampling scales.
stack_kernel_size
:
3
# Kernel size of dilated conv layers in residual stack.
stacks
:
4
# Number of stacks in a single residual stack module.
use_weight_norm
:
True
# Whether to use weight normalization.
use_causal_conv
:
False
# Whether to use causal convolution.
use_final_nonlinear_activation
:
True
# If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (e.g. 30)
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params
:
in_channels
:
1
# Number of input channels.
out_channels
:
1
# Number of output channels.
scales
:
3
# Number of multi-scales.
downsample_pooling
:
"
AvgPool1D"
# Pooling type for the input downsampling.
downsample_pooling_params
:
# Parameters of the above pooling function.
kernel_size
:
4
stride
:
2
padding
:
1
exclusive
:
True
kernel_sizes
:
[
5
,
3
]
# List of kernel size.
channels
:
16
# Number of channels of the initial conv layer.
max_downsample_channels
:
512
# Maximum number of channels of downsampling layers.
downsample_scales
:
[
4
,
4
,
4
]
# List of downsampling scales.
nonlinear_activation
:
"
LeakyReLU"
# Nonlinear activation function.
nonlinear_activation_params
:
# Parameters of nonlinear activation function.
negative_slope
:
0.2
use_weight_norm
:
True
# Whether to use weight norm.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss
:
true
stft_loss_params
:
fft_sizes
:
[
1024
,
2048
,
512
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
120
,
240
,
50
]
# List of hop size for STFT-based loss
win_lengths
:
[
600
,
1200
,
240
]
# List of window length for STFT-based loss.
window
:
"
hann"
# Window function for STFT-based loss
use_subband_stft_loss
:
true
subband_stft_loss_params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
30
,
60
,
10
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann"
# Window function for STFT-based loss
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss
:
false
# Whether to use feature matching loss.
lambda_adv
:
2.5
# Loss balancing coefficient for adversarial loss.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
64
# Batch size.
batch_max_steps
:
16200
# Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers
:
2
# Number of workers in DataLoader.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params
:
epsilon
:
1.0e-7
# Generator's epsilon.
weight_decay
:
0.0
# Generator's weight decay coefficient.
generator_grad_norm
:
-1
# Generator's gradient norm.
generator_scheduler_params
:
learning_rate
:
1.0e-3
# Generator's learning rate.
gamma
:
0.5
# Generator's scheduler gamma.
milestones
:
# At each milestone, lr will be multiplied by gamma.
-
100000
-
200000
-
300000
-
400000
-
500000
-
600000
discriminator_optimizer_params
:
epsilon
:
1.0e-7
# Discriminator's epsilon.
weight_decay
:
0.0
# Discriminator's weight decay coefficient.
discriminator_grad_norm
:
-1
# Discriminator's gradient norm.
discriminator_scheduler_params
:
learning_rate
:
1.0e-3
# Discriminator's learning rate.
gamma
:
0.5
# Discriminator's scheduler gamma.
milestones
:
# At each milestone, lr will be multiplied by gamma.
-
100000
-
200000
-
300000
-
400000
-
500000
-
600000
###########################################################
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps
:
200000
# Number of steps to start to train discriminator.
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
50000
# Interval steps to save checkpoint.
eval_interval_steps
:
1000
# Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots
:
10
# max number of snapshots to keep while training
seed
:
42
# random seed for paddle, random, and np.random
\ No newline at end of file
parakeet/models/melgan/melgan.py
浏览文件 @
d3d9f835
...
...
@@ -22,6 +22,7 @@ from paddle import nn
from
parakeet.modules.causal_conv
import
CausalConv1D
from
parakeet.modules.causal_conv
import
CausalConv1DTranspose
from
parakeet.modules.nets_utils
import
initialize
from
parakeet.modules.pqmf
import
PQMF
from
parakeet.modules.residual_stack
import
ResidualStack
...
...
@@ -45,7 +46,8 @@ class MelGANGenerator(nn.Layer):
pad_params
:
Dict
[
str
,
Any
]
=
{
"mode"
:
"reflect"
},
use_final_nonlinear_activation
:
bool
=
True
,
use_weight_norm
:
bool
=
True
,
use_causal_conv
:
bool
=
False
,
):
use_causal_conv
:
bool
=
False
,
init_type
:
str
=
"xavier_uniform"
,
):
"""Initialize MelGANGenerator module.
Parameters
----------
...
...
@@ -91,7 +93,10 @@ class MelGANGenerator(nn.Layer):
if
not
use_causal_conv
:
assert
(
kernel_size
-
1
)
%
2
==
0
,
"Not support even number kernel size."
# add initial layer
# initialize parameters
initialize
(
self
,
init_type
)
layers
=
[]
if
not
use_causal_conv
:
layers
+=
[
...
...
@@ -178,6 +183,7 @@ class MelGANGenerator(nn.Layer):
# define the model as a single function
self
.
melgan
=
nn
.
Sequential
(
*
layers
)
nn
.
initializer
.
set_global_initializer
(
None
)
# apply weight norm
if
use_weight_norm
:
...
...
@@ -322,6 +328,7 @@ class MelGANDiscriminator(nn.Layer):
assert
len
(
kernel_sizes
)
==
2
assert
kernel_sizes
[
0
]
%
2
==
1
assert
kernel_sizes
[
1
]
%
2
==
1
# add first layer
self
.
layers
.
append
(
nn
.
Sequential
(
...
...
@@ -417,7 +424,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
nonlinear_activation_params
:
Dict
[
str
,
Any
]
=
{
"negative_slope"
:
0.2
},
pad
:
str
=
"Pad1D"
,
pad_params
:
Dict
[
str
,
Any
]
=
{
"mode"
:
"reflect"
},
use_weight_norm
:
bool
=
True
,
):
use_weight_norm
:
bool
=
True
,
init_type
:
str
=
"xavier_uniform"
,
):
"""Initialize MelGAN multi-scale discriminator module.
Parameters
----------
...
...
@@ -454,6 +462,9 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
Whether to use causal convolution.
"""
super
().
__init__
()
# initialize parameters
initialize
(
self
,
init_type
)
self
.
discriminators
=
nn
.
LayerList
()
# add discriminators
...
...
@@ -474,6 +485,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
self
.
pooling
=
getattr
(
nn
,
downsample_pooling
)(
**
downsample_pooling_params
)
nn
.
initializer
.
set_global_initializer
(
None
)
# apply weight norm
if
use_weight_norm
:
self
.
apply_weight_norm
()
...
...
parakeet/modules/residual_stack.py
浏览文件 @
d3d9f835
...
...
@@ -106,7 +106,4 @@ class ResidualStack(nn.Layer):
Tensor
Output tensor (B, channels, T).
"""
stack_output
=
self
.
stack
(
c
)
skip_layer_output
=
self
.
skip_layer
(
c
)
out
=
stack_output
+
skip_layer_output
return
out
return
self
.
stack
(
c
)
+
self
.
skip_layer
(
c
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录