PaddlePaddle / Parakeet

Commit ac437a08
Authored Feb 24, 2020 by liuyibing01
Parent: 25883dcd

Remove unused modules.py

Showing 1 changed file with 0 additions and 612 deletions (+0 −612)

parakeet/modules/modules.py (deleted, file mode 100644 → 0)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
import numpy as np

from . import conv
from . import weight_norm
def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       relu=False,
       dropout=0.0,
       epsilon=1e-30,
       act=None,
       is_test=False,
       dtype="float32"):
    """
    A special Linear Layer, when it is used with dropout, the weight is
    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
    """

    # stds
    if isinstance(in_features, int):
        in_features = [in_features]

    stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
    if relu:
        stds = [std * np.sqrt(2.0) for std in stds]

    weight_inits = [
        fluid.initializer.NormalInitializer(scale=std) for std in stds
    ]
    bias_init = fluid.initializer.ConstantInitializer(0.0)

    # param attrs
    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
    bias_attr = fluid.ParamAttr(initializer=bias_init)

    layer = weight_norm.FC(name_scope,
                           size,
                           num_flatten_dims=num_flatten_dims,
                           param_attr=weight_attrs,
                           bias_attr=bias_attr,
                           act=act,
                           dtype=dtype)
    return layer
def Conv1D(name_scope,
           in_channels,
           num_filters,
           filter_size=3,
           dilation=1,
           groups=None,
           causal=False,
           std_mul=1.0,
           dropout=0.0,
           use_cudnn=True,
           act=None,
           dtype="float32"):
    """
    A special Conv1D Layer, when it is used with dropout, the weight is
    initialized as
    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
    """
    # std
    std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
    bias_init = fluid.initializer.ConstantInitializer(0.0)

    # param attrs
    weight_attr = fluid.ParamAttr(initializer=weight_init)
    bias_attr = fluid.ParamAttr(initializer=bias_init)

    layer = conv.Conv1D(name_scope,
                        in_channels,
                        num_filters,
                        filter_size,
                        dilation,
                        groups=groups,
                        causal=causal,
                        param_attr=weight_attr,
                        bias_attr=bias_attr,
                        use_cudnn=use_cudnn,
                        act=act,
                        dtype=dtype)
    return layer
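
# --- Illustration (not part of the original modules.py) ---------------------
# A minimal numpy sketch of the fan-in / dropout-compensating initialization
# used by FC and Conv1D above: with W ~ N(0, std^2) and
# std = sqrt(std_mul * (1 - dropout) / fan_in), the variance of y = W @ x for
# unit-variance inputs stays close to std_mul * (1 - dropout).
import numpy as np

fan_in, std_mul, dropout = 256, 1.0, 0.1
std = np.sqrt(std_mul * (1 - dropout) / fan_in)
W = np.random.randn(1024, fan_in) * std    # weight drawn as in the helpers
x = np.random.randn(fan_in, 10000)         # unit-variance input columns
y = W @ x
print(std, y.std())                        # y.std() is close to sqrt(0.9)
# -----------------------------------------------------------------------------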
def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    # param attrs
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    layer = dg.Embedding(name_scope, (num_embeddings, embed_dim),
                         padding_idx=padding_idx,
                         param_attr=weight_attr,
                         dtype=dtype)
    return layer
class Conv1DGLU(dg.Layer):
    """
    A Convolution 1D block with GLU activation. It also applies dropout to the
    input x. It fuses speaker embeddings through a FC activated by softsign.
    It has a residual connection from the input x, and scales the output by
    np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)

        # conv spec
        self.in_channels = in_channels
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        if residual:
            assert (in_channels == num_filters), (
                "this block uses a residual connection, "
                "so in_channels should equal num_filters")

        self.conv = Conv1D(self.full_name(),
                           in_channels,
                           2 * num_filters,
                           filter_size,
                           dilation,
                           causal=causal,
                           std_mul=std_mul,
                           dropout=dropout,
                           dtype=dtype)

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            self.fc = Conv1D(self.full_name(),
                             speaker_dim,
                             num_filters,
                             filter_size=1,
                             dilation=1,
                             causal=False,
                             act="softsign",
                             dtype=dtype)

    def forward(self, x, speaker_embed_bc1t=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input channels
                and T means input time steps.
            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
                speaker embed, where C_sp means speaker embedding size. Note
                that when using residual connection, the Conv1DGLU does not
                change the number of channels, so out channels equals input
                channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        residual = x
        x = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        if speaker_embed_bc1t is not None:
            sp = self.fc(speaker_embed_bc1t)
            content = content + sp

        # glu
        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)

        if self.residual:
            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
        return x

    def add_input(self, x, speaker_embed_bc11=None):
        """
        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)

        Outputs:
            out: shape(B, num_filters, 1, time_steps), where time_steps = 1
        """
        residual = x

        # add step input and produce step output
        x = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = self.conv.add_input(x)
        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        if speaker_embed_bc11 is not None:
            sp = self.fc(speaker_embed_bc11)
            content = content + sp

        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)

        if self.residual:
            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
        return x
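
# --- Illustration (not part of the original modules.py) ---------------------
# A minimal numpy sketch of the gating used by Conv1DGLU above: the convolution
# produces 2 * num_filters channels, which are split into `content` and `gate`;
# the output is sigmoid(gate) * content, and with a residual connection the sum
# is rescaled by sqrt(0.5) to keep its variance close to the input's.
import numpy as np

def glu_block(conv_out, residual):
    # conv_out: (B, 2 * C, 1, T) as produced by self.conv; residual: (B, C, 1, T)
    content, gate = np.split(conv_out, 2, axis=1)
    x = 1.0 / (1.0 + np.exp(-gate)) * content   # sigmoid(gate) * content
    return (x + residual) * np.sqrt(0.5)        # residual add + variance rescale

B, C, T = 2, 4, 8
out = glu_block(np.random.randn(B, 2 * C, 1, T), np.random.randn(B, C, 1, T))
print(out.shape)                                # (2, 4, 1, 8)
# -----------------------------------------------------------------------------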
def Conv1DTranspose(name_scope,
                    in_channels,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    std_mul=1.0,
                    dropout=0.0,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
    weight_init = fluid.initializer.NormalInitializer(scale=std)
    weight_attr = fluid.ParamAttr(initializer=weight_init)
    bias_init = fluid.initializer.ConstantInitializer(0.0)
    bias_attr = fluid.ParamAttr(initializer=bias_init)

    layer = conv.Conv1DTranspose(name_scope,
                                 in_channels,
                                 num_filters,
                                 filter_size,
                                 padding=padding,
                                 stride=stride,
                                 dilation=dilation,
                                 groups=groups,
                                 param_attr=weight_attr,
                                 bias_attr=bias_attr,
                                 use_cudnn=use_cudnn,
                                 act=act,
                                 dtype=dtype)
    return layer
def compute_position_embedding(rad):
    # rad is a transposed radius, shape(embed_dim, n_vocab)
    embed_dim, n_vocab = rad.shape

    even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
    odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))

    even_rads = fluid.layers.gather(rad, even_dims)
    odd_rads = fluid.layers.gather(rad, odd_dims)

    sines = fluid.layers.sin(even_rads)
    cosines = fluid.layers.cos(odd_rads)

    temp = fluid.layers.scatter(rad, even_dims, sines)
    out = fluid.layers.scatter(temp, odd_dims, cosines)
    out = fluid.layers.transpose(out, perm=[1, 0])
    return out
def position_encoding_init(n_position,
                           d_pos_vec,
                           position_rate=1.0,
                           sinusoidal=True):
    """ Init the sinusoid position encoding table """

    # keep idx 0 for padding token position encoding zero vector
    position_enc = np.array([[
        position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
        for i in range(d_pos_vec)
    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])

    if sinusoidal:
        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1

    return position_enc
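
# --- Illustration (not part of the original modules.py) ---------------------
# A minimal numpy check of the sinusoid table built by position_encoding_init
# above: entry (pos, i) holds position_rate * pos / 10000**(2 * (i // 2) / d),
# so with sinusoidal=True even columns become sin(.) and odd columns cos(.).
# With sinusoidal=False the raw "radians" are returned, which is what
# PositionEmbedding below stores and later rescales by a per-speaker rate
# before compute_position_embedding applies sin / cos.
import numpy as np

n_position, d = 6, 8
rad = position_encoding_init(n_position, d, position_rate=1.0, sinusoidal=False)
enc = position_encoding_init(n_position, d, position_rate=1.0, sinusoidal=True)
assert np.allclose(enc[1:, 0::2], np.sin(rad[1:, 0::2]))  # even dims -> sin
assert np.allclose(enc[1:, 1::2], np.cos(rad[1:, 1::2]))  # odd dims  -> cos
assert np.allclose(rad[0], 0.0)                           # row 0 kept for padding
# -----------------------------------------------------------------------------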
class PositionEmbedding(dg.Layer):
    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
        self.embed = dg.Embedding(self.full_name(),
                                  size=(n_position, d_pos_vec),
                                  is_sparse=is_sparse,
                                  is_distributed=is_distributed,
                                  padding_idx=None,
                                  param_attr=param_attr,
                                  dtype=dtype)
        self.set_weight(
            position_encoding_init(n_position,
                                   d_pos_vec,
                                   position_rate=position_rate,
                                   sinusoidal=False).astype(dtype))

        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        self._padding_idx = (-1 if padding_idx is None else padding_idx if
                             padding_idx >= 0 else (n_position + padding_idx))
        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype

    def set_weight(self, array):
        assert self.embed._w.shape == list(array.shape), "shape does not match"
        self.embed._w._ivar.value().get_tensor().set(
            array, fluid.framework._current_expected_place())

    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): Shape (B, T, 1), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional): position
                rate. It can be a floating point number or a Variable with
                shape (1,), then this speaker_position_rate is used for every
                example. It can also be a Variable with shape (B, 1), which
                contains a speaker position rate for each speaker.

        Returns:
            out (Variable): Shape(B, C_pos), position embedding, where C_pos
                means position embedding size.
        """
        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
        batch_size = indices.shape[0]

        if speaker_position_rate is None:
            weight = compute_position_embedding(rad)
            out = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="lookup_table",
                inputs={"Ids": indices,
                        "W": weight},
                outputs={"Out": out},
                attrs={
                    "is_sparse": self._is_sparse,
                    "is_distributed": self._is_distributed,
                    "remote_prefetch": self._remote_prefetch,
                    "padding_idx": self._padding_idx,
                    # special value for lookup table op
                })
            return out

        elif (np.isscalar(speaker_position_rate) or
              isinstance(speaker_position_rate, fluid.framework.Variable) and
              speaker_position_rate.shape == [1, 1]):
            # make a weight
            # scale the weight (the operand for sin & cos)
            if np.isscalar(speaker_position_rate):
                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
            else:
                scaled_rad = fluid.layers.elementwise_mul(
                    rad, speaker_position_rate[0])
            weight = compute_position_embedding(scaled_rad)
            out = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="lookup_table",
                inputs={"Ids": indices,
                        "W": weight},
                outputs={"Out": out},
                attrs={
                    "is_sparse": self._is_sparse,
                    "is_distributed": self._is_distributed,
                    "remote_prefetch": self._remote_prefetch,
                    "padding_idx": self._padding_idx,
                    # special value for lookup table op
                })
            return out

        elif np.prod(speaker_position_rate.shape) > 1:
            assert speaker_position_rate.shape == [batch_size, 1]
            outputs = []
            for i in range(batch_size):
                rate = speaker_position_rate[i]  # rate has shape [1]
                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
                weight = compute_position_embedding(scaled_rad)
                out = self._helper.create_variable_for_type_inference(
                    self._dtype)
                sequence = indices[i]
                self._helper.append_op(
                    type="lookup_table",
                    inputs={"Ids": sequence,
                            "W": weight},
                    outputs={"Out": out},
                    attrs={
                        "is_sparse": self._is_sparse,
                        "is_distributed": self._is_distributed,
                        "remote_prefetch": self._remote_prefetch,
                        "padding_idx": -1,
                    })
                outputs.append(out)
            out = fluid.layers.stack(outputs)
            return out
        else:
            raise Exception("Then you can just use position rate at init")
class Conv1D_GU(dg.Layer):
    def __init__(self,
                 name_scope,
                 conditioner_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)

        self.conditioner_dim = conditioner_dim
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        if residual:
            assert (in_channels == num_filters), (
                "this block uses a residual connection, "
                "so in_channels should equal num_filters")

        self.conv = Conv1D(self.full_name(),
                           in_channels,
                           2 * num_filters,
                           filter_size,
                           dilation,
                           causal=causal,
                           dtype=dtype)

        self.fc = Conv1D(self.full_name(),
                         conditioner_dim,
                         2 * num_filters,
                         filter_size=1,
                         dilation=1,
                         causal=False,
                         dtype=dtype)

    def forward(self, x, skip=None, conditioner=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
                layer, where B means batch_size, C_in means the input channels
                and T means input time steps.
            skip (Variable): Shape(B, C_in, 1, T), skip connection.
            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
                conditioner, where C_con is conditioner hidden dim which
                equals the num of mel bands. Note that when using residual
                connection, the Conv1D_GU does not change the number of
                channels, so out channels equals input channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
                C_out means the output channels of Conv1D_GU.
            skip (Variable): Shape(B, C_out, 1, T), skip connection.
        """
        residual = x
        x = self.conv(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip

    def add_input(self, x, skip=None, conditioner=None):
        """
        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            skip: shape(B, num_filters, 1, time_steps), skip connection
            conditioner: shape(B, conditioner_dim, 1, time_steps)

        Outputs:
            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
            skip: skip connection, same shape as x
        """
        residual = x

        # add step input and produce step output
        x = self.conv.add_input(x)

        if conditioner is not None:
            cond_bias = self.fc(conditioner)
            x += cond_bias

        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)

        # Gated Unit.
        x = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), fluid.layers.tanh(content))

        if skip is None:
            skip = x
        else:
            skip = fluid.layers.scale(skip + x, np.sqrt(0.5))

        if self.residual:
            x = fluid.layers.scale(residual + x, np.sqrt(0.5))

        return x, skip
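
# --- Illustration (not part of the original modules.py) ---------------------
# A minimal numpy sketch of the WaveNet-style gated unit in Conv1D_GU above:
# the output is sigmoid(gate) * tanh(content), the skip path accumulates the
# outputs of successive blocks, and every sum of two same-variance terms is
# rescaled by sqrt(0.5) so neither the residual nor the skip path blows up.
import numpy as np

def gated_unit(conv_out, residual, skip):
    content, gate = np.split(conv_out, 2, axis=1)
    x = 1.0 / (1.0 + np.exp(-gate)) * np.tanh(content)  # sigmoid(gate) * tanh(content)
    skip = x if skip is None else (skip + x) * np.sqrt(0.5)
    x = (residual + x) * np.sqrt(0.5)
    return x, skip

B, C, T = 2, 4, 8
x, skip = np.random.randn(B, C, 1, T), None
for _ in range(3):                                       # stack a few blocks
    x, skip = gated_unit(np.random.randn(B, 2 * C, 1, T), x, skip)
print(x.shape, skip.shape)                               # (2, 4, 1, 8) each
# -----------------------------------------------------------------------------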
def Conv2DTranspose(name_scope,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    val = 1.0 / (filter_size[0] * filter_size[1])
    weight_init = fluid.initializer.ConstantInitializer(val)
    weight_attr = fluid.ParamAttr(initializer=weight_init)

    layer = weight_norm.Conv2DTranspose(name_scope,
                                        num_filters,
                                        filter_size=filter_size,
                                        padding=padding,
                                        stride=stride,
                                        dilation=dilation,
                                        param_attr=weight_attr,
                                        use_cudnn=use_cudnn,
                                        act=act,
                                        dtype=dtype)
    return layer