Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
bd0d69ca
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
bd0d69ca
编写于
4月 18, 2023
作者:
小湉湉
提交者:
GitHub
4月 18, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[TTS]add StarGANv2VC preprocess (#3163)
上级
c7d24ba4
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
485 addition
and
23 deletion
+485
-23
examples/vctk/vc3/conf/default.yaml
examples/vctk/vc3/conf/default.yaml
+15
-4
examples/vctk/vc3/local/preprocess.sh
examples/vctk/vc3/local/preprocess.sh
+23
-4
paddlespeech/t2s/datasets/am_batch_fn.py
paddlespeech/t2s/datasets/am_batch_fn.py
+66
-12
paddlespeech/t2s/datasets/data_table.py
paddlespeech/t2s/datasets/data_table.py
+53
-0
paddlespeech/t2s/exps/starganv2_vc/normalize.py
paddlespeech/t2s/exps/starganv2_vc/normalize.py
+101
-0
paddlespeech/t2s/exps/starganv2_vc/preprocess.py
paddlespeech/t2s/exps/starganv2_vc/preprocess.py
+214
-0
paddlespeech/t2s/exps/starganv2_vc/vc.py
paddlespeech/t2s/exps/starganv2_vc/vc.py
+12
-1
paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
+1
-2
未找到文件。
examples/vctk/vc3/conf/default.yaml
浏览文件 @
bd0d69ca
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
#
其实没用上,其实用的是 16000
sr
:
24
000
#
源码 load 的时候用的 24k, 提取 mel 用的 16k, 后续 load 和提取 mel 都要改成 24k
fs
:
16
000
n_fft
:
2048
win_length
:
1200
hop_length
:
300
n_shift
:
300
win_length
:
1200
# Window length.(in samples) 50ms
# If set to null, it will be the same as fft_size.
window
:
"
hann"
# Window function.
fmin
:
0
# Minimum frequency of Mel basis.
fmax
:
8000
# Maximum frequency of Mel basis. sr // 2
n_mels
:
80
# only for StarGANv2 VC
norm
:
# None here
htk
:
True
power
:
2.0
###########################################################
# MODEL SETTING #
###########################################################
...
...
examples/vctk/vc3/local/preprocess.sh
浏览文件 @
bd0d69ca
...
...
@@ -6,13 +6,32 @@ stop_stage=100
config_path
=
$1
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# extract features
echo
"Extract features ..."
python3
${
BIN_DIR
}
/preprocess.py
\
--dataset
=
vctk
\
--rootdir
=
~/datasets/VCTK-Corpus-0.92/
\
--dumpdir
=
dump
\
--config
=
${
config_path
}
\
--num-cpu
=
20
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
fi
if
[
${
stage
}
-le
2
]
&&
[
${
stop_stage
}
-ge
2
]
;
then
echo
"Normalize ..."
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/train/raw/metadata.jsonl
\
--dumpdir
=
dump/train/norm
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/dev/raw/metadata.jsonl
\
--dumpdir
=
dump/dev/norm
\
--speaker-dict
=
dump/speaker_id_map.txt
python3
${
BIN_DIR
}
/normalize.py
\
--metadata
=
dump/test/raw/metadata.jsonl
\
--dumpdir
=
dump/test/norm
\
--speaker-dict
=
dump/speaker_id_map.txt
fi
paddlespeech/t2s/datasets/am_batch_fn.py
浏览文件 @
bd0d69ca
...
...
@@ -669,18 +669,72 @@ def vits_multi_spk_batch_fn(examples):
return
batch
# 未完成
def
starganv2_vc_batch_fn
(
examples
):
batch
=
{
"x_real"
:
None
,
"y_org"
:
None
,
"x_ref"
:
None
,
"x_ref2"
:
None
,
"y_trg"
:
None
,
"z_trg"
:
None
,
"z_trg2"
:
None
,
}
return
batch
# 因为要传参数,所以需要额外构建
def
build_starganv2_vc_collate_fn
(
latent_dim
:
int
=
16
,
max_mel_length
:
int
=
192
):
return
StarGANv2VCCollateFn
(
latent_dim
=
latent_dim
,
max_mel_length
=
max_mel_length
)
class
StarGANv2VCCollateFn
:
"""Functor class of common_collate_fn()"""
def
__init__
(
self
,
latent_dim
:
int
=
16
,
max_mel_length
:
int
=
192
):
self
.
latent_dim
=
latent_dim
self
.
max_mel_length
=
max_mel_length
def
random_clip
(
self
,
mel
:
np
.
array
):
# [80, T]
mel_length
=
mel
.
shape
[
1
]
if
mel_length
>
self
.
max_mel_length
:
random_start
=
np
.
random
.
randint
(
0
,
mel_length
-
self
.
max_mel_length
)
mel
=
mel
[:,
random_start
:
random_start
+
self
.
max_mel_length
]
return
mel
def
__call__
(
self
,
exmaples
):
return
self
.
starganv2_vc_batch_fn
(
exmaples
)
def
starganv2_vc_batch_fn
(
self
,
examples
):
batch_size
=
len
(
examples
)
label
=
[
np
.
array
(
item
[
"label"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
ref_label
=
[
np
.
array
(
item
[
"ref_label"
],
dtype
=
np
.
int64
)
for
item
in
examples
]
# 需要对 mel 进行裁剪
mel
=
[
self
.
random_clip
(
item
[
"mel"
])
for
item
in
examples
]
ref_mel
=
[
self
.
random_clip
(
item
[
"ref_mel"
])
for
item
in
examples
]
ref_mel_2
=
[
self
.
random_clip
(
item
[
"ref_mel_2"
])
for
item
in
examples
]
mel
=
batch_sequences
(
mel
)
ref_mel
=
batch_sequences
(
ref_mel
)
ref_mel_2
=
batch_sequences
(
ref_mel_2
)
# convert each batch to paddle.Tensor
# (B,)
label
=
paddle
.
to_tensor
(
label
)
ref_label
=
paddle
.
to_tensor
(
ref_label
)
# [B, 80, T] -> [B, 1, 80, T]
mel
=
paddle
.
to_tensor
(
mel
)
ref_mel
=
paddle
.
to_tensor
(
ref_mel
)
ref_mel_2
=
paddle
.
to_tensor
(
ref_mel_2
)
z_trg
=
paddle
.
randn
(
batch_size
,
self
.
latent_dim
)
z_trg2
=
paddle
.
randn
(
batch_size
,
self
.
latent_dim
)
batch
=
{
"x_real"
:
mels
,
"y_org"
:
labels
,
"x_ref"
:
ref_mels
,
"x_ref2"
:
ref_mels_2
,
"y_trg"
:
ref_labels
,
"z_trg"
:
z_trg
,
"z_trg2"
:
z_trg2
}
return
batch
# for PaddleSlim
...
...
paddlespeech/t2s/datasets/data_table.py
浏览文件 @
bd0d69ca
...
...
@@ -11,12 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
random
from
multiprocessing
import
Manager
from
typing
import
Any
from
typing
import
Callable
from
typing
import
Dict
from
typing
import
List
import
numpy
as
np
from
paddle.io
import
Dataset
...
...
@@ -131,3 +133,54 @@ class DataTable(Dataset):
The length of the dataset
"""
return
len
(
self
.
data
)
class
StarGANv2VCDataTable
(
DataTable
):
def
__init__
(
self
,
data
:
List
[
Dict
[
str
,
Any
]]):
super
().
__init__
(
data
)
raw_data
=
data
spk_id_set
=
list
(
set
([
item
[
'spk_id'
]
for
item
in
raw_data
]))
data_list_per_class
=
{}
for
spk_id
in
spk_id_set
:
data_list_per_class
[
spk_id
]
=
[]
for
item
in
raw_data
:
for
spk_id
in
spk_id_set
:
if
item
[
'spk_id'
]
==
spk_id
:
data_list_per_class
[
spk_id
].
append
(
item
)
self
.
data_list_per_class
=
data_list_per_class
def
__getitem__
(
self
,
idx
:
int
)
->
Dict
[
str
,
Any
]:
"""Get an example given an index.
Args:
idx (int): Index of the example to get
Returns:
Dict[str, Any]: A converted example
"""
if
self
.
use_cache
and
self
.
caches
[
idx
]
is
not
None
:
return
self
.
caches
[
idx
]
data
=
self
.
_get_metadata
(
idx
)
# 裁剪放到 batch_fn 里面
# 返回一个字典
"""
{'utt_id': 'p225_111', 'spk_id': '1', 'speech': 'path of *.npy'}
"""
ref_data
=
random
.
choice
(
self
.
data
)
ref_label
=
ref_data
[
'spk_id'
]
ref_data_2
=
random
.
choice
(
self
.
data_list_per_class
[
ref_label
])
# mel_tensor, label, ref_mel_tensor, ref2_mel_tensor, ref_label
new_example
=
{
'utt_id'
:
data
[
'utt_id'
],
'mel'
:
np
.
load
(
data
[
'speech'
]),
'label'
:
int
(
data
[
'spk_id'
]),
'ref_mel'
:
np
.
load
(
ref_data
[
'speech'
]),
'ref_mel_2'
:
np
.
load
(
ref_data_2
[
'speech'
]),
'ref_label'
:
int
(
ref_label
)
}
if
self
.
use_cache
:
self
.
caches
[
idx
]
=
new_example
return
new_example
paddlespeech/t2s/exps/starganv2_vc/normalize.py
0 → 100644
浏览文件 @
bd0d69ca
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import
argparse
import
logging
from
operator
import
itemgetter
from
pathlib
import
Path
import
jsonlines
import
numpy
as
np
import
tqdm
from
paddlespeech.t2s.datasets.data_table
import
DataTable
def
main
():
"""Run preprocessing process."""
parser
=
argparse
.
ArgumentParser
(
description
=
"Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
)
parser
.
add_argument
(
"--metadata"
,
type
=
str
,
required
=
True
,
help
=
"directory including feature files to be normalized. "
"you need to specify either *-scp or rootdir."
)
parser
.
add_argument
(
"--dumpdir"
,
type
=
str
,
required
=
True
,
help
=
"directory to dump normalized feature files."
)
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
args
=
parser
.
parse_args
()
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
dumpdir
=
dumpdir
.
resolve
()
dumpdir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# get dataset
with
jsonlines
.
open
(
args
.
metadata
,
'r'
)
as
reader
:
metadata
=
list
(
reader
)
dataset
=
DataTable
(
metadata
,
converters
=
{
"speech"
:
np
.
load
,
})
logging
.
info
(
f
"The number of files =
{
len
(
dataset
)
}
."
)
vocab_speaker
=
{}
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
for
spk
,
id
in
spk_id
:
vocab_speaker
[
spk
]
=
int
(
id
)
# process each file
output_metadata
=
[]
for
item
in
tqdm
.
tqdm
(
dataset
):
utt_id
=
item
[
'utt_id'
]
speech
=
item
[
'speech'
]
# normalize
# 这里暂时写死
mean
,
std
=
-
4
,
4
speech
=
(
speech
-
mean
)
/
std
speech_path
=
dumpdir
/
f
"
{
utt_id
}
_speech.npy"
np
.
save
(
speech_path
,
speech
.
astype
(
np
.
float32
),
allow_pickle
=
False
)
spk_id
=
vocab_speaker
[
item
[
"speaker"
]]
record
=
{
"utt_id"
:
item
[
'utt_id'
],
"spk_id"
:
spk_id
,
"speech"
:
str
(
speech_path
),
}
output_metadata
.
append
(
record
)
output_metadata
.
sort
(
key
=
itemgetter
(
'utt_id'
))
output_metadata_path
=
Path
(
args
.
dumpdir
)
/
"metadata.jsonl"
with
jsonlines
.
open
(
output_metadata_path
,
'w'
)
as
writer
:
for
item
in
output_metadata
:
writer
.
write
(
item
)
logging
.
info
(
f
"metadata dumped into
{
output_metadata_path
}
"
)
if
__name__
==
"__main__"
:
main
()
paddlespeech/t2s/exps/starganv2_vc/preprocess.py
0 → 100644
浏览文件 @
bd0d69ca
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
from
concurrent.futures
import
ThreadPoolExecutor
from
operator
import
itemgetter
from
pathlib
import
Path
from
typing
import
Any
from
typing
import
Dict
from
typing
import
List
import
jsonlines
import
librosa
import
numpy
as
np
import
tqdm
import
yaml
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.datasets.get_feats
import
LogMelFBank
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_spk_id_map
speaker_set
=
set
()
def
process_sentence
(
config
:
Dict
[
str
,
Any
],
fp
:
Path
,
output_dir
:
Path
,
mel_extractor
=
None
):
utt_id
=
fp
.
stem
# for vctk
if
utt_id
.
endswith
(
"_mic2"
):
utt_id
=
utt_id
[:
-
5
]
speaker
=
utt_id
.
split
(
'_'
)[
0
]
speaker_set
.
add
(
speaker
)
# 需要额外获取 speaker
record
=
None
# reading, resampling may occur
# 源码的 bug, 读取的时候按照 24000 读取,但是提取 mel 的时候按照 16000 提取
# 具体参考 https://github.com/PaddlePaddle/PaddleSpeech/blob/c7d24ba42c377fe4c0765c6b1faa202a9aeb136f/paddlespeech/t2s/exps/starganv2_vc/vc.py#L165
# 之后需要换成按照 24000 读取和按照 24000 提取 mel
wav
,
_
=
librosa
.
load
(
str
(
fp
),
sr
=
24000
)
max_value
=
np
.
abs
(
wav
).
max
()
if
max_value
>
1.0
:
wav
=
wav
/
max_value
assert
len
(
wav
.
shape
)
==
1
,
f
"
{
utt_id
}
is not a mono-channel audio."
assert
np
.
abs
(
wav
).
max
()
<=
1.0
,
f
"
{
utt_id
}
is seems to be different that 16 bit PCM."
# extract mel feats
# 注意这里 base = 'e', 后续需要换成 base='10', 我们其他 TTS 模型都是 base='10'
logmel
=
mel_extractor
.
get_log_mel_fbank
(
wav
,
base
=
'e'
)
mel_path
=
output_dir
/
(
utt_id
+
"_speech.npy"
)
np
.
save
(
mel_path
,
logmel
)
record
=
{
"utt_id"
:
utt_id
,
"speech"
:
str
(
mel_path
),
"speaker"
:
speaker
}
return
record
def
process_sentences
(
config
,
fps
:
List
[
Path
],
output_dir
:
Path
,
mel_extractor
=
None
,
nprocs
:
int
=
1
,
):
if
nprocs
==
1
:
results
=
[]
for
fp
in
tqdm
.
tqdm
(
fps
,
total
=
len
(
fps
)):
record
=
process_sentence
(
config
=
config
,
fp
=
fp
,
output_dir
=
output_dir
,
mel_extractor
=
mel_extractor
)
if
record
:
results
.
append
(
record
)
else
:
with
ThreadPoolExecutor
(
nprocs
)
as
pool
:
futures
=
[]
with
tqdm
.
tqdm
(
total
=
len
(
fps
))
as
progress
:
for
fp
in
fps
:
future
=
pool
.
submit
(
process_sentence
,
config
,
fp
,
output_dir
,
mel_extractor
)
future
.
add_done_callback
(
lambda
p
:
progress
.
update
())
futures
.
append
(
future
)
results
=
[]
for
ft
in
futures
:
record
=
ft
.
result
()
if
record
:
results
.
append
(
record
)
results
.
sort
(
key
=
itemgetter
(
"utt_id"
))
with
jsonlines
.
open
(
output_dir
/
"metadata.jsonl"
,
'w'
)
as
writer
:
for
item
in
results
:
writer
.
write
(
item
)
print
(
"Done"
)
def
main
():
# parse config and args
parser
=
argparse
.
ArgumentParser
(
description
=
"Preprocess audio and then extract features."
)
parser
.
add_argument
(
"--dataset"
,
default
=
"vctk"
,
type
=
str
,
help
=
"name of dataset, should in {vctk} now"
)
parser
.
add_argument
(
"--rootdir"
,
default
=
None
,
type
=
str
,
help
=
"directory to dataset."
)
parser
.
add_argument
(
"--dumpdir"
,
type
=
str
,
required
=
True
,
help
=
"directory to dump feature files."
)
parser
.
add_argument
(
"--config"
,
type
=
str
,
help
=
"StarGANv2VC config file."
)
parser
.
add_argument
(
"--num-cpu"
,
type
=
int
,
default
=
1
,
help
=
"number of process."
)
args
=
parser
.
parse_args
()
rootdir
=
Path
(
args
.
rootdir
).
expanduser
()
dumpdir
=
Path
(
args
.
dumpdir
).
expanduser
()
# use absolute path
dumpdir
=
dumpdir
.
resolve
()
dumpdir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
assert
rootdir
.
is_dir
()
with
open
(
args
.
config
,
'rt'
)
as
f
:
config
=
CfgNode
(
yaml
.
safe_load
(
f
))
if
args
.
dataset
==
"vctk"
:
sub_num_dev
=
5
wav_dir
=
rootdir
/
"wav48_silence_trimmed"
train_wav_files
=
[]
dev_wav_files
=
[]
test_wav_files
=
[]
# only for test
for
speaker
in
os
.
listdir
(
wav_dir
):
wav_files
=
sorted
(
list
((
wav_dir
/
speaker
).
rglob
(
"*_mic2.flac"
)))
if
len
(
wav_files
)
>
100
:
train_wav_files
+=
wav_files
[:
-
sub_num_dev
*
2
]
dev_wav_files
+=
wav_files
[
-
sub_num_dev
*
2
:
-
sub_num_dev
]
test_wav_files
+=
wav_files
[
-
sub_num_dev
:]
else
:
train_wav_files
+=
wav_files
else
:
print
(
"dataset should in {vctk} now!"
)
train_dump_dir
=
dumpdir
/
"train"
/
"raw"
train_dump_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
dev_dump_dir
=
dumpdir
/
"dev"
/
"raw"
dev_dump_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
test_dump_dir
=
dumpdir
/
"test"
/
"raw"
test_dump_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
# Extractor
mel_extractor
=
LogMelFBank
(
sr
=
config
.
fs
,
n_fft
=
config
.
n_fft
,
hop_length
=
config
.
n_shift
,
win_length
=
config
.
win_length
,
window
=
config
.
window
,
n_mels
=
config
.
n_mels
,
fmin
=
config
.
fmin
,
fmax
=
config
.
fmax
,
# None here
norm
=
config
.
norm
,
htk
=
config
.
htk
,
power
=
config
.
power
)
# process for the 3 sections
if
train_wav_files
:
process_sentences
(
config
=
config
,
fps
=
train_wav_files
,
output_dir
=
train_dump_dir
,
mel_extractor
=
mel_extractor
,
nprocs
=
args
.
num_cpu
)
if
dev_wav_files
:
process_sentences
(
config
=
config
,
fps
=
dev_wav_files
,
output_dir
=
dev_dump_dir
,
mel_extractor
=
mel_extractor
,
nprocs
=
args
.
num_cpu
)
if
test_wav_files
:
process_sentences
(
config
=
config
,
fps
=
test_wav_files
,
output_dir
=
test_dump_dir
,
mel_extractor
=
mel_extractor
,
nprocs
=
args
.
num_cpu
)
speaker_id_map_path
=
dumpdir
/
"speaker_id_map.txt"
get_spk_id_map
(
speaker_set
,
speaker_id_map_path
)
if
__name__
==
"__main__"
:
main
()
paddlespeech/t2s/exps/starganv2_vc/vc.py
浏览文件 @
bd0d69ca
...
...
@@ -57,9 +57,10 @@ def get_mel_extractor():
def
preprocess
(
wave
,
mel_extractor
):
# (T, 80)
logmel
=
mel_extractor
.
get_log_mel_fbank
(
wave
,
base
=
'e'
)
# [1, 80, 1011]
mean
,
std
=
-
4
,
4
# [1, 80, T]
mel_tensor
=
(
paddle
.
to_tensor
(
logmel
.
T
).
unsqueeze
(
0
)
-
mean
)
/
std
return
mel_tensor
...
...
@@ -67,6 +68,7 @@ def preprocess(wave, mel_extractor):
def
compute_style
(
speaker_dicts
,
mel_extractor
,
style_encoder
,
mapping_network
):
reference_embeddings
=
{}
for
key
,
(
path
,
speaker
)
in
speaker_dicts
.
items
():
# path = ''
if
path
==
''
:
label
=
paddle
.
to_tensor
([
speaker
],
dtype
=
paddle
.
int64
)
latent_dim
=
mapping_network
.
shared
[
0
].
weight
.
shape
[
0
]
...
...
@@ -164,6 +166,15 @@ def voice_conversion(args, uncompress_path):
wave
,
sr
=
librosa
.
load
(
args
.
source_path
,
sr
=
24000
)
source
=
preprocess
(
wave
=
wave
,
mel_extractor
=
mel_extractor
)
# # 测试 preprocess.py 的输出是否 ok
# # 直接用 raw 然后 norm 的在这里 ok
# # 直接用 norm 在这里 ok
# import numpy as np
# source = np.load("~/PaddleSpeech_stargan_preprocess/PaddleSpeech/examples/vctk/vc3/dump/train/norm/p329_414_speech.npy")
# # !!!对 mel_extractor norm 后的操作
# # [1, 80, T]
# source = paddle.to_tensor(source.T).unsqueeze(0)
output_dir
=
Path
(
args
.
output_dir
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
orig_wav_name
=
str
(
output_dir
/
'orig_voc.wav'
)
...
...
paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
浏览文件 @
bd0d69ca
...
...
@@ -431,14 +431,13 @@ class MappingNetwork(nn.Layer):
"""Calculate forward propagation.
Args:
z(Tensor(float32)):
Shape (B,
1, n_mels, T
).
Shape (B,
latent_dim
).
y(Tensor(float32)):
speaker label. Shape (B, ).
Returns:
Tensor:
Shape (style_dim, )
"""
h
=
self
.
shared
(
z
)
out
=
[]
for
layer
in
self
.
unshared
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录