PaddlePaddle / DeepSpeech
Commit 64f0bad5, authored Apr 08, 2021 by Hui Zhang
refactor data, build vocab; add format data
Parent: 12c01f39
Showing 14 changed files with 370 additions and 71 deletions (+370, -71):
deepspeech/frontend/utility.py                             +62  -19
deepspeech/io/collator.py                                   +1   -1
deepspeech/models/u2.py                                     +1   -1
deepspeech/utils/tensor_utils.py                            +0   -2
examples/dataset/aishell/aishell.py                        +10   -6
examples/dataset/chime3_background/chime3_background.py     +8   -5
examples/dataset/librispeech/librispeech.py                 +7   -3
examples/dataset/mini_librispeech/mini_librispeech.py       +7   -3
examples/dataset/musan/musan.py                            +10   -6
examples/dataset/rir_noise/rir_noise.py                    +10   -6
examples/dataset/voxforge/voxforge.py                       +3   -2
examples/tiny/s0/local/data.sh                             +28   -5
utils/build_vocab.py                                       +96  -12
utils/format_data.py                                      +127   -0
deepspeech/frontend/utility.py

@@ -29,40 +29,79 @@ logger = logging.getLogger(__name__)
-__all__ = [
-    "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
-    "mean_dbfs", "gain_db_to_ratio", "normalize_audio"
-]
+__all__ = [
+    "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
+    "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK",
+    "BLANK"
+]
+
+IGNORE_ID = -1
+SOS = "<sos/eos>"
+EOS = SOS
+UNK = "<unk>"
+BLANK = "<blank>"
 
-def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
-    """Load and parse manifest file.
-
-    Instances with durations outside [min_duration, max_duration] will be
-    filtered out.
-
-    :param manifest_path: Manifest file to load and parse.
-    :type manifest_path: str
-    :param max_duration: Maximal duration in seconds for instance filter.
-    :type max_duration: float
-    :param min_duration: Minimal duration in seconds for instance filter.
-    :type min_duration: float
-    :return: Manifest parsing results. List of dict.
-    :rtype: list
-    :raises IOError: If failed to parse the manifest.
-    """
+# """Load and parse manifest file.
+# Instances with durations outside [min_duration, max_duration] will be
+# filtered out.
+# :param manifest_path: Manifest file to load and parse.
+# :type manifest_path: str
+# :param max_duration: maximum output seq length, in seconds for raw wav, in frame numbers for feature data.
+# :type max_duration: float
+# :param min_duration: minimum input seq length, in seconds for raw wav, in frame numbers for feature data.
+# :type min_duration: float
+# :return: Manifest parsing results. List of dict.
+# :rtype: list
+# :raises IOError: If failed to parse the manifest.
+# """
+def read_manifest(
+        manifest_path,
+        max_input_len=float('inf'),
+        min_input_len=0.0,
+        max_output_len=500.0,
+        min_output_len=0.0,
+        max_output_input_ratio=10.0,
+        min_output_input_ratio=0.05, ):
     manifest = []
     for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
         try:
             json_data = json.loads(json_line)
         except Exception as e:
             raise IOError("Error reading manifest: %s" % str(e))
-        if (json_data["duration"] <= max_duration and
-                json_data["duration"] >= min_duration):
+        feat_len = json_data["feat_shape"][0]
+        token_len = json_data["token_shape"][0]
+        conditions = [
+            feat_len > min_input_len,
+            feat_len < max_input_len,
+            token_len > min_output_len,
+            token_len < max_output_len,
+            token_len / feat_len > min_output_input_ratio,
+            token_len / feat_len < max_output_input_ratio,
+        ]
+        if all(conditions):
             manifest.append(json_data)
     return manifest
 
+# parser.add_argument('--max_input_len', type=float,
+#                     default=20,
+#                     help='maximum output seq length, in seconds for raw wav, in frame numbers for feature data')
+# parser.add_argument('--min_output_len', type=float,
+#                     default=0, help='minimum input seq length, in modeling units')
+# parser.add_argument('--max_output_len', type=float,
+#                     default=500,
+#                     help='maximum output seq length, in modeling units')
+# parser.add_argument('--min_output_input_ratio', type=float, default=0.05,
+#                     help='minimum output seq length/output seq length ratio')
+# parser.add_argument('--max_output_input_ratio', type=float, default=10,
+#                     help='maximum output seq length/output seq length ratio')
 
 def rms_to_db(rms: float):
     """Root Mean Square to dB.
 
     Args:
         rms ([float]): root mean square
...

@@ -145,8 +184,10 @@ def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
 def _load_json_cmvn(json_cmvn_file):
     """ Load the json format cmvn stats file and calculate cmvn
 
     Args:
         json_cmvn_file: cmvn stats file in json format
 
     Returns:
         a numpy array of [means, vars]
     """
...

@@ -168,10 +209,12 @@ def _load_json_cmvn(json_cmvn_file):
 def _load_kaldi_cmvn(kaldi_cmvn_file):
     """ Load the kaldi format cmvn stats file and calculate cmvn
 
     Args:
         kaldi_cmvn_file: kaldi text style global cmvn file, which is generated by:
             compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
 
     Returns:
         a numpy array of [means, vars]
     """
...
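Note: read_manifest now filters on feat_shape/token_shape rather than on a raw duration field, so any manifest it consumes must already carry those fields (which is what the new utils/format_data.py produces). A minimal usage sketch, assuming the deepspeech package is importable; the utterance id, path, shapes, and text below are invented:

import json
import tempfile

from deepspeech.frontend.utility import read_manifest

# A made-up manifest entry carrying the two fields the new filter reads:
# feat_shape[0] is the input length (seconds for raw speech) and
# token_shape[0] is the output length in modeling units.
entry = {
    "utt": "fake-utt-0001",                 # invented id
    "feat": "data/wav/fake-utt-0001.wav",   # invented path
    "feat_shape": (6.2, ),                  # second
    "token_shape": (42, 300),               # (num tokens, vocab size)
    "text": "some transcript",
}

with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as fp:
    fp.write(json.dumps(entry) + '\n')

# An entry is kept only if every length and ratio condition holds; here
# 42 / 6.2 ≈ 6.8 sits inside the default (0.05, 10.0) ratio window.
manifest = read_manifest(fp.name, min_input_len=1.0, max_input_len=20.0)
assert len(manifest) == 1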
deepspeech/io/collator.py

@@ -17,7 +17,7 @@ import numpy as np
 from collections import namedtuple
 
 from deepspeech.io.utility import pad_sequence
-from deepspeech.utils.tensor_utils import IGNORE_ID
+from deepspeech.frontend.utility import IGNORE_ID
 
 logger = logging.getLogger(__name__)
...
deepspeech/models/u2.py

@@ -42,11 +42,11 @@ from deepspeech.modules.decoder import TransformerDecoder
 from deepspeech.modules.loss import LabelSmoothingLoss
 from deepspeech.frontend.utility import load_cmvn
+from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.utils import checkpoint
 from deepspeech.utils import layer_tools
 from deepspeech.utils.utility import log_add
-from deepspeech.utils.tensor_utils import IGNORE_ID
 from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.tensor_utils import pad_sequence
...
deepspeech/utils/tensor_utils.py

@@ -22,8 +22,6 @@ logger = logging.getLogger(__name__)
 __all__ = ["pad_sequence", "add_sos_eos", "th_accuracy"]
 
-IGNORE_ID = -1
-
 def pad_sequence(sequences: List[paddle.Tensor],
                  batch_first: bool=False,
...
examples/dataset/aishell/aishell.py

@@ -62,9 +62,9 @@ def create_manifest(data_dir, manifest_path_prefix):
             transcript_dict[audio_id] = text
 
     data_types = ['train', 'dev', 'test']
-    for type in data_types:
+    for dtype in data_types:
         del json_lines[:]
-        audio_dir = os.path.join(data_dir, 'wav', type)
+        audio_dir = os.path.join(data_dir, 'wav', dtype)
         for subfolder, _, filelist in sorted(os.walk(audio_dir)):
             for fname in filelist:
                 audio_path = os.path.join(subfolder, fname)
...

@@ -78,12 +78,16 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json_lines.append(
                     json.dumps(
                         {
-                            'audio_filepath': audio_path,
-                            'duration': duration,
-                            'text': text
+                            'utt': os.path.splitext(os.path.basename(audio_path))[0],
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'text': text
                         },
                         ensure_ascii=False))
 
-        manifest_path = manifest_path_prefix + '.' + type
+        manifest_path = manifest_path_prefix + '.' + dtype
         with codecs.open(manifest_path, 'w', 'utf-8') as fout:
             for line in json_lines:
                 fout.write(line + '\n')
...
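Note: every dataset script in this commit migrates from the old audio_filepath/duration schema to the utt/feat/feat_shape schema that read_manifest and format_data.py expect. For reference, a single line emitted by the updated create_manifest might look like the following sketch; the id, path, duration, and text are invented:

import json

# One manifest line in the new schema (all values invented):
line = json.dumps(
    {
        'utt': 'S0002W0122',                           # wav basename, no extension
        'feat': 'data/aishell/wav/train/S0002/S0002W0122.wav',
        'feat_shape': (4.26, ),                        # duration in seconds
        'text': 'transcript goes here',
    },
    ensure_ascii=False)
print(line)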
examples/dataset/chime3_background/chime3_background.py

@@ -95,10 +95,13 @@ def create_manifest(data_dir, manifest_path):
                 audio_data, samplerate = soundfile.read(filepath)
                 duration = float(len(audio_data)) / samplerate
                 json_lines.append(
-                    json.dumps({
-                        'audio_filepath': filepath,
-                        'duration': duration,
-                        'text': ''
-                    }))
+                    json.dumps(
+                        {
+                            'utt': os.path.splitext(os.path.basename(filepath))[0],
+                            'feat': filepath,
+                            'feat_shape': (duration, ),  # second
+                            'type': 'background'
+                        }))
 
     with io.open(manifest_path, mode='w', encoding='utf8') as out_file:
         for line in json_lines:
...
examples/dataset/librispeech/librispeech.py

@@ -89,9 +89,13 @@ def create_manifest(data_dir, manifest_path):
             duration = float(len(audio_data)) / samplerate
             json_lines.append(
                 json.dumps({
-                    'audio_filepath': audio_filepath,
-                    'duration': duration,
-                    'text': text
+                    'utt': os.path.splitext(os.path.basename(audio_filepath))[0],
+                    'feat': audio_filepath,
+                    'feat_shape': (duration, ),  # second
+                    'text': text
                 }))
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
...
examples/dataset/mini_librispeech/mini_librispeech.py

@@ -71,9 +71,13 @@ def create_manifest(data_dir, manifest_path):
             duration = float(len(audio_data)) / samplerate
             json_lines.append(
                 json.dumps({
-                    'audio_filepath': audio_filepath,
-                    'duration': duration,
-                    'text': text
+                    'utt': os.path.splitext(os.path.basename(audio_filepath))[0],
+                    'feat': audio_filepath,
+                    'feat_shape': (duration, ),  # second
+                    'text': text
                 }))
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
...
examples/dataset/musan/musan.py

@@ -53,9 +53,9 @@ def create_manifest(data_dir, manifest_path_prefix):
     print("Creating manifest %s ..." % manifest_path_prefix)
     json_lines = []
 
     data_types = ['music', 'noise', 'speech']
-    for type in data_types:
+    for dtype in data_types:
         del json_lines[:]
-        audio_dir = os.path.join(data_dir, type)
+        audio_dir = os.path.join(data_dir, dtype)
         for subfolder, _, filelist in sorted(os.walk(audio_dir)):
             print('x, ', subfolder)
             for fname in filelist:
...

@@ -67,12 +67,16 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json_lines.append(
                     json.dumps(
                         {
-                            'audio_filepath': audio_path,
-                            'duration': duration,
-                            'type': type,
+                            'utt': os.path.splitext(os.path.basename(audio_path))[0],
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'type': dtype,
                         },
                         ensure_ascii=False))
 
-        manifest_path = manifest_path_prefix + '.' + type
+        manifest_path = manifest_path_prefix + '.' + dtype
         with codecs.open(manifest_path, 'w', 'utf-8') as fout:
             for line in json_lines:
                 fout.write(line + '\n')
...
examples/dataset/rir_noise/rir_noise.py

@@ -55,9 +55,9 @@ def create_manifest(data_dir, manifest_path_prefix):
     data_types = ['pointsource_noises', 'real_rirs_isotropic_noises', 'simulated_rirs']
-    for type in data_types:
+    for dtype in data_types:
         del json_lines[:]
-        audio_dir = os.path.join(data_dir, type)
+        audio_dir = os.path.join(data_dir, dtype)
         for subfolder, _, filelist in sorted(os.walk(audio_dir)):
             for fname in filelist:
                 audio_path = os.path.join(subfolder, fname)
...

@@ -68,12 +68,16 @@ def create_manifest(data_dir, manifest_path_prefix):
                 json_lines.append(
                     json.dumps(
                         {
-                            'audio_filepath': audio_path,
-                            'duration': duration,
-                            'type': type,
+                            'utt': os.path.splitext(os.path.basename(audio_path))[0],
+                            'feat': audio_path,
+                            'feat_shape': (duration, ),  # second
+                            'type': dtype,
                         },
                         ensure_ascii=False))
 
-        manifest_path = manifest_path_prefix + '.' + type
+        manifest_path = manifest_path_prefix + '.' + dtype
         with codecs.open(manifest_path, 'w', 'utf-8') as fout:
             for line in json_lines:
                 fout.write(line + '\n')
...
examples/dataset/voxforge/voxforge.py

@@ -174,8 +174,9 @@ def generate_manifest(data_dir, manifest_path):
             duration = float(len(audio_data)) / samplerate
             json_lines.append(
                 json.dumps({
-                    'audio_filepath': u,
-                    'duration': duration,
+                    'utt': os.path.splitext(os.path.basename(u))[0],
+                    'feat': u,
+                    'feat_shape': (duration, ),  # second
                     'text': trans.lower()
                 }))
...
examples/tiny/s0/local/data.sh

@@ -15,13 +15,20 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-head -n 64 data/manifest.dev-clean > data/manifest.tiny
+head -n 64 data/manifest.dev-clean > data/manifest.tiny.raw
+
+# bpemode (unigram or bpe)
+nbpe=200
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
 
 # build vocabulary
 python3 ${MAIN_ROOT}/utils/build_vocab.py \
---count_threshold=0 \
+--unit_type "bpe" \
+--count_threshold=${nbpe} \
+--bpe_mode ${bpemode} \
+--bpe_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
---manifest_paths="data/manifest.tiny"
+--manifest_paths="data/manifest.tiny.raw"
 
 if [ $? -ne 0 ]; then
     echo "Build vocabulary failed. Terminated."
...

@@ -31,7 +38,7 @@ fi
 # compute mean and stddev for normalizer
 python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
---manifest_path="data/manifest.tiny" \
+--manifest_path="data/manifest.tiny.raw" \
 --num_samples=64 \
 --specgram_type="linear" \
 --output_path="data/mean_std.npz"
...

@@ -41,5 +48,21 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
+# format manifest with tokenids, vocab size
+python3 ${MAIN_ROOT}/utils/format_data.py \
+--feat_type "raw" \
+--unit_type "bpe" \
+--bpe_model_prefix ${bpeprefix} \
+--vocab_path="data/vocab.txt" \
+--manifest_path="data/manifest.tiny.raw" \
+--output_path="data/manifest.tiny"
+
+if [ $? -ne 0 ]; then
+    echo "Format manifest failed. Terminated."
+    exit 1
+fi
+
 echo "LibriSpeech Data preparation done."
 exit 0
\ No newline at end of file
utils/build_vocab.py

@@ -17,18 +17,24 @@ Each item in vocabulary file is a character.
 import argparse
 import functools
 import codecs
 import json
 from collections import Counter
-import os.path
+import os
+import copy
+import tempfile
 
 from deepspeech.frontend.utility import read_manifest
-from deepspeech.utils.utility import add_arguments, print_arguments
+from deepspeech.frontend.utility import UNK
+from deepspeech.frontend.utility import BLANK
+from deepspeech.frontend.utility import SOS
+from deepspeech.utils.utility import add_arguments
+from deepspeech.utils.utility import print_arguments
 
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
+add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
+add_arg('count_threshold', int, 0, "Truncation threshold for char/word/bpe counts.")
 add_arg('vocab_path', str,
         'examples/librispeech/data/vocab.txt',
         "Filepath to write the vocabulary.")
...

@@ -38,6 +44,11 @@ add_arg('manifest_paths', str,
         "You can provide multiple manifest files.",
         nargs='+',
         required=True)
+# bpe
+add_arg('bpe_mode', str, 'unigram',
+        "bpe model type, e.g. unigram, bpe, char, word. only need when `unit_type` is bpe")
+add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
+        "bpe model prefix, only need when `unit_type` is bpe")
 # yapf: disable
 
 args = parser.parse_args()
...

@@ -45,23 +56,96 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        for char in line_json['text']:
-            counter.update(char)
+        if args.unit_type == 'character':
+            for char in line_json['text']:
+                counter.update(char)
+        elif args.unit_type == 'word':
+            for word in line_json['text'].split():
+                counter.update(word)
+
+
+def read_text_manifest(fileobj, manifest_path):
+    manifest_jsons = read_manifest(manifest_path)
+    for line_json in manifest_jsons:
+        fileobj.write(line_json['text'] + "\n")
 
 
 def main():
     print_arguments(args)
 
-    counter = Counter()
-    for manifest_path in args.manifest_paths:
-        count_manifest(counter, manifest_path)
-
-    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
-    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
-        fout.write('<unk>' + '\n')
-        for char, count in count_sorted:
-            if count < args.count_threshold:
-                break
-            fout.write(char + '\n')
+    fout = open(args.vocab_path, 'w', encoding='utf-8')
+    fout.write(BLANK + "\n")  # 0 will be used for "blank" in CTC
+    fout.write(UNK + '\n')  # <unk> must be 1
+
+    if args.unit_type != 'bpe':
+        counter = Counter()
+        for manifest_path in args.manifest_paths:
+            count_manifest(counter, manifest_path)
+
+        count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
+        for char, count in count_sorted:
+            if count < args.count_threshold:
+                break
+            fout.write(char + '\n')
+    else:
+        # tools/spm_train --input=$wave_data/lang_char/input.txt
+        # --vocab_size=${nbpe} --model_type=${bpemode}
+        # --model_prefix=${bpemodel} --input_sentence_size=100000000
+        import sentencepiece as spm
+
+        fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+        for manifest_path in args.manifest_paths:
+            read_text_manifest(fp, manifest_path)
+        fp.close()
+        # train
+        spm.SentencePieceTrainer.Train(
+            input=fp.name,
+            vocab_size=args.count_threshold,
+            model_type=args.bpe_mode,
+            model_prefix=args.bpe_model_prefix,
+            input_sentence_size=100000000,
+            character_coverage=0.9995)
+        os.unlink(fp.name)
+
+        # encode
+        sp = spm.SentencePieceProcessor()
+        sp.Load(args.bpe_model_prefix + '.model')
+
+        stats = {"num_empty": 0, "num_filtered": 0}
+
+        def valid(line):
+            return True
+
+        def encode(l):
+            return sp.EncodeAsPieces(l)
+
+        def encode_line(line):
+            line = line.strip()
+            if len(line) > 0:
+                line = encode(line)
+                if valid(line):
+                    return line
+                else:
+                    stats["num_filtered"] += 1
+            else:
+                stats["num_empty"] += 1
+            return None
+
+        vocabs = set()
+        for manifest_path in args.manifest_paths:
+            manifest_jsons = read_manifest(manifest_path)
+            for line_json in manifest_jsons:
+                line = line_json['text']
+                enc_line = encode_line(line)
+                for code in enc_line:
+                    vocabs.add(code)
+                #print(" ".join(enc_line))
+        vocabs_sorted = sorted(vocabs)
+        for unit in vocabs_sorted:
+            fout.write(unit + "\n")
+
+        print(f"bpe vocab size: {len(vocabs_sorted)}")
+        print(f"skip {stats['num_empty']} empty lines")
+        print(f"filter {stats['num_filtered']} invalid lines")
+
+    fout.write(SOS + "\n")  # <sos/eos>
+    fout.close()
 
 
 if __name__ == '__main__':
...
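Note: under the new scheme the vocabulary file has a fixed layout: <blank> first (id 0, the CTC blank), <unk> second (id 1), then the counted or BPE units, and <sos/eos> last. A small sketch of loading such a file back into an id map, assuming data/vocab.txt was produced by the updated script:

# Expected layout of data/vocab.txt after the new build_vocab.py:
#   line 0: <blank>     (CTC blank, must be id 0)
#   line 1: <unk>       (must be id 1)
#   ...counted or BPE units...
#   last:   <sos/eos>   (shared start/end symbol)
vocab = {}
with open('data/vocab.txt', encoding='utf-8') as fin:
    for line in fin:
        vocab[line.strip()] = len(vocab)

assert vocab['<blank>'] == 0
assert vocab['<unk>'] == 1
sos_eos_id = vocab['<sos/eos>']  # highest id in the file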
utils/format_data.py (new file, 0 → 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
from collections import Counter
import os
import copy
import tempfile

from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.utility import UNK
from deepspeech.frontend.utility import BLANK
from deepspeech.frontend.utility import SOS
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
add_arg('vocab_path', str,
        'examples/librispeech/data/vocab.txt',
        "Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
        None,
        "Filepaths of manifests for building vocabulary. "
        "You can provide multiple manifest files.",
        nargs='+',
        required=True)
# bpe
add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
        "bpe model prefix, only need when `unit_type` is bpe")
add_arg('output_path', str, None, "filepath of formatted manifest.", required=True)
# yapf: disable
args = parser.parse_args()


def main():
    print_arguments(args)

    # read vocab
    vocab = dict()
    with open(args.vocab_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            token = line.strip()
            vocab[token] = len(vocab)
    vocab_size = len(vocab)

    fout = open(args.output_path, 'w', encoding='utf-8')

    if args.unit_type != 'bpe':
        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                tokens = []
                tokenids = []
                if args.unit_type == 'character':
                    for char in line_json['text']:
                        tokens.append(char)
                        tokenids.append(vocab[char])
                elif args.unit_type == 'word':
                    for word in line_json['text'].split():
                        tokens.append(word)
                        tokenids.append(vocab[word])
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                fout.write(json.dumps(line_json) + '\n')
    else:
        import sentencepiece as spm

        # encode
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_prefix + '.model')

        stats = {"num_empty": 0, "num_filtered": 0}  # counters read by encode_line

        def valid(line):
            return True

        def encode(l):
            return sp.EncodeAsPieces(l)

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for manifest_path in args.manifest_paths:
            manifest_jsons = read_manifest(manifest_path)
            for line_json in manifest_jsons:
                line = line_json['text']
                tokens = []
                tokenids = []
                enc_line = encode_line(line)
                for code in enc_line:
                    tokens.append(code)
                    tokenids.append(vocab[code])
                    #print(code, vocab[code])
                line_json['token'] = tokens
                line_json['token_id'] = tokenids
                line_json['token_shape'] = (len(tokenids), vocab_size)
                fout.write(json.dumps(line_json) + '\n')
    fout.close()


if __name__ == '__main__':
    main()
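Note: format_data.py closes the loop: it adds token, token_id, and token_shape to every manifest line, and those are exactly the fields the new read_manifest filter checks. A hedged end-to-end sketch, assuming the deepspeech package is importable; the utterance, pieces, and ids below are invented:

import json

from deepspeech.frontend.utility import read_manifest

# One formatted line as utils/format_data.py would emit it (values invented):
formatted = {
    "utt": "fake-0000",
    "feat": "data/dev-clean/fake-0000.flac",
    "feat_shape": [6.95],                     # second
    "text": "but with full ravishment",
    "token": ["▁but", "▁with", "▁full", "▁ravish", "ment"],
    "token_id": [23, 187, 96, 141, 118],
    "token_shape": [5, 202],                  # (num tokens, vocab size)
}

with open("data/manifest.tiny", "w", encoding="utf-8") as fout:
    fout.write(json.dumps(formatted) + "\n")

# 5 / 6.95 ≈ 0.72 falls inside the default (0.05, 10.0) ratio window,
# so the entry survives the new length/ratio filter.
manifest = read_manifest("data/manifest.tiny")
assert len(manifest) == 1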