Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8998f4c2
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8998f4c2
编写于
7月 06, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add timit
上级
0296639d
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
271 addition
and
4 deletion
+271
-4
examples/dataset/gigaspeech/run.sh
examples/dataset/gigaspeech/run.sh
+6
-2
examples/dataset/thchs30/thchs30.py
examples/dataset/thchs30/thchs30.py
+3
-2
examples/dataset/timit/.gitignore
examples/dataset/timit/.gitignore
+4
-0
examples/dataset/timit/timit.py
examples/dataset/timit/timit.py
+239
-0
utils/utility.py
utils/utility.py
+19
-0
未找到文件。
examples/dataset/gigaspeech/run.sh
100644 → 100755
浏览文件 @
8998f4c2
...
@@ -5,6 +5,10 @@ set -e
...
@@ -5,6 +5,10 @@ set -e
curdir
=
$PWD
curdir
=
$PWD
test
-d
GigaSpeech
||
git clone https://github.com/SpeechColab/GigaSpeech.git
test
-d
GigaSpeech
||
git clone https://github.com/SpeechColab/GigaSpeech.git
cd
GigaSpeech
pushd
GigaSpeech
source
env_vars.sh
source
env_vars.sh
utils/gigaspeech_download.sh
${
curdir
}
/
./utils/download_gigaspeech.sh
${
curdir
}
/
#toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
popd
examples/dataset/thchs30/thchs30.py
浏览文件 @
8998f4c2
...
@@ -117,20 +117,21 @@ def create_manifest(data_dir, manifest_path_prefix):
...
@@ -117,20 +117,21 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data
,
samplerate
=
soundfile
.
read
(
audio_path
)
audio_data
,
samplerate
=
soundfile
.
read
(
audio_path
)
duration
=
float
(
len
(
audio_data
)
/
samplerate
)
duration
=
float
(
len
(
audio_data
)
/
samplerate
)
# not dump alignment infos
json_lines
.
append
(
json_lines
.
append
(
json
.
dumps
(
json
.
dumps
(
{
{
'utt'
:
audio_id
,
'utt'
:
audio_id
,
'feat'
:
audio_path
,
'feat'
:
audio_path
,
'feat_shape'
:
(
duration
,
),
# second
'feat_shape'
:
(
duration
,
),
# second
'text'
:
word_text
,
'text'
:
word_text
,
# charactor
'syllable'
:
syllable_text
,
'syllable'
:
syllable_text
,
'phone'
:
phone_text
,
'phone'
:
phone_text
,
},
},
ensure_ascii
=
False
))
ensure_ascii
=
False
))
total_sec
+=
duration
total_sec
+=
duration
total_text
+=
len
(
text
)
total_text
+=
len
(
word_
text
)
total_num
+=
1
total_num
+=
1
manifest_path
=
manifest_path_prefix
+
'.'
+
dtype
manifest_path
=
manifest_path_prefix
+
'.'
+
dtype
...
...
examples/dataset/timit/.gitignore
0 → 100644
浏览文件 @
8998f4c2
TIMIT.*
TIMIT
manifest.*
*.meta
examples/dataset/timit/timit.py
0 → 100644
浏览文件 @
8998f4c2
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Librispeech ASR datasets.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
import re
import string
from pathlib import Path

import soundfile

from utils.utility import check_md5sum
from utils.utility import unzip
# Passed as the `url` argument of prepare_dataset() by main(); it is empty
# because the archive is expected to be placed in the target directory by hand.
URL_ROOT = ""
# Expected MD5 checksum of TIMIT.zip, passed as `md5sum` to prepare_dataset().
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"

# Command line interface; the module docstring doubles as the help description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default='~/.cache/paddle/dataset/speech/timit',
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
# NOTE: arguments are parsed at module level, so this runs as soon as the
# module is executed or imported.
args = parser.parse_args()
#: A string containing Chinese punctuation marks (non-stops).
non_stops = (
    # Fullwidth ASCII variants
    '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
    '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
    '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'

    # Halfwidth CJK punctuation
    '\uFF62\uFF63\uFF64'

    # CJK symbols and punctuation
    '\u3000\u3001\u3003'

    # CJK angle and corner brackets
    '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'

    # CJK brackets and symbols/punctuation
    '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'

    # Other CJK symbols
    '\u3030'

    # Special CJK indicators
    '\u303E\u303F'

    # Dashes
    '\u2013\u2014'

    # Quotation marks and apostrophe
    '\u2018\u2019\u201B\u201C\u201D\u201E\u201F'

    # General punctuation
    '\u2026\u2027'

    # Overscores and underscores
    '\uFE4F'

    # Small form variants
    '\uFE51\uFE54'

    # Latin punctuation
    '\u00B7'
)

#: A string of Chinese stops.
stops = (
    '\uFF01'  # Fullwidth exclamation mark
    '\uFF1F'  # Fullwidth question mark
    '\uFF61'  # Halfwidth ideographic full stop
    '\u3002'  # Ideographic full stop
)

#: A string containing all Chinese punctuation.
punctuation = non_stops + stops

# Deletion table covering every Chinese and ASCII punctuation character.
# A translation table is used instead of interpolating the characters into a
# regex character class: unescaped, the ``\]`` pair inside string.punctuation
# is read by ``re`` as an escaped ``]``, so the backslash itself was never
# removed.
_PUNCT_TABLE = str.maketrans('', '', punctuation + string.punctuation)


def tn(text):
    """Normalize a transcript: lowercase it and strip all punctuation.

    Args:
        text (str): raw transcript text.

    Returns:
        str: lowercased text with every character of ``punctuation`` and
            ``string.punctuation`` removed. Whitespace (other than the
            ideographic space U+3000, which is part of ``punctuation``) is
            left untouched.
    """
    return text.lower().translate(_PUNCT_TABLE)
def read_txt(filepath: str) -> str:
    """Read a transcript file and return its normalized text.

    The file content is expected to look like ``<start> <end> <text>``; the
    first two whitespace-separated fields are dropped and the remainder is
    passed through ``tn``.

    Args:
        filepath (str): path of the transcript file.

    Returns:
        str: normalized transcript text.
    """
    with open(filepath, 'r') as handle:
        content = handle.read()
    # Split at most twice so the transcript itself stays in one piece.
    fields = content.strip().split(maxsplit=2)
    return tn(fields[2])
def read_algin(filepath: str) -> str:
    """Read a word or phone alignment file.

    Each line has the form:
        <start-sample> <end-sample> <token><newline>

    Args:
        filepath (str): path of the alignment file.

    Returns:
        str: the tokens of the file in order, separated by a single space.
            Tokens equal to ``h#`` are left out.
    """
    tokens = []
    with open(filepath, 'r') as f:
        for line in f:
            items = line.split()
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with IndexError on items[2].
            if not items:
                continue
            token = items[2]
            # for phone: (Note: beginning and ending silence regions are marked with h#)
            if token == 'h#':
                continue
            tokens.append(token)
    return ' '.join(tokens)
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def create_manifest(data_dir, manifest_path_prefix):
    """Create manifest json files summarizing the data set.

    For each of the ``TRAIN`` and ``TEST`` sub-directories of ``data_dir`` a
    manifest ``<manifest_path_prefix>.<train|test>`` is written, with one json
    object per line (utterance id, audio path, duration in seconds, word
    text, phone text, speaker and gender). A ``<train|test>.meta`` statistics
    file is written to the current working directory.

    Args:
        data_dir (str): directory containing the ``TRAIN`` and ``TEST`` trees.
        manifest_path_prefix (str): path prefix of the manifest files.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)

    # NOTE(review): utterances are de-duplicated by file stem only, and the
    # set is shared between TRAIN and TEST, so a stem that occurs under
    # several speaker directories is kept once. Behavior preserved as-is;
    # confirm this is intended rather than de-duplicating on `utt_id`.
    utts = set()

    for dtype in ('TRAIN', 'TEST'):
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = Path(os.path.join(data_dir, dtype))
        for fname in sorted(audio_dir.rglob('*.WAV')):
            audio_path = fname.resolve()  # .WAV
            audio_id = audio_path.stem
            # if the utterance id already exists, skip it
            if audio_id in utts:
                continue
            utts.add(audio_id)

            text_path = audio_path.with_suffix('.TXT')
            phone_path = audio_path.with_suffix('.PHN')

            audio_data, samplerate = soundfile.read(
                str(audio_path), dtype='int16')
            duration = float(len(audio_data) / samplerate)
            word_text = read_txt(text_path)
            phone_text = read_algin(phone_path)

            # The parent directory name is split into its first character
            # (used as gender) and the remainder (used as speaker id).
            gender_spk = str(audio_path.parent.stem)
            spk = gender_spk[1:]
            gender = gender_spk[0]
            utt_id = '_'.join([spk, gender, audio_id])
            # alignment infos are not dumped
            json_lines.append(
                json.dumps(
                    {
                        'utt': utt_id,
                        'feat': str(audio_path),
                        'feat_shape': (duration, ),  # second
                        'text': word_text,  # word
                        'phone': phone_text,
                        'spk': spk,
                        'gender': gender,
                    },
                    ensure_ascii=False))

            total_sec += duration
            total_text += len(word_text.split())
            total_num += 1

        if total_num == 0:
            print(f"Warning: no *.WAV files found under {audio_dir}.")

        manifest_path = manifest_path_prefix + '.' + dtype.lower()
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        # Ratios are guarded so that an empty split yields 0.0 instead of
        # raising ZeroDivisionError.
        with open(dtype.lower() + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60 * 60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{_safe_ratio(total_text, total_sec)} text/sec", file=f)
            print(f"{_safe_ratio(total_sec, total_num)} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Verify and unpack TIMIT.zip, then create the manifest files.

    The archive is not downloaded: ``TIMIT.zip`` must already be present in
    ``target_dir``. If ``target_dir/TIMIT`` does not exist yet, the archive's
    checksum is verified and it is unzipped into ``target_dir``.

    Args:
        url (str): unused; kept for interface compatibility.
        md5sum (str): expected MD5 checksum of ``TIMIT.zip``.
        target_dir (str): directory holding ``TIMIT.zip`` and the unpacked data.
        manifest_path (str): path prefix for the generated manifest files.

    Raises:
        FileNotFoundError: if ``TIMIT.zip`` is missing from ``target_dir``.
        RuntimeError: if the checksum of ``TIMIT.zip`` does not match
            ``md5sum`` (previously an ``assert``, which is skipped under
            ``python -O``).
    """
    filepath = os.path.join(target_dir, "TIMIT.zip")
    if not os.path.exists(filepath):
        message = f"Please download TIMIT.zip into {target_dir}."
        print(message)
        raise FileNotFoundError(message)

    if not os.path.exists(os.path.join(target_dir, "TIMIT")):
        # check md5sum with an explicit error so the check cannot be
        # optimized away
        if not check_md5sum(filepath, md5sum):
            raise RuntimeError(
                f"MD5 checksum mismatch for {filepath}, expected {md5sum}.")
        # unpack
        unzip(filepath, target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(os.path.join(target_dir, "TIMIT"), manifest_path)
def main():
    """Script entry point: expand the target directory and build the manifests."""
    # expanduser() returns paths without a leading '~' unchanged, so it can
    # be applied unconditionally.
    args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        URL_ROOT,
        MD5_DATA,
        args.target_dir,
        args.manifest_prefix, )

    print("Data download and manifest prepare done!")
if
__name__
==
'__main__'
:
main
()
utils/utility.py
浏览文件 @
8998f4c2
...
@@ -14,9 +14,15 @@
...
@@ -14,9 +14,15 @@
import
os
import
os
import
tarfile
import
tarfile
import
zipfile
import
zipfile
from
typing
import
Text
from
paddle.dataset.common
import
md5file
from
paddle.dataset.common
import
md5file
__all__
=
[
"check_md5sum"
,
"getfile_insensitive"
,
"download_multi"
,
"download"
,
"unpack"
,
"unzip"
]
def
getfile_insensitive
(
path
):
def
getfile_insensitive
(
path
):
"""Get the actual file path when given insensitive filename."""
"""Get the actual file path when given insensitive filename."""
...
@@ -54,6 +60,19 @@ def download(url, md5sum, target_dir):
...
@@ -54,6 +60,19 @@ def download(url, md5sum, target_dir):
return
filepath
return
filepath
def check_md5sum(filepath: Text, md5sum: Text) -> bool:
    """Check a file against an expected MD5 checksum.

    Args:
        filepath (Text): path of the file to check.
        md5sum (Text): expected checksum; compared with ``==`` against the
            value returned by ``md5file(filepath)``.

    Returns:
        bool: True if the two values are equal, False otherwise.
    """
    actual = md5file(filepath)
    return actual == md5sum
def
unpack
(
filepath
,
target_dir
,
rm_tar
=
False
):
def
unpack
(
filepath
,
target_dir
,
rm_tar
=
False
):
"""Unpack the file to the target_dir."""
"""Unpack the file to the target_dir."""
print
(
"Unpacking %s ..."
%
filepath
)
print
(
"Unpacking %s ..."
%
filepath
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录