Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
e9a42044
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e9a42044
编写于
9月 19, 2017
作者:
Y
yangyaming
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add data preparing for Aishell.
上级
3bed29dd
变更
5
显示空白变更内容
内联
并排
Showing
5 changed files
with
155 additions
and
3 deletions
+155
-3
data/aishell/aishell.py
data/aishell/aishell.py
+109
-0
data/librispeech/librispeech.py
data/librispeech/librispeech.py
+2
-2
data_utils/utility.py
data_utils/utility.py
+1
-0
examples/aishell/run_data.sh
examples/aishell/run_data.sh
+42
-0
examples/librispeech/run_data.sh
examples/librispeech/run_data.sh
+1
-1
未找到文件。
data/aishell/aishell.py
0 → 100644
浏览文件 @
e9a42044
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
codecs
import
soundfile
import
json
import
argparse
from
data_utils.utility
import
download
,
unpack
# Source archive on OpenSLR and its expected MD5 checksum.
URL_ROOT = 'http://www.openslr.org/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'

# Root of the local speech dataset cache, with '~' expanded.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Command-line interface; the module docstring is used as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/Aishell",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)")
# Parsed at import time; main() reads the result from this module-level name.
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
    """Write one JSON-lines manifest file per Aishell data split.

    The transcript table is read from
    ``<data_dir>/transcript/aishell_transcript_v0.8.txt``. Then, for each of
    the splits ``train``, ``dev`` and ``test``, every file found under
    ``<data_dir>/wav/<split>`` whose name (without extension) has a transcript
    entry is recorded in ``<manifest_path_prefix>.<split>``. Each manifest
    line is a JSON object with the keys ``audio_filepath``, ``duration`` and
    ``text``. Files without a transcript entry are skipped.

    :param data_dir: Root directory of the unpacked dataset.
    :type data_dir: str
    :param manifest_path_prefix: Output path prefix; ``.<split>`` is appended
                                 to form each manifest file path.
    :type manifest_path_prefix: str
    :raises ValueError: If a non-empty transcript line contains no space
                        separating the audio id from its text.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    # Use a context manager so the transcript handle is closed promptly.
    with codecs.open(transcript_path, 'r', 'utf-8') as transcript_file:
        for line in transcript_file:
            line = line.strip()
            if line == '':
                continue
            audio_id, text = line.split(' ', 1)
            # remove whitespace
            transcript_dict[audio_id] = ''.join(text.split())

    for data_type in ['train', 'dev', 'test']:
        # Start from an empty list for every split; sharing one list across
        # splits would leak earlier splits' entries into later manifests.
        json_lines = []
        audio_dir = os.path.join(data_dir, 'wav', data_type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                # Strip the real extension rather than assuming 4 characters.
                audio_id = os.path.splitext(fname)[0]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue
                audio_path = os.path.join(subfolder, fname)
                audio_data, samplerate = soundfile.read(audio_path)
                # float() first so the division is a true division even
                # without ``from __future__ import division``.
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps(
                        {
                            'audio_filepath': audio_path,
                            'duration': duration,
                            'text': transcript_dict[audio_id]
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + data_type
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for json_line in json_lines:
                fout.write(json_line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Make sure the dataset is on disk, then generate the manifests.

    When ``<target_dir>/data_aishell`` is absent, the archive at *url* is
    downloaded into *target_dir* and unpacked, and every file found under
    its ``wav`` directory is unpacked in place as well. When the directory
    is already present, both steps are skipped. In either case the manifests
    are then created with *manifest_path* as the path prefix.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
        # The audio ships as nested tar files below wav/; extract each one
        # into the folder it sits in. The walk is materialised by sorted()
        # before any extraction starts.
        wav_root = os.path.join(data_dir, 'wav')
        walked = sorted(os.walk(wav_root))
        for folder, _, names in walked:
            for tar_name in names:
                # NOTE(review): the third argument is presumably a
                # "remove archive after unpacking" flag - confirm in
                # data_utils.utility.unpack.
                unpack(os.path.join(folder, tar_name), folder, True)
    create_manifest(data_dir, manifest_path)
def main():
    """Resolve the target directory and run the dataset preparation."""
    target_dir = args.target_dir
    # A quoted '~' on the command line reaches us unexpanded by the shell.
    if target_dir.startswith('~'):
        target_dir = os.path.expanduser(target_dir)
    # Store the resolved path back on the parsed arguments.
    args.target_dir = target_dir
    prepare_dataset(DATA_URL, MD5_DATA, args.target_dir, args.manifest_prefix)


if __name__ == '__main__':
    main()
data/librispeech/librispeech.py
浏览文件 @
e9a42044
...
@@ -16,7 +16,6 @@ import argparse
...
@@ -16,7 +16,6 @@ import argparse
import
soundfile
import
soundfile
import
json
import
json
import
codecs
import
codecs
from
paddle.v2.dataset.common
import
md5file
from
data_utils.utility
import
download
,
unpack
from
data_utils.utility
import
download
,
unpack
URL_ROOT
=
"http://www.openslr.org/resources/12"
URL_ROOT
=
"http://www.openslr.org/resources/12"
...
@@ -104,6 +103,7 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
...
@@ -104,6 +103,7 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
def
main
():
def
main
():
if
args
.
target_dir
.
startswith
(
'~'
):
args
.
target_dir
=
os
.
path
.
expanduser
(
args
.
target_dir
)
args
.
target_dir
=
os
.
path
.
expanduser
(
args
.
target_dir
)
prepare_dataset
(
prepare_dataset
(
...
...
data_utils/utility.py
浏览文件 @
e9a42044
...
@@ -7,6 +7,7 @@ import json
...
@@ -7,6 +7,7 @@ import json
import
codecs
import
codecs
import
os
import
os
import
tarfile
import
tarfile
from
paddle.v2.dataset.common
import
md5file
def
read_manifest
(
manifest_path
,
max_duration
=
float
(
'inf'
),
min_duration
=
0.0
):
def
read_manifest
(
manifest_path
,
max_duration
=
float
(
'inf'
),
min_duration
=
0.0
):
...
...
examples/aishell/run_data.sh
0 → 100644
浏览文件 @
e9a42044
#! /usr/bin/env bash

# Aishell data preparation: download the data and generate manifests, build
# the vocabulary, then compute mean/stddev for the normalizer. Each step
# aborts the script with status 1 if its command fails.

# Work from two directory levels above the current directory.
pushd ../.. > /dev/null

# Print the given message and terminate the script with status 1.
abort() {
    echo "$1"
    exit 1
}

# download data, generate manifests
# (the quoted '~' is not expanded by the shell; aishell.py expands it)
PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
    --manifest_prefix='data/aishell/manifest' \
    --target_dir='~/.cache/paddle/dataset/speech/Aishell' \
    || abort "Prepare Aishell failed. Terminated."

# build vocabulary
python tools/build_vocab.py \
    --count_threshold=0 \
    --vocab_path='data/aishell/vocab.txt' \
    --manifest_paths='data/aishell/manifest.train' \
    || abort "Build vocabulary failed. Terminated."

# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
    --manifest_path='data/aishell/manifest.train' \
    --num_samples=2000 \
    --specgram_type='linear' \
    --output_path='data/aishell/mean_std.npz' \
    || abort "Compute mean and stddev failed. Terminated."

echo "Aishell data preparation done."
exit 0
examples/librispeech/run_data.sh
浏览文件 @
e9a42044
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
pushd
../..
>
/dev/null
pushd
../..
>
/dev/null
# download data, generate manifests
# download data, generate manifests
python data/librispeech/librispeech.py
\
PYTHONPATH
=
.:
$PYPYTHONPATH
python data/librispeech/librispeech.py
\
--manifest_prefix
=
'data/librispeech/manifest'
\
--manifest_prefix
=
'data/librispeech/manifest'
\
--target_dir
=
'~/.cache/paddle/dataset/speech/Libri'
\
--target_dir
=
'~/.cache/paddle/dataset/speech/Libri'
\
--full_download
=
'True'
--full_download
=
'True'
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录