Commit 9f95c2ab authored by yangyaming

Add THCHS30 dataset.

Parent a9c817cd
@@ -16,12 +16,10 @@ For some machines, we also need to install libsndfile1. Details to be added.
### Preparing Data
```
cd datasets
sh run_all.sh
cd ..
sh datasets/run_all.sh
```
`sh run_all.sh` prepares all ASR datasets (currently, only LibriSpeech is available). After it finishes, several summarization manifest files in JSON format are generated.
`sh datasets/run_all.sh` prepares all ASR datasets (currently, only LibriSpeech and THCHS30 are available). After it finishes, several summarization manifest files in JSON format are generated.
A manifest file summarizes a speech data set: each of its lines contains, in JSON format, the meta data (i.e. audio filepath, transcript text and audio duration) of one audio file in the data set. The manifest file serves as the interface that tells our system where to find the speech samples and what to read from them.
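For illustration, a single manifest line could look like the following (the path, duration and text are made-up placeholders; only the field names match what the preparation scripts actually write):
```
{"audio_filepath": "/path/to/sample-0001.wav", "duration": 3.45, "text": "an example transcript"}
```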
......
"""Provide some common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tarfile
import wget
from paddle.v2.dataset.common import md5file
def download(url, md5sum, target_dir):
    """Download file from url to target_dir, and check md5sum."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if md5file(filepath) != md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir, rm_tar=False):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()
    if rm_tar:
        os.remove(filepath)
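As a minimal usage sketch of these helpers (the URL, checksum and directories below are placeholders, not a real resource), a dataset preparation script could call them like this:
```
# Hypothetical example only: download an archive, verify its MD5 checksum,
# and unpack it. URL, checksum and paths are placeholders for illustration.
from datasets.common import download, unpack

filepath = download(
    url="http://example.com/some_corpus.tgz",      # placeholder URL
    md5sum="0123456789abcdef0123456789abcdef",     # placeholder checksum
    target_dir="/tmp/some_corpus")
unpack(filepath, "/tmp/some_corpus/unpacked", rm_tar=False)
```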
@@ -11,12 +11,10 @@ from __future__ import print_function
import distutils.util
import os
import wget
import tarfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file
from datasets.common import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@@ -45,7 +43,7 @@ parser.add_argument(
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    default="manifest-libri",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
@@ -58,33 +56,6 @@ parser.add_argument(
args = parser.parse_args()
def download(url, md5sum, target_dir):
    """
    Download file from url to target_dir, and check md5sum.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if md5file(filepath) != md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """
    Unpack the file to the target_dir.
    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
    tar.close()
def create_manifest(data_dir, manifest_path):
    """
    Create a manifest json file summarizing the data set, with each line
......
cd librispeech
python librispeech.py
export PYTHONPATH=`pwd`:$PYTHONPATH
cd datasets
python thchs30/thchs30.py
if [ $? -ne 0 ]; then
    echo "Prepare THCHS30 failed. Terminated."
    exit 1
fi

python librispeech/librispeech.py
if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi
cd -
cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test
echo "All done."
cd -
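Note that the `cat`/`shuf` lines above only combine the LibriSpeech manifests; the THCHS30 manifests written by `thchs30/thchs30.py` (with the default `manifest-thchs30` prefix) are left as separate per-split files. Purely as a sketch of a possible extension, and not part of this commit, combined Chinese manifests could be built along these lines (the output file names are made up):
```
# Illustrative sketch only: merge the per-split THCHS30 manifests into
# combined files, mirroring what the cat/shuf lines do for LibriSpeech.
# The glob patterns assume the default --manifest_prefix "manifest-thchs30".
import glob
import random


def merge_manifests(pattern, out_path, shuffle=False):
    """Concatenate all manifest files matching `pattern` into `out_path`."""
    lines = []
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            lines.extend(f.readlines())
    if shuffle:
        random.shuffle(lines)
    with open(out_path, 'w') as f:
        f.writelines(lines)


merge_manifests("manifest-thchs30.train*", "manifest.train.zh", shuffle=True)
merge_manifests("manifest-thchs30.dev*", "manifest.dev.zh")
merge_manifests("manifest-thchs30.test*", "manifest.test.zh")
```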
"""Prepare THCHS-30 Chinese Speech Corpus.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import os
import argparse
import soundfile
import json
from datasets.common import download, unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = "http://www.openslr.org/resources/18"
URL_CLEAN_DATA = URL_ROOT + "/data_thchs30.tgz"
URL_0DB_NOISY_TEST_DATA = URL_ROOT + "/test-noise.tgz"
MD5_CLEAN_DATA = "2d2252bde5c8429929e1841d4cb95e90"
MD5_0DB_NOISY_TEST_DATA = "7e8a985fb965b84141b68c68556c2030"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/THCHS30",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest-thchs30",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
    "--download_0db_noise_test",
    default="True",
    type=distutils.util.strtobool,
    help="Whether to download the 0dB noisy test dataset."
    " If True, the 0dB noise-mixed test data is downloaded. (default: %(default)s)")
parser.add_argument(
    "--remove_tar",
    default="True",
    type=distutils.util.strtobool,
    help="If True, automatically remove the tar file after unpacking."
    " (default: %(default)s)")
args = parser.parse_args()
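A side note on the boolean flags above: `distutils.util.strtobool` converts the string defaults ("True"/"False"), and any value the user passes, into 1 or 0, which the script later casts to a plain bool. For example:
```
# strtobool accepts "y"/"yes"/"true"/"1" (and their negatives) and returns 1 or 0.
import distutils.util

print(distutils.util.strtobool("True"))   # -> 1
print(distutils.util.strtobool("false"))  # -> 0
```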
def create_manifest(transcript_data_dir, audio_data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    filelist = sorted(os.walk(audio_data_dir))[0][2]
    audio_filelist = [fname for fname in filelist if fname.endswith('.wav')]
    for audio_file in audio_filelist:
        transcript_file_path = os.path.join(transcript_data_dir,
                                            audio_file + '.trn')
        if not os.path.isfile(transcript_file_path):
            raise IOError("Transcript file %s does not exist." %
                          transcript_file_path)
        transcript_text = open(transcript_file_path).readline().strip()
        transcript_text = ''.join(transcript_text.split(' '))
        audio_file_path = os.path.join(audio_data_dir, audio_file)
        audio_data, samplerate = soundfile.read(audio_file_path)
        duration = float(len(audio_data)) / samplerate
        json_lines.append(
            json.dumps(
                {
                    'audio_filepath': audio_file_path,
                    'duration': duration,
                    'text': transcript_text
                },
                ensure_ascii=False))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')
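`create_manifest` reads only the first line of each `*.trn` transcript and removes the spaces between tokens before storing it in the manifest. A tiny illustration of that transformation (the transcript text below is a made-up example, not taken from the corpus):
```
# Illustrative only: how the transcript text ends up in the manifest.
raw_first_line = u"这 是 一个 示例 转写"             # made-up word-segmented line
text = ''.join(raw_first_line.strip().split(' '))  # same steps as above
print(text)                                         # -> 这是一个示例转写
```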
def prepare_dataset(target_dir, manifest_prefix, download_noisy, rm_tar):
    def download_unpack(url, md5sum, download_dir, unpack_dir, rm_tar):
        if not os.path.exists(unpack_dir):
            filepath = download(url, md5sum, download_dir)
            unpack(filepath, unpack_dir, rm_tar)
        else:
            print("Skip downloading and unpacking. Data already exists in %s" %
                  unpack_dir)

    clean_dir = os.path.join(target_dir, "Clean")
    download_unpack(URL_CLEAN_DATA, MD5_CLEAN_DATA, target_dir, clean_dir,
                    rm_tar)
    # create [train-clean|dev-clean|test-clean] manifest files
    base_dir = os.path.join(clean_dir, 'data_thchs30')
    transcript_data_dir = os.path.join(base_dir, 'data')
    for data_type in ['train', 'dev', 'test']:
        manifest_path = manifest_prefix + '.' + data_type + '-clean'
        audio_data_dir = os.path.join(base_dir, data_type)
        create_manifest(transcript_data_dir, audio_data_dir, manifest_path)

    if download_noisy:
        # create test-0db-noise-[cafe|car|white] manifest files
        noisy_test_dir = os.path.join(target_dir, "0DB-Noisy-Test")
        download_unpack(URL_0DB_NOISY_TEST_DATA, MD5_0DB_NOISY_TEST_DATA,
                        target_dir, noisy_test_dir, rm_tar)
        noisy_base_dir = os.path.join(noisy_test_dir, 'test-noise', '0db')
        for data_type in ['cafe', 'car', 'white']:
            manifest_path = (manifest_prefix + '.' +
                             'test-0db-noise-' + data_type)
            audio_data_dir = os.path.join(noisy_base_dir, data_type)
            create_manifest(transcript_data_dir, audio_data_dir, manifest_path)
def main():
    target_dir = args.target_dir
    manifest_prefix = args.manifest_prefix
    # strtobool() yields 0/1, so cast the parsed flags to plain booleans
    download_noisy = bool(args.download_0db_noise_test)
    rm_tar = bool(args.remove_tar)
    prepare_dataset(target_dir, manifest_prefix, download_noisy, rm_tar)


if __name__ == '__main__':
    main()
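With the defaults above, running the script produces manifest files named `manifest-thchs30.train-clean`, `manifest-thchs30.dev-clean`, `manifest-thchs30.test-clean`, and, when the noisy test set is downloaded, `manifest-thchs30.test-0db-noise-{cafe,car,white}`. A minimal sketch (not part of this commit) of how downstream code might consume one of these manifests:
```
# Illustrative only: read a generated manifest line by line and report the
# total amount of training audio. The path assumes the default prefix.
import json


def read_manifest(manifest_path):
    """Yield one dict (audio_filepath, duration, text) per manifest line."""
    with open(manifest_path) as f:
        for line in f:
            yield json.loads(line)


total_secs = sum(entry['duration']
                 for entry in read_manifest('manifest-thchs30.train-clean'))
print("Total training audio: %.1f hours" % (total_secs / 3600.0))
```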