Merge pull request #287 from pkuyym/fix-244

Fix 244

Merge pull request #287 from pkuyym/fix-244
Fix 244
4f785a7b · Xinghai Sun · GitHub · c66a40d7 · e9a42044 · 4f785a7b
5 changed files
--- a/data/aishell/aishell.py
+++ b/data/aishell/aishell.py
+"""Prepare Aishell mandarin dataset
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import codecs
+import soundfile
+import json
+import argparse
+from data_utils.utility import download, unpack
+# Default root directory for speech datasets ('~' is expanded here).
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+# URL of the Aishell archive and the MD5 value handed to download()
+# together with it.
+URL_ROOT = 'http://www.openslr.org/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+# Command-line interface; the module docstring is used as the description.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# NOTE: arguments are parsed at module level, i.e. as soon as this file is
+# executed or imported.
+args = parser.parse_args()
+def create_manifest(data_dir, manifest_path_prefix):
+    """Create one manifest file for each data split (train, dev, test).
+
+    Every manifest line is a json object with the keys 'audio_filepath',
+    'duration' (number of audio frames divided by the sample rate) and
+    'text' (the transcript with all whitespace removed). Audio files that
+    have no entry in the transcript file are skipped.
+
+    :param data_dir: Directory containing
+                     'transcript/aishell_transcript_v0.8.txt' and the audio
+                     files under 'wav/train', 'wav/dev' and 'wav/test'.
+    :param manifest_path_prefix: Output path prefix; the manifest of each
+                                 split is written to '<prefix>.<split>'.
+    """
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    # Open through a context manager so the transcript file is always closed.
+    with codecs.open(transcript_path, 'r', 'utf-8') as ftranscript:
+        for line in ftranscript:
+            line = line.strip()
+            if line == '':
+                continue
+            audio_id, text = line.split(' ', 1)
+            # remove whitespace
+            text = ''.join(text.split())
+            transcript_dict[audio_id] = text
+    for data_type in ['train', 'dev', 'test']:
+        # Start from an empty list for every split; a list shared across
+        # splits would leak the train entries into the dev manifest and the
+        # train + dev entries into the test manifest.
+        json_lines = []
+        audio_dir = os.path.join(data_dir, 'wav', data_type)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            # Sort the file names so the manifest order is reproducible.
+            for fname in sorted(filelist):
+                audio_path = os.path.join(subfolder, fname)
+                # Drop the last four characters (the extension, e.g. '.wav').
+                audio_id = fname[:-4]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.' + data_type
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Fetch and extract the data set if needed, then write the manifests.
+
+    When '<target_dir>/data_aishell' already exists, downloading and
+    unpacking are skipped and only the manifests are (re)created.
+    """
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if os.path.exists(data_dir):
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    else:
+        archive_path = download(url, md5sum, target_dir)
+        unpack(archive_path, target_dir)
+        # the audio is shipped as nested tar files: extract each one next to
+        # itself and remove it afterwards
+        wav_root = os.path.join(data_dir, 'wav')
+        for folder, _, names in sorted(os.walk(wav_root)):
+            for tar_name in names:
+                tar_path = os.path.join(folder, tar_name)
+                unpack(tar_path, folder, True)
+    create_manifest(data_dir, manifest_path)
+def main():
+    """Expand a leading '~' in --target_dir and prepare the Aishell data."""
+    target_dir = args.target_dir
+    if target_dir.startswith('~'):
+        target_dir = os.path.expanduser(target_dir)
+        args.target_dir = target_dir
+    prepare_dataset(
+        target_dir=target_dir,
+        manifest_path=args.manifest_prefix,
+        url=DATA_URL,
+        md5sum=MD5_DATA)
+
+
+if __name__ == "__main__":
+    main()
--- a/data/librispeech/librispeech.py
+++ b/data/librispeech/librispeech.py
@@ -12,12 +12,11 @@ from __future__ import print_function
 import distutils.util
 import os
 import sys
-import tarfile
 import argparse
 import soundfile
 import json
 import codecs
-from paddle.v2.dataset.common import md5file
+from data_utils.utility import download, unpack
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
@@ -57,31 +56,6 @@ parser.add_argument(
 args = parser.parse_args()
-def download(url, md5sum, target_dir):
-    """Download file from url to target_dir, and check md5sum.
-    """
-    if not os.path.exists(target_dir): os.makedirs(target_dir)
-    filepath = os.path.join(target_dir, url.split("/")[-1])
-    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
-        print("Downloading %s ..." % url)
-        os.system("wget -c " + url + " -P " + target_dir)
-        print("\nMD5 Chesksum %s ..." % filepath)
-        if not md5file(filepath) == md5sum:
-            raise RuntimeError("MD5 checksum failed.")
-    else:
-        print("File exists, skip downloading. (%s)" % filepath)
-    return filepath
-def unpack(filepath, target_dir):
-    """Unpack the file to the target_dir.
-    """
-    print("Unpacking %s ..." % filepath)
-    tar = tarfile.open(filepath)
-    tar.extractall(target_dir)
-    tar.close()
 def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
@@ -129,7 +103,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
-    args.target_dir = os.path.expanduser(args.target_dir)
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
    prepare_dataset(
        url=URL_TEST_CLEAN,

--- a/data_utils/utility.py
+++ b/data_utils/utility.py
@@ -5,6 +5,9 @@ from __future__ import print_function
 import json
 import codecs
+import os
+import tarfile
+from paddle.v2.dataset.common import md5file
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
@@ -33,3 +36,28 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum.
+
+    The download is skipped when the file already exists in target_dir and
+    md5file() of it equals md5sum.
+
+    :param url: URL to fetch; its last '/'-separated component is used as
+                the local file name.
+    :param md5sum: Expected checksum, compared against md5file(filepath).
+    :param target_dir: Directory to store the file in; created if missing.
+    :return: Path of the downloaded (or already present) file.
+    :raises RuntimeError: If the file is missing after the download attempt
+                          or its checksum does not equal md5sum.
+    """
+    # Local import: keeps this function self-contained.
+    import subprocess
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        # Pass the arguments as a list (no shell involved) so that a URL or
+        # directory containing spaces or shell metacharacters is handed to
+        # wget verbatim instead of being interpreted by the shell.
+        subprocess.call(["wget", "-c", url, "-P", target_dir])
+        # Report a failed download explicitly instead of failing later
+        # while trying to checksum a file that does not exist.
+        if not os.path.exists(filepath):
+            raise RuntimeError("Download failed: %s" % url)
+        print("\nMD5 Chesksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir.
+
+    :param filepath: Path of the tar archive to extract.
+    :param target_dir: Directory the archive members are extracted into.
+    :param rm_tar: If true, delete the archive after it has been extracted.
+    """
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    # try/finally guarantees the archive is closed even if extraction fails.
+    try:
+        tar.extractall(target_dir)
+    finally:
+        tar.close()
+    if rm_tar:
+        os.remove(filepath)
--- a/examples/aishell/run_data.sh
+++ b/examples/aishell/run_data.sh
+#! /usr/bin/env bash
+
+# Prepare the Aishell data: download it and generate the manifests, build
+# the vocabulary, and compute mean and stddev for the normalizer.
+
+# all paths below are relative to the directory two levels up
+pushd ../.. > /dev/null
+
+# download data, generate manifests
+if ! PYTHONPATH=.:$PYTHONPATH python data/aishell/aishell.py \
+    --manifest_prefix='data/aishell/manifest' \
+    --target_dir='~/.cache/paddle/dataset/speech/Aishell'
+then
+    echo "Prepare Aishell failed. Terminated."
+    exit 1
+fi
+
+# build vocabulary
+if ! python tools/build_vocab.py \
+    --count_threshold=0 \
+    --vocab_path='data/aishell/vocab.txt' \
+    --manifest_paths='data/aishell/manifest.train'
+then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
+
+# compute mean and stddev for normalizer
+if ! python tools/compute_mean_std.py \
+    --manifest_path='data/aishell/manifest.train' \
+    --num_samples=2000 \
+    --specgram_type='linear' \
+    --output_path='data/aishell/mean_std.npz'
+then
+    echo "Compute mean and stddev failed. Terminated."
+    exit 1
+fi
+
+echo "Aishell data preparation done."
+exit 0
--- a/examples/librispeech/run_data.sh
+++ b/examples/librispeech/run_data.sh
@@ -3,7 +3,7 @@
 pushd ../.. > /dev/null
 # download data, generate manifests
-python data/librispeech/librispeech.py \
+# Prepend the current directory to the caller's existing PYTHONPATH so that
+# the script can import data_utils.
+PYTHONPATH=.:$PYTHONPATH python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
 --target_dir='~/.cache/paddle/dataset/speech/Libri' \
 --full_download='True'