aishell.py 5.3 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Y
yangyaming 已提交
14 15 16 17 18 19 20
"""Prepare Aishell mandarin dataset

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
21
import argparse
Y
yangyaming 已提交
22 23
import codecs
import json
24 25 26 27 28 29
import os

import soundfile

from utils.utility import download
from utils.utility import unpack
Y
yangyaming 已提交
30 31 32 33

# Default cache directory under the user's home.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Root URL of the archives; the commented-out line is an alternative root
# kept for reference.
URL_ROOT = 'http://www.openslr.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'

# Archive URLs and the MD5 checksums passed to `download`.
DATA_URL = f'{URL_ROOT}/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = f'{URL_ROOT}/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

# Command-line interface; the module docstring doubles as the description.
# NOTE: arguments are parsed at import time and read by `main` via `args`.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/Aishell",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    """Write one JSON-lines manifest and one statistics file per data split.

    The transcript file ``<data_dir>/transcript/aishell_transcript_v0.8.txt``
    is read first; every non-empty line is ``<audio_id> <text>``. Then, for
    each of the ``train``, ``dev`` and ``test`` splits, every file found under
    ``<data_dir>/wav/<split>`` whose id has a transcript is written as one
    JSON object per line (keys ``utt``, ``feat``, ``feat_shape``, ``text``)
    to ``<manifest_path_prefix>.<split>``. Summary statistics of the split
    are written to ``<split>.meta`` in the current working directory.

    Args:
        data_dir: Root directory of the unpacked dataset.
        manifest_path_prefix: Path prefix of the output manifest files.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    # A context manager guarantees the transcript file handle is closed.
    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                continue
            audio_id, text = line.split(' ', 1)
            # Remove whitespace so the text is one contiguous character string.
            text = ''.join(text.split())
            transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                # Strip the extension whatever its length.
                audio_id = os.path.splitext(os.path.basename(fname))[0]
                # Audio without a transcription is skipped.
                if audio_id not in transcript_dict:
                    continue
                # Only the header is needed for the duration; avoid decoding
                # the whole waveform.
                info = soundfile.info(audio_path)
                duration = float(info.frames / info.samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        # Guard the ratios so a split without any usable audio does not raise
        # ZeroDivisionError after its manifest has been written.
        text_per_sec = total_text / total_sec if total_sec else 0.0
        sec_per_utt = total_sec / total_num if total_num else 0.0

        with open(dtype + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{text_per_sec} text/sec", file=f)
            print(f"{sec_per_utt} sec/utt", file=f)

Y
yangyaming 已提交
114

H
Hui Zhang 已提交
115
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
    """Download and unpack one archive, then optionally create manifests.

    The unpacked directory is looked up under ``target_dir`` by the archive's
    base name (``data_aishell`` for ``.../data_aishell.tgz``). Download and
    unpacking are skipped when that directory already exists.

    Args:
        url: URL of the archive to download.
        md5sum: MD5 checksum passed to ``download``.
        target_dir: Directory where the archive is downloaded and unpacked.
        manifest_path: Manifest path prefix; when falsy no manifest is created.
    """
    # Derive the directory from the archive name rather than hard-coding
    # 'data_aishell': otherwise a second archive (e.g. resource_aishell.tgz)
    # is never downloaded once the first one has been unpacked.
    # NOTE(review): assumes each archive unpacks into a directory named after
    # it, as the original code assumed for data_aishell -- confirm for others.
    archive_stem = os.path.basename(url.rstrip('/')).split('.', 1)[0]
    data_dir = os.path.join(target_dir, archive_stem or 'data_aishell')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files (no-op when the archive has no 'wav' dir)
        audio_dir = os.path.join(data_dir, 'wav')
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for ftar in filelist:
                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    if manifest_path:
        create_manifest(data_dir, manifest_path)
Y
yangyaming 已提交
132 133 134 135 136 137 138 139 140 141 142 143


def main():
    """Prepare the data archive (with manifests) and the resource archive."""
    target_dir = args.target_dir
    if target_dir.startswith('~'):
        # Resolve a leading '~' to the user's home directory.
        args.target_dir = os.path.expanduser(target_dir)

    # (url, md5sum, manifest prefix); only the data archive gets manifests.
    jobs = (
        (DATA_URL, MD5_DATA, args.manifest_prefix),
        (RESOURCE_URL, MD5_RESOURCE, None),
    )
    for archive_url, archive_md5, prefix in jobs:
        prepare_dataset(
            url=archive_url,
            md5sum=archive_md5,
            target_dir=args.target_dir,
            manifest_path=prefix)

    print("Data download and manifest prepare done!")

Y
yangyaming 已提交
152 153 154

# Run the preparation only when this file is executed as a script.
if __name__ == '__main__':
    main()