# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

# Local cache directory under which the speech datasets are stored.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Download root for the AISHELL archives (OpenSLR resource 33).
# Alternative mirror: https://openslr.magicdatatech.com/resources/33
URL_ROOT = 'http://www.openslr.org/resources/33'

# Audio + transcript archive and the MD5 checksum passed to the downloader.
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'

# Supplementary resource archive and the MD5 checksum passed to the downloader.
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

# Command-line interface; the module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=f"{DATA_HOME}/Aishell",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)")
# Parsed at module level: main() reads the resulting global ``args``.
args = parser.parse_args()


def _load_transcripts(transcript_path):
    """Read the transcript file into an ``{audio_id: text}`` dict."""
    transcripts = {}
    # ``with`` guarantees the file handle is closed once parsing is done.
    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
        for line in fin:
            parts = line.strip().split(None, 1)
            # Skip blank lines and ids that carry no transcription.
            if len(parts) < 2:
                continue
            audio_id, text = parts
            # Remove whitespace: the text is stored as a plain character
            # sequence.
            transcripts[audio_id] = ''.join(text.split())
    return transcripts


def create_manifest(data_dir, manifest_path_prefix):
    """Create the train/dev/test manifests and their ``.meta`` summaries.

    For every split, ``<data_dir>/wav/<split>`` is walked and each file whose
    name (without extension) has an entry in
    ``<data_dir>/transcript/aishell_transcript_v0.8.txt`` becomes one JSON
    line with the keys ``utt``, ``utt2spk`` (name of the file's parent
    directory), ``feat`` (absolute audio path), ``feat_shape`` (duration in
    seconds) and ``text``. Files without a transcription are skipped.

    Args:
        data_dir: Directory holding the ``transcript`` and ``wav`` folders.
        manifest_path_prefix: Manifests are written to
            ``<manifest_path_prefix>.<split>``; the statistics go to
            ``<split>.meta`` in the same directory.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = _load_transcripts(transcript_path)

    manifest_dir = os.path.dirname(manifest_path_prefix)
    if manifest_dir:
        # Make sure the output location exists before any file is written.
        os.makedirs(manifest_dir, exist_ok=True)

    for dtype in ('train', 'dev', 'test'):
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            # Sort the files as well so the manifest order is reproducible.
            for fname in sorted(filelist):
                audio_id = os.path.splitext(fname)[0]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        # An empty split must not abort the run with ZeroDivisionError.
        text_per_sec = total_text / total_sec if total_sec else 0.0
        sec_per_utt = total_sec / total_num if total_num else 0.0

        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{text_per_sec} text/sec", file=f)
            print(f"{sec_per_utt} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
    """Download, unpack and create manifest file.

    Args:
        url: URL of the archive to download.
        md5sum: MD5 checksum handed to ``download`` together with the URL.
        target_dir: Directory the archive is downloaded to and unpacked in.
        manifest_path: Manifest path prefix; when falsy no manifest is
            created.
    """
    # Each archive gets its own "already unpacked" marker directory, derived
    # from the archive file name ('.../data_aishell.tgz' -> 'data_aishell').
    # A hard-coded 'data_aishell' would make every later archive (e.g. the
    # resource one) look present as soon as the audio data exists.
    # NOTE(review): assumes the archive unpacks into a directory named after
    # itself, as the original code did for data_aishell.tgz -- confirm.
    archive_name = url.rstrip('/').rsplit('/', 1)[-1]
    data_dir = os.path.join(target_dir,
                            archive_name.split('.', 1)[0] or 'data_aishell')

    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'wav')
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for ftar in filelist:
                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    if manifest_path:
        create_manifest(data_dir, manifest_path)


def main():
    """Download both AISHELL archives and create the manifests.

    Reads the module-level ``args``: ``args.target_dir`` is where the
    archives are stored and unpacked, ``args.manifest_prefix`` is the
    manifest path prefix.
    """
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    # Audio and transcripts: download, unpack and write the manifests.
    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix)

    # Resource archive: download and unpack only, no manifest is created.
    prepare_dataset(
        url=RESOURCE_URL,
        md5sum=MD5_RESOURCE,
        target_dir=args.target_dir,
        manifest_path=None)

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()