aishell.py 5.0 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Y
yangyaming 已提交
14 15 16 17 18 19 20
"""Prepare Aishell mandarin dataset

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
21
import argparse
Y
yangyaming 已提交
22 23
import codecs
import json
24 25 26 27 28 29
import os

import soundfile

from utils.utility import download
from utils.utility import unpack
Y
yangyaming 已提交
30 31 32 33

# Local cache directory used as the base of the default --target_dir.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Download location of the dataset archive.
# Alternative mirror: https://openslr.magicdatatech.com/resources/33
URL_ROOT = 'http://www.openslr.org/resources/33'
DATA_URL = f'{URL_ROOT}/data_aishell.tgz'
# Checksum handed to download() together with DATA_URL.
MD5_DATA = '2f494334227864a8a8fec932999db9d8'

# Command-line interface; the module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=f"{DATA_HOME}/Aishell",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)")
# Parsed at import time; main() reads this module-level namespace.
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    """Write one manifest and one ``.meta`` summary per data split.

    The transcript file ``<data_dir>/transcript/aishell_transcript_v0.8.txt``
    is loaded first. Then ``<data_dir>/wav/<split>`` is walked for each of
    the splits ``train``, ``dev`` and ``test``. Every file whose name
    (without extension) has a transcript becomes one JSON line with the keys
    ``utt`` (audio id), ``feat`` (absolute audio path), ``feat_shape``
    (duration in seconds) and ``text`` (transcript with all whitespace
    removed). Files without a transcript are skipped.

    Args:
        data_dir: Root directory of the unpacked dataset; must contain the
            ``transcript`` folder and is expected to contain ``wav``.
        manifest_path_prefix: Output prefix. The manifest of a split is
            written to ``<manifest_path_prefix>.<split>``.

    Side effects:
        Besides the manifests, a ``<split>.meta`` statistics file is written
        to the current working directory for every split.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    # Use a context manager so the transcript file is closed deterministically.
    with codecs.open(transcript_path, 'r', 'utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                continue
            parts = line.split(None, 1)
            if len(parts) != 2:
                # An id without any text cannot yield a manifest entry.
                continue
            audio_id, text = parts
            # Remove whitespace so the text is a plain character sequence.
            transcript_dict[audio_id] = ''.join(text.split())

    for dtype in ('train', 'dev', 'test'):
        json_lines = []
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            # os.walk gives no ordering guarantee for file names; sort them
            # so that repeated runs produce identical manifests.
            for fname in sorted(filelist):
                audio_id = os.path.splitext(fname)[0]
                # Skip audio that has no transcription.
                if audio_id not in transcript_dict:
                    continue
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        # A split without any matched audio must not crash the report.
        text_per_sec = total_text / total_sec if total_sec else 0.0
        sec_per_utt = total_sec / total_num if total_num else 0.0
        with open(dtype + '.meta', 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{text_per_sec} text/sec", file=f)
            print(f"{sec_per_utt} sec/utt", file=f)

Y
yangyaming 已提交
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139

def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Fetch and unpack the dataset if needed, then build the manifests.

    The download/unpack stage only runs when ``<target_dir>/data_aishell``
    does not exist yet; manifest creation always runs.

    Args:
        url: Location of the dataset archive, forwarded to ``download``.
        md5sum: Checksum forwarded to ``download``.
        target_dir: Directory that receives the archive and its contents.
        manifest_path: Manifest prefix forwarded to ``create_manifest``.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
        # The main archive contains further archives under wav/; unpack each
        # of them in place. The walk is fully materialised by sorted() before
        # any unpacking starts.
        wav_root = os.path.join(data_dir, 'wav')
        for folder, _, names in sorted(os.walk(wav_root)):
            for name in names:
                # NOTE(review): the third argument presumably asks unpack()
                # to remove the archive afterwards - confirm in utils.utility.
                unpack(os.path.join(folder, name), folder, True)
    create_manifest(data_dir, manifest_path)


def main():
    """Script entry point: prepare the dataset using the parsed CLI options."""
    target_dir = args.target_dir
    if target_dir.startswith('~'):
        # Resolve a leading '~' that reached the script unexpanded.
        target_dir = os.path.expanduser(target_dir)
        args.target_dir = target_dir

    prepare_dataset(DATA_URL, MD5_DATA, target_dir, args.manifest_prefix)

    print("Data download and manifest prepare done!")

Y
yangyaming 已提交
142 143 144

# Run the preparation pipeline only when executed as a script.
if __name__ == '__main__':
    main()