prepare_data.py 1.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
"""
This tool is used for preparing data for DeepSpeech2 training on paddle cloud.

Steps:
1. Read original manifest and get the local path of sound files.
2. Tar all local sound files into one tar file.
3. Modify original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
"""
W
wanghaoshuang 已提交
12 13 14 15 16
import json
import os
import tarfile
import sys
import argparse
17 18
sys.path.append('../')
from data_utils.utils import read_manifest
W
wanghaoshuang 已提交
19 20 21 22

# Command-line interface. The module docstring doubles as the --help
# description, and every option is a plain string path with a default.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest of target data. (default: %(default)s)")
parser.add_argument(
    "--out_tar_path",
    default="./tmp/cloud.train.tar",
    type=str,
    help="Output tar file path. (default: %(default)s)")
parser.add_argument(
    "--out_manifest_path",
    default="./tmp/cloud.train.manifest",
    type=str,
    help="Manifest of output data. (default: %(default)s)")
# NOTE: arguments are parsed at module load time, so `args` is a
# module-level name that the __main__ guard below reads.
args = parser.parse_args()


def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
    """Pack the sound files named in a manifest into a tar and rewrite the manifest.

    1. According to the manifest, add every sound file to ``out_tar_path``.
    2. Generate a new manifest for the output tar file, in which each
       ``audio_filepath`` holds only the file's base name.

    Args:
        manifest_path: Path of the input manifest, loaded with
            ``read_manifest``. Each entry must provide an
            ``audio_filepath`` key pointing at a local sound file.
        out_tar_path: Path of the tar archive to create (mode ``'w'``, so
            an existing file is overwritten).
        out_manifest_path: Path of the manifest to write, one JSON object
            per line.

    Returns:
        None. The results are the two files written to disk.
    """
    # Load the manifest before creating the archive so that a bad manifest
    # does not leave an empty tar file behind.
    manifest = read_manifest(manifest_path)
    results = []
    # The context manager closes the archive even if adding a file fails.
    with tarfile.open(out_tar_path, 'w') as out_tar:
        for json_data in manifest:
            sound_file = json_data['audio_filepath']
            # Members are stored flat under their base name, so two sound
            # files with the same base name in different directories end up
            # with the same archive name.
            filename = os.path.basename(sound_file)
            out_tar.add(sound_file, arcname=filename)
            json_data['audio_filepath'] = filename
            results.append("%s\n" % json.dumps(json_data))
    # The `with` block closes the manifest; no explicit close() is needed.
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)


# Script entry point: forward the parsed command-line paths to the packer.
if __name__ == '__main__':
    gen_pcloud_data(
        manifest_path=args.manifest_path,
        out_tar_path=args.out_tar_path,
        out_manifest_path=args.out_manifest_path)