upload_data.py 4.4 KB
Newer Older
1
"""This script is for uploading data for DeepSpeech2 training on paddlecloud.

Steps:
1. Read original manifests and extract local sound files.
2. Tar all local sound files into multiple tar files and upload them.
3. Modify original manifests with updated paths in cloud filesystem.
"""
W
wanghaoshuang 已提交
8 9 10
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
11

12 13 14 15 16 17 18
import json
import os
import tarfile
import sys
import argparse
import shutil
from subprocess import call
19 20
import _init_paths
from data_utils.utils import read_manifest
21 22 23

# Command-line interface.
# NOTE: the arguments are parsed at module level, so `args` is populated as
# soon as this file is executed (or imported).
parser = argparse.ArgumentParser(description=__doc__)
# Input and output manifests are paired by position when they are processed,
# so both options should list the same number of paths in the same order.
parser.add_argument(
    "--in_manifest_paths",
    default=[
        "../datasets/manifest.train", "../datasets/manifest.dev",
        "../datasets/manifest.test"
    ],
    type=str,
    nargs='+',
    help="Local filepaths of input manifests to load, pack and upload. "
    "(default: %(default)s)")
parser.add_argument(
    "--out_manifest_paths",
    default=[
        "./cloud.manifest.train", "./cloud.manifest.dev",
        "./cloud.manifest.test"
    ],
    type=str,
    nargs='+',
    help="Local filepaths of modified manifests to write to. "
    "(default: %(default)s)")
parser.add_argument(
    "--cloud_data_dir",
    required=True,
    type=str,
    help="Destination directory on paddlecloud to upload data to.")
parser.add_argument(
    "--num_shards",
    default=10,
    type=int,
    help="Number of parts to split data to. (default: %(default)s)")
parser.add_argument(
    "--local_tmp_dir",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary data. (default: %(default)s)")
args = parser.parse_args()


61 62 63 64 65
def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
                upload_tar_dir, num_shards):
    """Extract and pack sound files listed in the manifest files into multiple
    tar files and upload them to paddlecloud. Besides, generate new manifest
    files with updated paths in paddlecloud.

    Every manifest entry's 'audio_filepath' is added to the current tar shard
    under its basename and rewritten to
    "tar:<upload_tar_dir>/<tar_name>#<basename>". Each finished shard is
    uploaded with pcloud_cp and then deleted locally; each rewritten manifest
    is written to its output path and uploaded as well.

    Args:
        in_manifest_path_list: Local paths of the manifests to read.
        out_manifest_path_list: Local paths of the rewritten manifests. Paired
            with the inputs by position; surplus entries of the longer list
            are ignored.
        local_tmp_dir: Existing local directory where shards are staged.
        upload_tar_dir: PaddleCloud directory receiving shards and manifests.
        num_shards: Requested number of shards; must be at least 1.

    Raises:
        ValueError: if num_shards is smaller than 1.
        IOError: if an upload through pcloud_cp fails.
    """
    if num_shards < 1:
        raise ValueError(
            "num_shards must be a positive integer, got %r." % (num_shards, ))

    # compute total audio number (one manifest line per audio entry)
    total_line = 0
    for manifest_path in in_manifest_path_list:
        with open(manifest_path, 'r') as f:
            total_line += sum(1 for _ in f)
    # NOTE: because of the "+ 1", fewer than num_shards tars may be produced
    # even though every tar name ends in "-of-<num_shards>".
    line_per_tar = (total_line // num_shards) + 1

    # pack and upload shard by shard
    line_count = 0
    tar_file, tar_path, tar_name = None, None, None
    try:
        for manifest_path, out_manifest_path in zip(in_manifest_path_list,
                                                    out_manifest_path_list):
            out_manifest = []
            for json_data in read_manifest(manifest_path):
                sound_filepath = json_data['audio_filepath']
                sound_filename = os.path.basename(sound_filepath)
                if line_count % line_per_tar == 0:
                    # Current shard is full: ship it before opening the next.
                    if tar_file is not None:
                        _ship_shard(tar_file, tar_path, upload_tar_dir)
                    tar_name = 'part-%s-of-%s.tar' % (
                        str(line_count // line_per_tar).zfill(5),
                        str(num_shards).zfill(5))
                    tar_path = os.path.join(local_tmp_dir, tar_name)
                    tar_file = tarfile.open(tar_path, 'w')
                # Members are stored by basename only, so files from different
                # directories that share a name get the same member name.
                tar_file.add(sound_filepath, arcname=sound_filename)
                line_count += 1
                json_data['audio_filepath'] = "tar:%s#%s" % (
                    os.path.join(upload_tar_dir, tar_name), sound_filename)
                out_manifest.append("%s\n" % json.dumps(json_data))
            with open(out_manifest_path, 'w') as f:
                f.writelines(out_manifest)
            pcloud_cp(out_manifest_path, upload_tar_dir)
        # Ship the last, possibly partial, shard. No shard exists at all when
        # the manifests contained no entries.
        if tar_file is not None:
            _ship_shard(tar_file, tar_path, upload_tar_dir)
    finally:
        # Do not leak the archive handle if packing or uploading failed;
        # closing an already closed TarFile is a no-op.
        if tar_file is not None:
            tar_file.close()


def _ship_shard(tar_file, tar_path, upload_tar_dir):
    """Close a finished tar shard, upload it and delete the local copy."""
    tar_file.close()
    pcloud_cp(tar_path, upload_tar_dir)
    os.remove(tar_path)
104 105


106 107 108 109 110 111 112
def pcloud_mkdir(dir):
    """Create the directory `dir` in the PaddleCloud filesystem.

    Runs the `paddlecloud mkdir` command and raises IOError when the command
    returns a non-zero exit status.
    """
    exit_status = call(['paddlecloud', 'mkdir', dir])
    if exit_status == 0:
        return
    raise IOError("PaddleCloud mkdir failed: %s." % dir)


W
wanghaoshuang 已提交
113
def pcloud_cp(src, dst):
    """Copy src from local filesystem to dst in PaddleCloud filesystem,
    or download src from PaddleCloud filesystem to dst in local filesystem.

    Runs the `paddlecloud cp` command and raises IOError when the command
    returns a non-zero exit status.
    """
    if call(['paddlecloud', 'cp', src, dst]) != 0:
        raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
W
wanghaoshuang 已提交
119 120


121
if __name__ == '__main__':
    # Remember whether this run created the scratch directory, so that a
    # directory which already existed (and may hold unrelated files) is never
    # deleted during cleanup.
    created_tmp_dir = not os.path.exists(args.local_tmp_dir)
    if created_tmp_dir:
        os.makedirs(args.local_tmp_dir)
    try:
        pcloud_mkdir(args.cloud_data_dir)

        upload_data(args.in_manifest_paths, args.out_manifest_paths,
                    args.local_tmp_dir, args.cloud_data_dir, args.num_shards)
    finally:
        # Clean up even when the upload fails part-way through.
        if created_tmp_dir:
            shutil.rmtree(args.local_tmp_dir)