upload_data.py 4.8 KB
Newer Older
1 2
"""This script is used for preparing data for DeepSpeech2 training on paddle
cloud.

Steps:
1. Read original manifest and get the local path of sound files.
2. Tar all local sound files into one tar file.
3. Modify original manifest to remove the local path information.

Finally, we will get a tar file and a new manifest.
"""
W
wanghaoshuang 已提交
11 12 13
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
14

15 16 17 18 19 20 21
import json
import os
import tarfile
import sys
import argparse
import shutil
from subprocess import call
22 23
import _init_paths
from data_utils.utils import read_manifest
24 25 26

# File names given to the packed data. The script joins them with both the
# local temporary directory and the PaddleCloud destination directory.
TRAIN_TAR = "cloud.train.tar"
TRAIN_MANIFEST = "cloud.train.manifest"
DEV_TAR = "cloud.dev.tar"
DEV_MANIFEST = "cloud.dev.manifest"
VOCAB_FILE = "vocab.txt"
MEAN_STD_FILE = "mean_std.npz"

# Command-line interface. The module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--train_manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest file path for train data. (default: %(default)s)")
parser.add_argument(
    "--dev_manifest_path",
    default="../datasets/manifest.dev",
    type=str,
    help="Manifest file path for validation data. (default: %(default)s)")
parser.add_argument(
    "--vocab_file",
    default="../datasets/vocab/eng_vocab.txt",
    type=str,
    help="Vocabulary file to be uploaded to paddlecloud. "
    "(default: %(default)s)")
parser.add_argument(
    "--mean_std_file",
    default="../mean_std.npz",
    type=str,
    help="Normalizer's statistics (mean and stddev) file to be uploaded to "
    "paddlecloud. (default: %(default)s)")
# This argument is required and has no default, so the help text must not
# advertise one (it would be rendered as "default: None").
parser.add_argument(
    "--cloud_data_path",
    required=True,
    type=str,
    help="Destination path on paddlecloud. (required)")
parser.add_argument(
    "--local_tmp_path",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary data. (default: %(default)s)")
# NOTE: arguments are parsed at import time; the functions and the script
# body below read this module-level `args`.
args = parser.parse_args()


def pack_data(manifest_path, out_tar_path, out_manifest_path):
    """Pack the sound files of a manifest into a tar file and write a new
    manifest for it.

    1. According to the manifest, tar sound files into out_tar_path.
    2. Generate a new manifest for output tar file, in which every
       'audio_filepath' is replaced by the base name of the sound file.

    :param manifest_path: Path of the original manifest; it is loaded with
                          read_manifest and every entry must provide an
                          'audio_filepath' item.
    :param out_tar_path: Path of the tar file to be created.
    :param out_manifest_path: Path of the new manifest file to be created;
                              one JSON object is written per line.
    """
    manifest = read_manifest(manifest_path)
    results = []
    # The context manager closes the archive even if adding a file fails,
    # so the file handle is never leaked.
    with tarfile.open(out_tar_path, 'w') as out_tar:
        for json_data in manifest:
            sound_file = json_data['audio_filepath']
            # Only the base name is kept inside the archive and the new
            # manifest. NOTE: sound files that share a base name are stored
            # under the same archive name.
            filename = os.path.basename(sound_file)
            out_tar.add(sound_file, arcname=filename)
            json_data['audio_filepath'] = filename
            results.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)


87 88 89 90 91 92 93
def pcloud_mkdir(dir):
    """Create a directory in the PaddleCloud filesystem.

    Runs the `paddlecloud mkdir` command for the given path.

    :param dir: Path of the directory to create.
    :raises IOError: If the command exits with a non-zero status.
    """
    command = ['paddlecloud', 'mkdir', dir]
    status = call(command)
    if status != 0:
        raise IOError("PaddleCloud mkdir failed: %s." % dir)


W
wanghaoshuang 已提交
94
def pcloud_cp(src, dst):
    """Copy src from the local filesystem to dst in the PaddleCloud
    filesystem, or download src from the PaddleCloud filesystem to dst in
    the local filesystem.

    Runs the `paddlecloud cp` command with the two paths.

    :param src: Source path.
    :param dst: Destination path.
    :raises IOError: If the command exits with a non-zero status.
    """
    command = ['paddlecloud', 'cp', src, dst]
    status = call(command)
    if status != 0:
        raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))
W
wanghaoshuang 已提交
100 101 102 103 104 105 106 107 108


def pcloud_exist(path):
    """Check if file or directory exists in PaddleCloud filesystem.

    Runs the `paddlecloud ls` command on the path.

    :param path: Path to look up.
    :return: The exit status of the command, not a boolean. Presumably 0
             means the path was listed successfully (i.e. it exists), so
             callers should compare the result against 0 -- TODO confirm
             against the paddlecloud client.
    """
    return call(['paddlecloud', 'ls', path])


109
if __name__ == '__main__':
    # Destination paths in the PaddleCloud filesystem.
    cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
    cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
    cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST)
    cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR)
    cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
    cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)

    # Paths of the intermediate files in the local temporary directory.
    local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
    local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
    local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST)
    local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR)

    # prepare local and cloud dir
    # Any existing content of the local temporary directory is discarded.
    if os.path.exists(args.local_tmp_path):
        shutil.rmtree(args.local_tmp_path)
    os.makedirs(args.local_tmp_path)
    try:
        pcloud_mkdir(args.cloud_data_path)

        # pack and upload train data
        pack_data(args.train_manifest_path, local_train_tar,
                  local_train_manifest)
        pcloud_cp(local_train_manifest, cloud_train_manifest)
        pcloud_cp(local_train_tar, cloud_train_tar)

        # pack and upload validation data
        pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest)
        pcloud_cp(local_dev_manifest, cloud_dev_manifest)
        pcloud_cp(local_dev_tar, cloud_dev_tar)

        # upload vocab file and mean_std file
        pcloud_cp(args.vocab_file, cloud_vocab_file)
        pcloud_cp(args.mean_std_file, cloud_mean_file)
    finally:
        # Remove the temporary tar/manifest files even when a pack or upload
        # step fails, so that a failed run does not leave them on disk.
        shutil.rmtree(args.local_tmp_path)