"""This tool is used for preparing data for DeepSpeech2 training on paddle cloud.

Steps:
1. Read original manifest and get the local path of sound files.
2. Tar all local sound files into one tar file.
3. Modify original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
"""
W
wanghaoshuang 已提交
11 12 13
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
import json
import os
import tarfile
import sys
import argparse
import shutil
sys.path.append('../')
from data_utils.utils import read_manifest
from subprocess import call

# Base file names of the artifacts handled by this script. The train/test
# names are joined onto both the local temporary directory and the PaddleCloud
# destination directory; the vocab and mean_std names are only used for the
# PaddleCloud destination.
TRAIN_TAR = "cloud.train.tar"
TRAIN_MANIFEST = "cloud.train.manifest"
TEST_TAR = "cloud.test.tar"
TEST_MANIFEST = "cloud.test.manifest"
VOCAB_FILE = "vocab.txt"
MEAN_STD_FILE = "mean_std.npz"

# Command-line interface. The module docstring doubles as the --help
# description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--train_manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest file of train data. (default: %(default)s)")
parser.add_argument(
    "--test_manifest_path",
    default="../datasets/manifest.test",
    type=str,
    help="Manifest file of test data. (default: %(default)s)")
parser.add_argument(
    "--vocab_file",
    default="../datasets/vocab/eng_vocab.txt",
    type=str,
    help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--mean_std_file",
    default="../mean_std.npz",
    type=str,
    help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--cloud_data_path",
    required=True,
    type=str,
    help="Destination path on  paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--local_tmp_path",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary  data. (default: %(default)s)")
# Parse exactly once, after every option has been registered. Parsing before
# --local_tmp_path was added made the script exit with "unrecognized
# arguments" whenever that option was passed on the command line.
args = parser.parse_args()


def pack_data(manifest_path, out_tar_path, out_manifest_path):
    """Pack the sound files listed in a manifest and write a matching manifest.

    1. According to the manifest, tar sound files into out_tar_path.
    2. Generate a new manifest for the output tar file, in which every
       'audio_filepath' is reduced to the file's base name.

    :param manifest_path: Path of the original manifest; it is loaded with
        read_manifest and every entry must provide an 'audio_filepath' key.
    :param out_tar_path: Path of the tar file to create.
    :param out_manifest_path: Path of the manifest to create, one JSON
        object per line.
    """
    results = []
    # The context manager closes the archive even if a sound file is missing
    # or unreadable, so the handle is not leaked on error.
    with tarfile.open(out_tar_path, 'w') as out_tar:
        for json_data in read_manifest(manifest_path):
            sound_file = json_data['audio_filepath']
            filename = os.path.basename(sound_file)
            # Files are stored flat in the archive under their base name.
            # NOTE(review): sound files sharing a base name in different
            # directories would collide here -- confirm names are unique.
            out_tar.add(sound_file, arcname=filename)
            json_data['audio_filepath'] = filename
            results.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)


W
wanghaoshuang 已提交
86 87 88 89 90 91 92 93 94 95 96 97 98 99
def pcloud_cp(src, dst):
    """Copy src from the local filesystem to dst in the PaddleCloud filesystem.

    Runs ``paddlecloud cp src dst`` and returns the command's exit code.
    """
    command = ['paddlecloud', 'cp', src, dst]
    return call(command)


def pcloud_exist(path):
    """Check whether a file or directory exists in the PaddleCloud filesystem.

    Runs ``paddlecloud ls path`` and returns the command's exit code.
    """
    command = ['paddlecloud', 'ls', path]
    return call(command)


100
def _upload(src, dst):
    """Copy src to dst on PaddleCloud and report a failed copy on stderr.

    Returns the exit code of the underlying pcloud_cp call.
    """
    ret = pcloud_cp(src, dst)
    if ret != 0:
        print(
            "Failed to copy %s to %s (exit code %s)." % (src, dst, ret),
            file=sys.stderr)
    return ret


if __name__ == '__main__':
    # Destination paths on PaddleCloud.
    cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
    cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
    cloud_test_manifest = os.path.join(args.cloud_data_path, TEST_MANIFEST)
    cloud_test_tar = os.path.join(args.cloud_data_path, TEST_TAR)
    cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
    cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)

    # Local staging paths for the packed tar files and rewritten manifests.
    local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
    local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
    local_test_manifest = os.path.join(args.local_tmp_path, TEST_MANIFEST)
    local_test_tar = os.path.join(args.local_tmp_path, TEST_TAR)

    # Always start from an empty staging directory.
    if os.path.exists(args.local_tmp_path):
        shutil.rmtree(args.local_tmp_path)
    os.makedirs(args.local_tmp_path)

    try:
        # Each artifact is uploaded only when the existence check returns a
        # non-zero exit code, i.e. the target is not reported as present.

        # train data
        if args.train_manifest_path != "":
            if pcloud_exist(cloud_train_manifest) != 0:
                pack_data(args.train_manifest_path, local_train_tar,
                          local_train_manifest)
                _upload(local_train_manifest, cloud_train_manifest)
                _upload(local_train_tar, cloud_train_tar)

        # test data
        if args.test_manifest_path != "":
            if pcloud_exist(cloud_test_manifest) != 0:
                pack_data(args.test_manifest_path, local_test_tar,
                          local_test_manifest)
                _upload(local_test_manifest, cloud_test_manifest)
                _upload(local_test_tar, cloud_test_tar)

        # vocab file
        if args.vocab_file != "":
            if pcloud_exist(cloud_vocab_file) != 0:
                _upload(args.vocab_file, cloud_vocab_file)

        # mean_std file
        if args.mean_std_file != "":
            if pcloud_exist(cloud_mean_file) != 0:
                _upload(args.mean_std_file, cloud_mean_file)
    finally:
        # Remove the staging directory even when packing or uploading raised,
        # so a failed run does not leave large tar files behind.
        shutil.rmtree(args.local_tmp_path)