# upload_data.py
"""
This tool is used for preparing data for DeepSpeech2 training on paddle cloud.

Steps:
1. Read original manifest and get the local path of sound files.
2. Tar all local sound files into one tar file.
3. Modify original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
"""
import json
import os
import tarfile
import sys
import argparse
import shutil
sys.path.append('../')
from data_utils.utils import read_manifest
from subprocess import call

# Fixed file names of the artifacts handled by this script.
# The tar/manifest names are joined with both --local_tmp_path (where they are
# generated) and --cloud_data_path (where they are copied to).
TRAIN_TAR = "cloud.train.tar"
TRAIN_MANIFEST = "cloud.train.manifest"
TEST_TAR = "cloud.test.tar"
TEST_MANIFEST = "cloud.test.manifest"
# The vocab and mean_std names are only joined with --cloud_data_path; the
# local copies keep whatever path the user passed on the command line.
VOCAB_FILE = "vocab.txt"
MEAN_STD_FILE = "mean_std.npz"

# Command-line interface. The module docstring is reused as the --help
# description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--train_manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest file of train data. (default: %(default)s)")
parser.add_argument(
    "--test_manifest_path",
    default="../datasets/manifest.test",
    type=str,
    help="Manifest file of test data. (default: %(default)s)")
parser.add_argument(
    "--vocab_file",
    default="../datasets/vocab/eng_vocab.txt",
    type=str,
    help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--mean_std_file",
    default="../mean_std.npz",
    type=str,
    help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)")
# Required: argparse exits with a usage error when this option is missing, so
# the empty default is never actually used.
parser.add_argument(
    "--cloud_data_path",
    required=True,
    default="",
    type=str,
    help="Destination path on  paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--local_tmp_path",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary  data. (default: %(default)s)")
# Parse exactly once, after every option has been registered. Parsing before
# --local_tmp_path was added made the parser reject that option as an
# unrecognized argument.
args = parser.parse_args()


def pack_data(manifest_path, out_tar_path, out_manifest_path):
    '''
    Pack the sound files listed in a manifest into one tar file and write a
    new manifest that refers to those files by base name only.

    1. According manifest, tar sound files into out_tar_path
    2. Generate a new manifest for output tar file

    :param manifest_path: Path of the original manifest, loaded with
                          read_manifest(). Each entry is indexed by the
                          'audio_filepath' key and serialized with json.dumps.
    :param out_tar_path: Path of the tar file to create (opened in 'w' mode,
                         so an existing file is overwritten).
    :param out_manifest_path: Path of the manifest to write, one JSON object
                              per line.

    Note: every file is stored in the tar under os.path.basename() of its
    original path, so two entries sharing a base name end up with the same
    member name. Each manifest entry is modified in place.
    '''
    results = []
    # The context manager guarantees the tar handle is closed even when
    # reading the manifest or adding a sound file raises.
    with tarfile.open(out_tar_path, 'w') as out_tar:
        for json_data in read_manifest(manifest_path):
            sound_file = json_data['audio_filepath']
            filename = os.path.basename(sound_file)
            out_tar.add(sound_file, arcname=filename)
            # Drop the local directory part; only the name inside the tar is
            # meaningful once the data is on the cloud.
            json_data['audio_filepath'] = filename
            results.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)


if __name__ == '__main__':
    cloud_train_manifest = "%s/%s" % (args.cloud_data_path, TRAIN_MANIFEST)
    cloud_train_tar = "%s/%s" % (args.cloud_data_path, TRAIN_TAR)
    cloud_test_manifest = "%s/%s" % (args.cloud_data_path, TEST_MANIFEST)
    cloud_test_tar = "%s/%s" % (args.cloud_data_path, TEST_TAR)
    cloud_vocab_file = "%s/%s" % (args.cloud_data_path, VOCAB_FILE)
    cloud_mean_file = "%s/%s" % (args.cloud_data_path, MEAN_STD_FILE)

    local_train_manifest = "%s/%s" % (args.local_tmp_path, TRAIN_MANIFEST)
    local_train_tar = "%s/%s" % (args.local_tmp_path, TRAIN_TAR)
    local_test_manifest = "%s/%s" % (args.local_tmp_path, TEST_MANIFEST)
    local_test_tar = "%s/%s" % (args.local_tmp_path, TEST_TAR)

    if os.path.exists(args.local_tmp_path):
        shutil.rmtree(args.local_tmp_path)
    os.makedirs(args.local_tmp_path)

    ret = 1
    # train data
    if args.train_manifest_path != "":
        ret = call(['paddlecloud', 'ls', cloud_train_manifest])
        if ret != 0:
            print "%s does't exist" % cloud_train_manifest
            pack_data(args.train_manifest_path, local_train_tar,
                      local_train_manifest)
            call([
                'paddlecloud', 'cp', local_train_manifest, cloud_train_manifest
            ])
            call(['paddlecloud', 'cp', local_train_tar, cloud_train_tar])

    # test data
    if args.test_manifest_path != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_test_manifest])
        except Exception:
            ret = 1
        if ret != 0:
            pack_data(args.test_manifest_path, local_test_tar,
                      local_test_manifest)
            call(
                ['paddlecloud', 'cp', local_test_manifest, cloud_test_manifest])
            call(['paddlecloud', 'cp', local_test_tar, cloud_test_tar])

    # vocab file
    if args.vocab_file != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_vocab_file])
        except Exception:
            ret = 1
        if ret != 0:
            call(['paddlecloud', 'cp', args.vocab_file, cloud_vocab_file])

    # mean_std file
    if args.mean_std_file != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_mean_file])
        except Exception:
            ret = 1
        if ret != 0:
            call(['paddlecloud', 'cp', args.mean_std_file, cloud_mean_file])

    os.removedirs(args.local_tmp_path)