diff --git a/deep_speech_2/cloud/_init_paths.py b/deep_speech_2/cloud/_init_paths.py new file mode 100644 index 0000000000000000000000000000000000000000..3305d7488ff1cfb03db7175a53f70c1a107fe52e --- /dev/null +++ b/deep_speech_2/cloud/_init_paths.py @@ -0,0 +1,17 @@ +"""Set up paths for DS2""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = os.path.dirname(__file__) +proj_path = os.path.join(this_dir, '..') +add_path(proj_path) diff --git a/deep_speech_2/cloud/pcloud_submit.sh b/deep_speech_2/cloud/pcloud_submit.sh index 5ecb011bc80e9008ba796177712b57d0869610b4..78172c1a5226630310437ad3e8fc51bd4d0a2769 100644 --- a/deep_speech_2/cloud/pcloud_submit.sh +++ b/deep_speech_2/cloud/pcloud_submit.sh @@ -1,17 +1,22 @@ # Configure input data set in local filesystem -TRAIN_MANIFEST="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev" -TEST_MANIFEST="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev" -VOCAB_FILE="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/vocab/eng_vocab.txt" -MEAN_STD_FILE="/home/work/demo/ds2/pcloud/models/deep_speech_2/mean_std.npz" - +TRAIN_MANIFEST="../datasets/manifest.dev" +DEV_MANIFEST="../datasets/manifest.dev" +VOCAB_FILE="../datasets/vocab/eng_vocab.txt" +MEAN_STD_FILE="../mean_std.npz" # Configure output path in PaddleCloud filesystem -CLOUD_DATA_DIR="/pfs/dlnel/home/demo/deepspeech2/data" -CLOUD_MODEL_DIR="/pfs/dlnel/home/demo/deepspeech2/model" +CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data" +CLOUD_MODEL_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model" +# Configure cloud resources +NUM_CPU=12 +NUM_GPU=4 +NUM_NODE=2 +MEMORY="10Gi" +IS_LOCAL="False" # Pack and upload local data to PaddleCloud filesystem python upload_data.py \ --train_manifest_path=${TRAIN_MANIFEST} \ 
---test_manifest_path=${TEST_MANIFEST} \ +--dev_manifest_path=${DEV_MANIFEST} \ --vocab_file=${VOCAB_FILE} \ --mean_std_file=${MEAN_STD_FILE} \ --cloud_data_path=${CLOUD_DATA_DIR} @@ -21,23 +26,23 @@ then exit 1 fi -JOB_NAME=deepspeech`date +%Y%m%d%H%M%S` +# Submit job to PaddleCloud +JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S` DS2_PATH=${PWD%/*} cp -f pcloud_train.sh ${DS2_PATH} -# Configure computation resource and submit job to PaddleCloud paddlecloud submit \ -image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \ -jobname ${JOB_NAME} \ --cpu 4 \ --gpu 4 \ --memory 10Gi \ --parallelism 2 \ +-cpu ${NUM_CPU} \ +-gpu ${NUM_GPU} \ +-memory ${MEMORY} \ +-parallelism ${NUM_NODE} \ -pscpu 1 \ -pservers 1 \ --psmemory 10Gi \ +-psmemory ${MEMORY} \ -passes 1 \ --entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR}" \ +-entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR} ${NUM_CPU} ${NUM_GPU} ${IS_LOCAL}" \ ${DS2_PATH} rm ${DS2_PATH}/pcloud_train.sh diff --git a/deep_speech_2/cloud/pcloud_train.sh b/deep_speech_2/cloud/pcloud_train.sh index b9a50360ac0ff512706cd82a18ccd5b58f880e7b..21bd43f92e3e5f3b59bd6f28ab133546349d7b7b 100644 --- a/deep_speech_2/cloud/pcloud_train.sh +++ b/deep_speech_2/cloud/pcloud_train.sh @@ -1,28 +1,36 @@ DATA_PATH=$1 MODEL_PATH=$2 +NUM_CPU=$3 +NUM_GPU=$4 +IS_LOCAL=$5 + TRAIN_MANI=${DATA_PATH}/cloud.train.manifest -DEV_MANI=${DATA_PATH}/cloud.test.manifest +DEV_MANI=${DATA_PATH}/cloud.dev.manifest TRAIN_TAR=${DATA_PATH}/cloud.train.tar -DEV_TAR=${DATA_PATH}/cloud.test.tar +DEV_TAR=${DATA_PATH}/cloud.dev.tar VOCAB_PATH=${DATA_PATH}/vocab.txt MEAN_STD_FILE=${DATA_PATH}/mean_std.npz # split train data for each pcloud node python ./cloud/split_data.py \ ---in_manifest_path=$TRAIN_MANI \ ---data_tar_path=$TRAIN_TAR \ ---out_manifest_path='./local.train.manifest' +--in_manifest_path=${TRAIN_MANI} \ +--data_tar_path=${TRAIN_TAR} \ +--out_manifest_path='/local.train.manifest' # split dev data for each pcloud node python 
./cloud/split_data.py \ ---in_manifest_path=$DEV_MANI \ ---data_tar_path=$DEV_TAR \ ---out_manifest_path='./local.test.manifest' +--in_manifest_path=${DEV_MANI} \ +--data_tar_path=${DEV_TAR} \ +--out_manifest_path='/local.dev.manifest' +# run train python train.py \ --use_gpu=1 \ ---mean_std_filepath=$MEAN_STD_FILE \ ---train_manifest_path='./local.train.manifest' \ ---dev_manifest_path='./local.test.manifest' \ ---vocab_filepath=$VOCAB_PATH \ +--trainer_count=${NUM_GPU} \ +--num_threads_data=${NUM_CPU} \ +--is_local=${IS_LOCAL} \ +--mean_std_filepath=${MEAN_STD_FILE} \ +--train_manifest_path='/local.train.manifest' \ +--dev_manifest_path='/local.dev.manifest' \ +--vocab_filepath=${VOCAB_PATH} \ --output_model_dir=${MODEL_PATH} diff --git a/deep_speech_2/cloud/split_data.py b/deep_speech_2/cloud/split_data.py index 6b0754a80cbb9420f5aa0dce60e30ac0c166d5fa..8df194a62bac052a0b49ce4c8993e640fdc9dc88 100644 --- a/deep_speech_2/cloud/split_data.py +++ b/deep_speech_2/cloud/split_data.py @@ -6,6 +6,7 @@ This script should be called in paddle cloud. from __future__ import absolute_import from __future__ import division from __future__ import print_function + import os import json import argparse diff --git a/deep_speech_2/cloud/upload_data.py b/deep_speech_2/cloud/upload_data.py index 3336f722b478658a2751f3a0ac3a4a65ded7fa98..efa9e77c0e1e154cc8245a4444db983a76d82510 100644 --- a/deep_speech_2/cloud/upload_data.py +++ b/deep_speech_2/cloud/upload_data.py @@ -1,30 +1,31 @@ -"""This tool is used for preparing data for DeepSpeech2 trainning on paddle cloud. +"""This script is used for preparing data for DeepSpeech2 training on paddle +cloud. Steps: 1. Read original manifest and get the local path of sound files. 2. Tar all local sound files into one tar file. 3. Modify original manifest to remove the local path information. -Finally, we will get a tar file and a manifest with sound file name, duration -and text. +Finally, we will get a tar file and a new manifest. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function + import json import os import tarfile import sys import argparse import shutil -sys.path.append('../') -from data_utils.utils import read_manifest from subprocess import call +import _init_paths +from data_utils.utils import read_manifest TRAIN_TAR = "cloud.train.tar" TRAIN_MANIFEST = "cloud.train.manifest" -TEST_TAR = "cloud.test.tar" -TEST_MANIFEST = "cloud.test.manifest" +DEV_TAR = "cloud.dev.tar" +DEV_MANIFEST = "cloud.dev.manifest" VOCAB_FILE = "vocab.txt" MEAN_STD_FILE = "mean_std.npz" @@ -33,41 +34,41 @@ parser.add_argument( "--train_manifest_path", default="../datasets/manifest.train", type=str, - help="Manifest file of train data. (default: %(default)s)") + help="Manifest file path for train data. (default: %(default)s)") parser.add_argument( - "--test_manifest_path", - default="../datasets/manifest.test", + "--dev_manifest_path", + default="../datasets/manifest.dev", type=str, - help="Manifest file of test data. (default: %(default)s)") + help="Manifest file path for validation data. (default: %(default)s)") parser.add_argument( "--vocab_file", default="../datasets/vocab/eng_vocab.txt", type=str, - help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)") + help="Vocabulary file to be uploaded to paddlecloud. " + "(default: %(default)s)") parser.add_argument( "--mean_std_file", default="../mean_std.npz", type=str, - help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)") + help="Normalizer's statistics (mean and stddev) file to be uploaded to " + "paddlecloud. (default: %(default)s)") parser.add_argument( "--cloud_data_path", required=True, type=str, - help="Destination path on paddlecloud. (default: %(default)s)") -args = parser.parse_args() - + help="Destination path on paddlecloud. 
(default: %(default)s)") parser.add_argument( "--local_tmp_path", default="./tmp/", type=str, - help="Local directory for storing temporary data. (default: %(default)s)") + help="Local directory for storing temporary data. (default: %(default)s)") args = parser.parse_args() def pack_data(manifest_path, out_tar_path, out_manifest_path): - '''1. According to the manifest, tar sound files into out_tar_path - 2. Generate a new manifest for output tar file - ''' + """1. According to the manifest, tar sound files into out_tar_path. + 2. Generate a new manifest for output tar file. + """ out_tar = tarfile.open(out_tar_path, 'w') manifest = read_manifest(manifest_path) results = [] @@ -83,11 +84,19 @@ def pack_data(manifest_path, out_tar_path, out_manifest_path): out_tar.close() +def pcloud_mkdir(dir): + """Make directory in PaddleCloud filesystem. + """ + if call(['paddlecloud', 'mkdir', dir]) != 0: + raise IOError("PaddleCloud mkdir failed: %s." % dir) + + def pcloud_cp(src, dst): - """Copy src from local filesytem to dst in PaddleCloud filesystem. + """Copy src from local filesystem to dst in PaddleCloud filesystem, + or download src from PaddleCloud filesystem to dst in local filesystem. """ - ret = call(['paddlecloud', 'cp', src, dst]) - return ret + if call(['paddlecloud', 'cp', src, dst]) != 0: + raise IOError("PaddleCloud cp failed: from [%s] to [%s]." 
% (src, dst)) def pcloud_exist(path): @@ -100,48 +109,34 @@ def pcloud_exist(path): if __name__ == '__main__': cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST) cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR) - cloud_test_manifest = os.path.join(args.cloud_data_path, TEST_MANIFEST) - cloud_test_tar = os.path.join(args.cloud_data_path, TEST_TAR) + cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST) + cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR) cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE) cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE) local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST) local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR) - local_test_manifest = os.path.join(args.local_tmp_path, TEST_MANIFEST) - local_test_tar = os.path.join(args.local_tmp_path, TEST_TAR) + local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST) + local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR) + # prepare local and cloud dir if os.path.exists(args.local_tmp_path): shutil.rmtree(args.local_tmp_path) os.makedirs(args.local_tmp_path) + pcloud_mkdir(args.cloud_data_path) + + # pack and upload train data + pack_data(args.train_manifest_path, local_train_tar, local_train_manifest) + pcloud_cp(local_train_manifest, cloud_train_manifest) + pcloud_cp(local_train_tar, cloud_train_tar) + + # pack and upload validation data + pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest) + pcloud_cp(local_dev_manifest, cloud_dev_manifest) + pcloud_cp(local_dev_tar, cloud_dev_tar) - # train data - if args.train_manifest_path != "": - ret = pcloud_exist(cloud_train_manifest) - if ret != 0: - pack_data(args.train_manifest_path, local_train_tar, - local_train_manifest) - pcloud_cp(local_train_manifest, cloud_train_manifest) - pcloud_cp(local_train_tar, cloud_train_tar) - - # test data - if args.test_manifest_path != "": - ret 
= pcloud_exist(cloud_test_manifest) - if ret != 0: - pack_data(args.test_manifest_path, local_test_tar, - local_test_manifest) - pcloud_cp(local_test_manifest, cloud_test_manifest) - pcloud_cp(local_test_tar, cloud_test_tar) - - # vocab file - if args.vocab_file != "": - ret = pcloud_exist(cloud_vocab_file) - if ret != 0: - pcloud_cp(args.vocab_file, cloud_vocab_file) - - # mean_std file - if args.mean_std_file != "": - ret = pcloud_exist(cloud_mean_file) - if ret != 0: - pcloud_cp(args.mean_std_file, cloud_mean_file) + # upload vocab file and mean_std file + pcloud_cp(args.vocab_file, cloud_vocab_file) + pcloud_cp(args.mean_std_file, cloud_mean_file) shutil.rmtree(args.local_tmp_path) diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py index 0d4e2508dddf5cc6834b4f61f0c2cc8deee405af..379e364c9266dd3fd6aab72a1cf49b1b491c23e9 100644 --- a/deep_speech_2/train.py +++ b/deep_speech_2/train.py @@ -127,6 +127,12 @@ parser.add_argument( type=str, help="Augmentation configuration in json-format. " "(default: %(default)s)") +parser.add_argument( + "--is_local", + default=True, + type=distutils.util.strtobool, + help="Set to false if running with pserver in paddlecloud. " + "(default: %(default)s)") args = parser.parse_args() @@ -178,7 +184,10 @@ def train(): def main(): utils.print_arguments(args) - paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + paddle.init( + use_gpu=args.use_gpu, + trainer_count=args.trainer_count, + is_local=args.is_local) train()