From 30344ca8578f5a21ec0362b81a2c42cb0ee73015 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 11 Aug 2017 18:55:43 +0800 Subject: [PATCH] Fix some syntax errors. --- deep_speech_2/cloud/README.md | 30 +++++----- deep_speech_2/cloud/pcloud_submit.sh | 7 ++- deep_speech_2/cloud/pcloud_train.sh | 2 - deep_speech_2/cloud/split_data.py | 6 +- deep_speech_2/cloud/upload_data.py | 82 ++++++++++++++-------------- deep_speech_2/data_utils/data.py | 6 +- 6 files changed, 68 insertions(+), 65 deletions(-) diff --git a/deep_speech_2/cloud/README.md b/deep_speech_2/cloud/README.md index 392088cf..8e7e49f9 100644 --- a/deep_speech_2/cloud/README.md +++ b/deep_speech_2/cloud/README.md @@ -1,12 +1,13 @@ # Run DS2 on PaddleCloud ->Note: Make sure current directory is `models/deep_speech_2/cloud/` +>Note: +>Make sure [PaddleCloud client](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md#%E4%B8%8B%E8%BD%BD%E5%B9%B6%E9%85%8D%E7%BD%AEpaddlecloud) has be installed and current directory is `models/deep_speech_2/cloud/` -## Step1 Configure data set +## Step-1 Configure data set -You can configure your input data and output path in pcloud_submit.sh: +Configure your input data and output path in pcloud_submit.sh: -- `TRAIN_MANIFEST`: Absolute path of train data manifest file in local file system.This file has format as bellow: +- `TRAIN_MANIFEST`: Absolute path of train data manifest file in local file system.This file has format as bellow: ``` {"audio_filepath": "/home/disk1/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac", "duration": 5.855, "text @@ -15,19 +16,18 @@ You can configure your input data and output path in pcloud_submit.sh: ": "nor is mister ..."} ``` -- `TEST_MANIFEST`: Absolute path of train data manifest file in local filesystem.This file has format like TRAIN_MANIFEST. - +- `TEST_MANIFEST`: Absolute path of train data manifest file in local filesystem. This file has format like `TRAIN_MANIFEST`. 
- `VOCAB_FILE`: Absolute path of vocabulary file in local filesytem. -- `MEAN_STD_FILE`: Absolute path of vocabulary file in local filesytem. +- `MEAN_STD_FILE`: Absolute path of normalizer's statistic file in local filesytem. - `CLOUD_DATA_DIR:` Absolute path in PaddleCloud filesystem. We will upload local train data to this directory. - `CLOUD_MODEL_DIR`: Absolute path in PaddleCloud filesystem. PaddleCloud trainer will save model to this directory. +>Note: Upload will be skipped if target file has existed in `CLOUD_DATA_DIR`. ->Note: Upload will be skipped if target file has existed in ${CLOUD_DATA_DIR}. +## Step-2 Configure computation resource -## Step2 Configure computation resource +Configure computation resource in pcloud_submit.sh: -You can configure computation resource in pcloud_submit.sh: ``` # Configure computation resource and submit job to PaddleCloud paddlecloud submit \ @@ -44,10 +44,10 @@ You can configure computation resource in pcloud_submit.sh: -entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR}" \ ${DS2_PATH} ``` -For more information, please refer to[PaddleCloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md#提交任务) +For more information, please refer to [PaddleCloud](https://github.com/PaddlePaddle/cloud/blob/develop/doc/usage_cn.md#提交任务) -## Step3 Configure algorithm options -You can configure algorithm options in pcloud_train.sh: +## Step-3 Configure algorithm options +Configure algorithm options in pcloud_train.sh: ``` python train.py \ --use_gpu=1 \ @@ -65,13 +65,13 @@ cd .. 
python train.py --help ``` -## Step4 Submit job +## Step-4 Submit job ``` $ sh pcloud_submit.sh ``` -## Step5 Get logs +## Step-5 Get logs ``` $ paddlecloud logs -n 10000 deepspeech20170727130129 ``` diff --git a/deep_speech_2/cloud/pcloud_submit.sh b/deep_speech_2/cloud/pcloud_submit.sh index 179d144f..5ecb011b 100644 --- a/deep_speech_2/cloud/pcloud_submit.sh +++ b/deep_speech_2/cloud/pcloud_submit.sh @@ -15,6 +15,11 @@ python upload_data.py \ --vocab_file=${VOCAB_FILE} \ --mean_std_file=${MEAN_STD_FILE} \ --cloud_data_path=${CLOUD_DATA_DIR} +if [ $? -ne 0 ] +then + echo "upload data failed!" + exit 1 +fi JOB_NAME=deepspeech`date +%Y%m%d%H%M%S` DS2_PATH=${PWD%/*} @@ -27,7 +32,7 @@ paddlecloud submit \ -cpu 4 \ -gpu 4 \ -memory 10Gi \ --parallelism 1 \ +-parallelism 2 \ -pscpu 1 \ -pservers 1 \ -psmemory 10Gi \ diff --git a/deep_speech_2/cloud/pcloud_train.sh b/deep_speech_2/cloud/pcloud_train.sh index 64a0fac3..ce184367 100644 --- a/deep_speech_2/cloud/pcloud_train.sh +++ b/deep_speech_2/cloud/pcloud_train.sh @@ -22,8 +22,6 @@ python ./cloud/split_data.py \ python train.py \ --use_gpu=1 \ --trainer_count=4 \ ---batch_size=32 \ ---num_threads_data=4 \ --mean_std_filepath=$MEAN_STD_FILE \ --train_manifest_path='./local.train.manifest' \ --dev_manifest_path='./local.test.manifest' \ diff --git a/deep_speech_2/cloud/split_data.py b/deep_speech_2/cloud/split_data.py index 78bf3174..6b0754a8 100644 --- a/deep_speech_2/cloud/split_data.py +++ b/deep_speech_2/cloud/split_data.py @@ -1,9 +1,11 @@ -""" -This tool is used for splitting data into each node of +"""This tool is used for splitting data into each node of paddle cloud by total trainer count and current trainer id. The meaning of trainer is a instance of k8s cluster. This script should be called in paddle cloud. 
""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import os import json import argparse diff --git a/deep_speech_2/cloud/upload_data.py b/deep_speech_2/cloud/upload_data.py index 75dcf010..3336f722 100644 --- a/deep_speech_2/cloud/upload_data.py +++ b/deep_speech_2/cloud/upload_data.py @@ -1,5 +1,4 @@ -""" -This tool is used for preparing data for DeepSpeech2 trainning on paddle cloud. +"""This tool is used for preparing data for DeepSpeech2 trainning on paddle cloud. Steps: 1. Read original manifest and get the local path of sound files. @@ -9,6 +8,9 @@ Steps: Finally, we will get a tar file and a manifest with sound file name, duration and text. """ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import json import os import tarfile @@ -50,7 +52,6 @@ parser.add_argument( parser.add_argument( "--cloud_data_path", required=True, - default="", type=str, help="Destination path on paddlecloud. (default: %(default)s)") args = parser.parse_args() @@ -64,8 +65,7 @@ args = parser.parse_args() def pack_data(manifest_path, out_tar_path, out_manifest_path): - ''' - 1. According manifest, tar sound files into out_tar_path + '''1. According to the manifest, tar sound files into out_tar_path 2. Generate a new manifest for output tar file ''' out_tar = tarfile.open(out_tar_path, 'w') @@ -83,65 +83,65 @@ def pack_data(manifest_path, out_tar_path, out_manifest_path): out_tar.close() +def pcloud_cp(src, dst): + """Copy src from local filesytem to dst in PaddleCloud filesystem. + """ + ret = call(['paddlecloud', 'cp', src, dst]) + return ret + + +def pcloud_exist(path): + """Check if file or directory exists in PaddleCloud filesystem. 
+ """ + ret = call(['paddlecloud', 'ls', path]) + return ret + + if __name__ == '__main__': - cloud_train_manifest = "%s/%s" % (args.cloud_data_path, TRAIN_MANIFEST) - cloud_train_tar = "%s/%s" % (args.cloud_data_path, TRAIN_TAR) - cloud_test_manifest = "%s/%s" % (args.cloud_data_path, TEST_MANIFEST) - cloud_test_tar = "%s/%s" % (args.cloud_data_path, TEST_TAR) - cloud_vocab_file = "%s/%s" % (args.cloud_data_path, VOCAB_FILE) - cloud_mean_file = "%s/%s" % (args.cloud_data_path, MEAN_STD_FILE) - - local_train_manifest = "%s/%s" % (args.local_tmp_path, TRAIN_MANIFEST) - local_train_tar = "%s/%s" % (args.local_tmp_path, TRAIN_TAR) - local_test_manifest = "%s/%s" % (args.local_tmp_path, TEST_MANIFEST) - local_test_tar = "%s/%s" % (args.local_tmp_path, TEST_TAR) + cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST) + cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR) + cloud_test_manifest = os.path.join(args.cloud_data_path, TEST_MANIFEST) + cloud_test_tar = os.path.join(args.cloud_data_path, TEST_TAR) + cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE) + cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE) + + local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST) + local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR) + local_test_manifest = os.path.join(args.local_tmp_path, TEST_MANIFEST) + local_test_tar = os.path.join(args.local_tmp_path, TEST_TAR) if os.path.exists(args.local_tmp_path): shutil.rmtree(args.local_tmp_path) os.makedirs(args.local_tmp_path) - ret = 1 # train data if args.train_manifest_path != "": - ret = call(['paddlecloud', 'ls', cloud_train_manifest]) + ret = pcloud_exist(cloud_train_manifest) if ret != 0: - print "%s does't exist" % cloud_train_manifest pack_data(args.train_manifest_path, local_train_tar, local_train_manifest) - call([ - 'paddlecloud', 'cp', local_train_manifest, cloud_train_manifest - ]) - call(['paddlecloud', 'cp', local_train_tar, 
cloud_train_tar]) + pcloud_cp(local_train_manifest, cloud_train_manifest) + pcloud_cp(local_train_tar, cloud_train_tar) # test data if args.test_manifest_path != "": - try: - ret = call(['paddlecloud', 'ls', cloud_test_manifest]) - except Exception: - ret = 1 + ret = pcloud_exist(cloud_test_manifest) if ret != 0: pack_data(args.test_manifest_path, local_test_tar, local_test_manifest) - call( - ['paddlecloud', 'cp', local_test_manifest, cloud_test_manifest]) - call(['paddlecloud', 'cp', local_test_tar, cloud_test_tar]) + pcloud_cp(local_test_manifest, cloud_test_manifest) + pcloud_cp(local_test_tar, cloud_test_tar) # vocab file if args.vocab_file != "": - try: - ret = call(['paddlecloud', 'ls', cloud_vocab_file]) - except Exception: - ret = 1 + ret = pcloud_exist(cloud_vocab_file) if ret != 0: - call(['paddlecloud', 'cp', args.vocab_file, cloud_vocab_file]) + pcloud_cp(args.vocab_file, cloud_vocab_file) # mean_std file if args.mean_std_file != "": - try: - ret = call(['paddlecloud', 'ls', cloud_mean_file]) - except Exception: - ret = 1 + ret = pcloud_exist(cloud_mean_file) if ret != 0: - call(['paddlecloud', 'cp', args.mean_std_file, cloud_mean_file]) + pcloud_cp(args.mean_std_file, cloud_mean_file) - os.removedirs(args.local_tmp_path) + shutil.rmtree(args.local_tmp_path) diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py index 1e524b0f..f404b4fa 100644 --- a/deep_speech_2/data_utils/data.py +++ b/deep_speech_2/data_utils/data.py @@ -89,9 +89,6 @@ class DataGenerator(object): self._num_threads = num_threads self._rng = random.Random(random_seed) self._epoch = 0 - # for caching tar files info - self.tar2info = {} - self.tar2object = {} def process_utterance(self, filename, transcript): """Load, augment, featurize and normalize for speech data. @@ -237,6 +234,7 @@ class DataGenerator(object): def _get_file_object(self, file): """Get file object by file path. 
+        If file starts with tar, it will return a tar file object and cache tar file info for next reading request. It will return file directly, if the type of file is not str. @@ -254,7 +252,7 @@ class DataGenerator(object): return local_data.tar2object[tarpath].extractfile( local_data.tar2info[tarpath][filename]) else: - return open(file) + return open(file, 'r') def _instance_reader_creator(self, manifest): """ -- GitLab