From 88eabacd6ad6a36783fdc6a8ae46f14665b05fcd Mon Sep 17 00:00:00 2001
From: Xinghai Sun <sunxinghai1216@gmail.com>
Date: Tue, 15 Aug 2017 21:52:44 +0800
Subject: [PATCH] Update DS2 cloud training according to review comments.

---
 deep_speech_2/cloud/README.md             | 17 +++++------------
 deep_speech_2/cloud/pcloud_submit.sh      |  8 ++++----
 deep_speech_2/cloud/pcloud_upload_data.sh |  8 ++++----
 deep_speech_2/cloud/upload_data.py        | 13 ++++++++++---
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/deep_speech_2/cloud/README.md b/deep_speech_2/cloud/README.md
index 274fe374..a5be1c42 100755
--- a/deep_speech_2/cloud/README.md
+++ b/deep_speech_2/cloud/README.md
@@ -9,16 +9,9 @@ Provided with several input manifests, `pcloud_upload_data.sh` will pack and upl
 
 Please modify the following arguments in `pcloud_upload_data.sh`:
 
--  `IN_MANIFESTS`： Paths (in local filesystem) of manifest files containing the audio files to be uploaded. Multiple paths can be concatenated with a whitespace delimeter. Lines of manifest files are in the following format:
-
-```
-{"audio_filepath": "/home/disk1/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac", "duration": 5.855, "text
-": "mister quilter is the ..."}
-{"audio_filepath": "/home/disk1/LibriSpeech/dev-clean/1272/128104/1272-128104-0001.flac", "duration": 4.815, "text
-": "nor is mister ..."}
-```
-- `OUT_MANIFESTS`: Paths (in local filesystem) to write the updated output manifest files to. Multiple paths can be concatenated with a whitespace delimeter. The values of `audio_filepath` in the output manifests are jjjjjkknew paths in PaddleCloud filesystem.
-- `CLOUD_DATA_DIR`:  Directory (in PaddleCloud filesystem) to upload the data to.
+- `IN_MANIFESTS`: Paths (in local filesystem) of manifest files containing the audio files to be uploaded. Multiple paths can be concatenated with a whitespace delimiter.
+- `OUT_MANIFESTS`: Paths (in local filesystem) to write the updated output manifest files to. Multiple paths can be concatenated with a whitespace delimiter. The values of `audio_filepath` in the output manifests are updated with cloud filesystem paths.
+- `CLOUD_DATA_DIR`:  Directory (in PaddleCloud filesystem) to upload the data to. Don't forget to replace `USERNAME` in the default directory and make sure that you have the permission to write to it.
 - `NUM_SHARDS`: Number of data shards / parts (in tar files) to be generated when packing and uploading data. Smaller `num_shards` requires larger temoporal local disk space for packing data.
 
 By running:
@@ -28,7 +21,7 @@ sh pcloud_upload_data.sh
 ```
 all the audio files will be uploaded to PaddleCloud filesystem, and you will get modified manifests files in `OUT_MANIFESTS`.
 
-You have to take this step only once, when it is your first time to do the cloud training. Later on, the data is persisitent on the cloud filesystem and is reusable for multple jobs.
+You have to take this step only once, the very first time you do the cloud training. Later on, the data is persistent on the cloud filesystem and reusable for further job submissions.
 
 ## Step 2:  Configure Training
 
@@ -36,7 +29,7 @@ Configure cloud training arguments in `pcloud_submit.sh`, with the following arg
 
 - `TRAIN_MANIFEST`: Manifest filepath (in local filesystem) for training. Notice that the`audio_filepath` should be in cloud filesystem, like those generated by `pcloud_upload_data.sh`.
 - `DEV_MANIFEST`: Manifest filepath (in local filesystem) for validation.
-- `CLOUD_MODEL_DIR`: Directory (in PaddleCloud filesystem) to save the model parameters (checkpoints).
+- `CLOUD_MODEL_DIR`: Directory (in PaddleCloud filesystem) to save the model parameters (checkpoints). Don't forget to replace `USERNAME` in the default directory and make sure that you have the permission to write to it.
 - `BATCH_SIZE`: Training batch size for a single node.
 - `NUM_GPU`: Number of GPUs allocated for a single node.
 - `NUM_NODE`: Number of nodes (machines) allocated for this job.
diff --git a/deep_speech_2/cloud/pcloud_submit.sh b/deep_speech_2/cloud/pcloud_submit.sh
index 35fe54f2..a7fb42cb 100644
--- a/deep_speech_2/cloud/pcloud_submit.sh
+++ b/deep_speech_2/cloud/pcloud_submit.sh
@@ -1,6 +1,6 @@
-TRAIN_MANIFEST="cloud/cloud.manifest.test"
+TRAIN_MANIFEST="cloud/cloud.manifest.train"
 DEV_MANIFEST="cloud/cloud.manifest.dev"
-CLOUD_MODEL_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model"
+CLOUD_MODEL_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/model"
 BATCH_SIZE=256
 NUM_GPU=8
 NUM_NODE=1
@@ -15,11 +15,11 @@ paddlecloud submit \
 -jobname ${JOB_NAME} \
 -cpu ${NUM_GPU} \
 -gpu ${NUM_GPU} \
--memory 10Gi \
+-memory 64Gi \
 -parallelism ${NUM_NODE} \
 -pscpu 1 \
 -pservers 1 \
--psmemory 10Gi \
+-psmemory 64Gi \
 -passes 1 \
 -entry "sh pcloud_train.sh ${TRAIN_MANIFEST} ${DEV_MANIFEST} ${CLOUD_MODEL_DIR} ${NUM_GPU} ${BATCH_SIZE} ${IS_LOCAL}" \
 ${DS2_PATH}
diff --git a/deep_speech_2/cloud/pcloud_upload_data.sh b/deep_speech_2/cloud/pcloud_upload_data.sh
index 1422b8a1..97a0ab18 100644
--- a/deep_speech_2/cloud/pcloud_upload_data.sh
+++ b/deep_speech_2/cloud/pcloud_upload_data.sh
@@ -1,7 +1,7 @@
-IN_MANIFESTS="../datasets/manifest.tmp ../datasets/manifest.dev ../datasets/manifest.test"
-OUT_MANIFESTS="./cloud.manifest.tmp ./cloud.manifest.dev ./cloud.manifest.test"
-CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
-NUM_SHARDS=10
+IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test"
+OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test"
+CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
+NUM_SHARDS=50
 
 python upload_data.py \
 --in_manifest_paths ${IN_MANIFESTS} \
diff --git a/deep_speech_2/cloud/upload_data.py b/deep_speech_2/cloud/upload_data.py
index 66857574..9973f8c7 100644
--- a/deep_speech_2/cloud/upload_data.py
+++ b/deep_speech_2/cloud/upload_data.py
@@ -22,14 +22,20 @@ from data_utils.utils import read_manifest
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--in_manifest_paths",
-    default=["../datasets/manifest.test", "../datasets/manifest.dev"],
+    default=[
+        "../datasets/manifest.train", "../datasets/manifest.dev",
+        "../datasets/manifest.test"
+    ],
     type=str,
     nargs='+',
     help="Local filepaths of input manifests to load, pack and upload."
     "(default: %(default)s)")
 parser.add_argument(
     "--out_manifest_paths",
-    default=["./cloud.manifest.test", "./cloud.manifest.dev"],
+    default=[
+        "./cloud.manifest.train", "./cloud.manifest.dev",
+        "./cloud.manifest.test"
+    ],
     type=str,
     nargs='+',
     help="Local filepaths of modified manifests to write to. "
@@ -91,6 +97,7 @@ def upload_data(in_manifest_path_list, out_manifest_path_list, local_tmp_dir,
             out_manifest.append("%s\n" % json.dumps(json_data))
         with open(out_manifest_path, 'w') as f:
             f.writelines(out_manifest)
+        pcloud_cp(out_manifest_path, upload_tar_dir)
     tar_file.close()
     pcloud_cp(tar_path, upload_tar_dir)
     os.remove(tar_path)
@@ -117,6 +124,6 @@ if __name__ == '__main__':
     pcloud_mkdir(args.cloud_data_dir)
 
     upload_data(args.in_manifest_paths, args.out_manifest_paths,
-                args.local_tmp_dir, args.cloud_data_dir, 10)
+                args.local_tmp_dir, args.cloud_data_dir, args.num_shards)
 
     shutil.rmtree(args.local_tmp_dir)
-- 
GitLab