From 16966dc524b3744edba1dfd38f5498065b861de6 Mon Sep 17 00:00:00 2001 From: tangwei Date: Tue, 12 May 2020 16:44:34 +0800 Subject: [PATCH] add qsub submit --- fleet_rec/core/engine/cluster/cluster.py | 4 +++ models/rank/dnn/backend.yaml | 11 ++++--- models/rank/dnn/submit.sh | 39 ++++++++++++++++++------ 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/fleet_rec/core/engine/cluster/cluster.py b/fleet_rec/core/engine/cluster/cluster.py index ffe54ff7..c533a83a 100644 --- a/fleet_rec/core/engine/cluster/cluster.py +++ b/fleet_rec/core/engine/cluster/cluster.py @@ -58,6 +58,10 @@ class ClusterEngine(Engine): role = envs.get_runtime_environ("engine_role") if role == "MASTER": + worker_script = {} + worker_script["engine_worker"] = self.job_script + envs.set_runtime_environs(worker_script) + self.start_master_procs() elif role == "WORKER": diff --git a/models/rank/dnn/backend.yaml b/models/rank/dnn/backend.yaml index 1f448998..767aae6e 100755 --- a/models/rank/dnn/backend.yaml +++ b/models/rank/dnn/backend.yaml @@ -16,6 +16,11 @@ engine: workspace: "fleetrec.models.rank.dnn" backend: "MPI" + hdfs: + name: "hdfs://nmg01-taihang-hdfs.dmop.baidu.com:54310" + ugi: "fcr,SaK2VqfEDeXzKPor" + output: "/app/ecom/fcr/fanyabo/wadstyleimageq/tangwei12/output_1/" + package: build_script: "{workspace}/package.sh" python: "/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4" @@ -23,11 +28,7 @@ engine: submit: hpc: "/home/tangwei/submit-tieba/smart_client/" - hdfs: "xx" - hout: "xxx" - ugi: "xxxx" + qconf: "/home/tangwei/Plines/imageq/package/my_conf/para.conf" nodes: 10 - before_hook: "" - end_hook: "" scrpit: "{workspace}/submit.sh" \ No newline at end of file diff --git a/models/rank/dnn/submit.sh b/models/rank/dnn/submit.sh index b5dca9bd..3204e754 100644 --- a/models/rank/dnn/submit.sh +++ b/models/rank/dnn/submit.sh @@ -22,6 +22,19 @@ function vars_get_from_env() { function package() { g_run_stage="package" + temp=${engine_temp_path} + echo "package temp dir: " 
${temp} + + cp ${engine_worker} ${temp} + echo "copy job.sh from " ${engine_worker} " to " ${temp} + + mkdir ${temp}/python + cp -r ${engine_package_python}/* ${temp}/python/ + echo "copy python from " ${engine_package_python} " to " ${temp} + + mkdir ${temp}/whl + cp ${engine_package_paddlerec} ${temp}/whl/ + echo "copy " ${engine_package_paddlerec} " to " ${temp}"/whl/" } #----------------------------------------------------------------------------------------------------------------- @@ -50,20 +63,26 @@ function after_submit() { function submit() { g_run_stage="submit" - before_submit + g_job_name="paddle_rec_mpi" + g_hdfs_path=$g_hdfs_path + + g_job_entry="worker.sh" - ${g_hpc_path}/bin/qsub_f \ + ${engine_submit_hpc}/bin/qsub_f \ -N ${g_job_name} \ - --conf ${g_qsub_conf} \ - --hdfs ${g_hdfs_path} \ - --ugi ${g_hdfs_ugi} \ - --hout ${g_hdfs_output} \ - --files ${g_submit_package} \ - -l nodes=${g_job_nodes},walltime=1000:00:00,resource=full ${g_job_entry} + --conf ${engine_submit_qconf} \ + --hdfs ${engine_hdfs_name} \ + --ugi ${engine_hdfs_ugi} \ + --hout ${engine_hdfs_output} \ + --files ${engine_temp_path} \ + -l nodes=${engine_submit_nodes},walltime=1000:00:00,resource=full ${g_job_entry} - after_submit } function main() { - echo "run submit done" + package + + before_submit + submit + after_submit } -- GitLab