diff --git a/fleet_rec/core/engine/cluster/cluster.py b/fleet_rec/core/engine/cluster/cluster.py index c533a83aa617fc7ee95c543884da7daa19e2696d..7136d60ff50575a89d8dced633d74e8abdde1c75 100644 --- a/fleet_rec/core/engine/cluster/cluster.py +++ b/fleet_rec/core/engine/cluster/cluster.py @@ -28,7 +28,6 @@ class ClusterEngine(Engine): def __init_impl__(self): abs_dir = os.path.dirname(os.path.abspath(__file__)) self.submit_script = os.path.join(abs_dir, "master.sh") - self.job_script = os.path.join(abs_dir, "worker.sh") def start_worker_procs(self): default_env = os.environ.copy() @@ -58,10 +57,6 @@ class ClusterEngine(Engine): role = envs.get_runtime_environ("engine_role") if role == "MASTER": - worker_script = {} - worker_script["engine_worker"] = self.job_script - envs.set_runtime_environs(worker_script) - self.start_master_procs() elif role == "WORKER": diff --git a/models/rank/dnn/backend.yaml b/models/rank/dnn/backend.yaml index 767aae6e5161966f636546e4fc2b34ac8a3d5b86..60529faf9fc02bd7cb7a182dc0e2ff2a12ad8af2 100755 --- a/models/rank/dnn/backend.yaml +++ b/models/rank/dnn/backend.yaml @@ -27,8 +27,9 @@ engine: paddlerec: "/home/tangwei/fleet_rec_env/FleetRec" submit: - hpc: "/home/tangwei/submit-tieba/smart_client/" - qconf: "/home/tangwei/Plines/imageq/package/my_conf/para.conf" + hpc: "/home/tangwei/Plines/client/smart_client_khan/" + qconf: "/home/tangwei/Plines/imageq/qsub_f.conf" nodes: 10 - scrpit: "{workspace}/submit.sh" \ No newline at end of file + submit_scrpit: "{workspace}/submit.sh" + job_scrpit: "{workspace}/worker.sh" diff --git a/models/rank/dnn/submit.sh b/models/rank/dnn/submit.sh index 3204e754e50fd0f448e81e34b5193e0e726034cf..c5c7ce82cd77bdfd244efe868abf1c3d6eb1f230 100644 --- a/models/rank/dnn/submit.sh +++ b/models/rank/dnn/submit.sh @@ -25,15 +25,16 @@ function package() { temp=${engine_temp_path} echo "package temp dir: " ${temp} - cp ${engine_worker} ${temp} + cp ${engine_job_scrpit} ${temp} + cp ${engine_submit_qconf} ${temp} echo "copy job.sh from " ${engine_worker} " to " ${temp} - mkdir ${temp}/python - cp -r ${engine_package_python}/* ${temp}/python/ + mkdir -p ${temp}/package/python + cp -r ${engine_package_python}/* ${temp}/package/python/ echo "copy python from " ${engine_package_python} " to " ${temp} - mkdir ${temp}/whl - cp ${engine_package_paddlerec} ${temp}/whl/ + mkdir ${temp}/package/whl + cp ${engine_package_paddlerec} ${temp}/package/whl/ echo "copy " ${engine_package_paddlerec} " to " ${temp}"/whl/" } @@ -68,7 +69,11 @@ function submit() { g_job_entry="worker.sh" - ${$engine_submit_hpc}/bin/qsub_f \ + engine_hdfs_output=${engine_hdfs_output}/`date +%Y%m%d%H%M%S` + + cd ${engine_temp_path} + + ${engine_submit_hpc}/bin/qsub_f \ -N ${g_job_name} \ --conf ${engine_submit_qconf} \ --hdfs ${engine_hdfs_name} \ @@ -85,4 +90,4 @@ function main() { before_submit submit after_submit -} +} \ No newline at end of file diff --git a/fleet_rec/core/engine/cluster/worker.sh b/models/rank/dnn/worker.sh similarity index 93% rename from fleet_rec/core/engine/cluster/worker.sh rename to models/rank/dnn/worker.sh index b287eb15d855a5d359cf35322a0c5220fc6233ae..2211a04a21871f4ecf07a89b3904d72a3ff08708 100644 --- a/fleet_rec/core/engine/cluster/worker.sh +++ b/models/rank/dnn/worker.sh @@ -49,7 +49,10 @@ function user_define_variables() { } function job() { - mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python -u ${g_job_entry} + echo "job" + g_run_stage="job" + + # mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python -u ${g_job_entry} } function main() {