From 21ddc2656e96134db024f2643c2361d4a7e08d30 Mon Sep 17 00:00:00 2001 From: tangwei Date: Wed, 13 May 2020 17:02:38 +0800 Subject: [PATCH] add cluster run --- example/__init__.py | 0 example/mpi/__init__.py | 0 example/mpi/backend.yaml | 35 ++++++++++++++++ example/mpi/submit.sh | 90 ++++++++++++++++++++++++++++++++++++++++ example/mpi/worker.sh | 75 +++++++++++++++++++++++++++++++++ 5 files changed, 200 insertions(+) create mode 100644 example/__init__.py create mode 100644 example/mpi/__init__.py create mode 100755 example/mpi/backend.yaml create mode 100644 example/mpi/submit.sh create mode 100644 example/mpi/worker.sh diff --git a/example/__init__.py b/example/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/example/mpi/__init__.py b/example/mpi/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/example/mpi/backend.yaml b/example/mpi/backend.yaml new file mode 100755 index 00000000..77384db4 --- /dev/null +++ b/example/mpi/backend.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+engine:
+  workspace: "paddlerec.models.rank.dnn"
+  backend: "MPI"
+
+  hdfs:
+    name: "hdfs://nmg01-taihang-hdfs.dmop.baidu.com:54310"
+    ugi: "<hdfs_user>,<hdfs_password>"  # placeholder: never commit real credentials; fill in per deployment
+    output: "/app/ecom/fcr/fanyabo/wadstyleimageq/tangwei12/output_1/"
+
+  package:
+    build_script: "{workspace}/package.sh"
+    python: "/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4"
+    paddlerec: "/home/tangwei/fleet_rec_env/PaddleRec"
+
+  submit:
+    hpc: "/home/tangwei/Plines/client/smart_client_khan/"
+    qconf: "/home/tangwei/Plines/imageq/qsub_f.conf"
+    nodes: 10
+
+  submit_scrpit: "{workspace}/submit.sh"
+  job_scrpit: "{workspace}/worker.sh"  # NOTE(review): "scrpit" spelling matches engine_job_scrpit read by submit.sh; rename both sides together
diff --git a/example/mpi/submit.sh b/example/mpi/submit.sh
new file mode 100644
index 00000000..56b5f879
--- /dev/null
+++ b/example/mpi/submit.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+###################################################
+# Usage: submit.sh
+# Description: MPI submit client -- packages the job files and submits them with qsub_f
+###################################################
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : placeholder for reading arguments from the environment; currently only prints "xx"
+#param : N/A
+#return : 0 -- success; not 0 -- failure
+#-----------------------------------------------------------------------------------------------------------------
+function vars_get_from_env() {
+    echo "xx"
+}
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : stage the job script, qsub conf, python runtime and paddlerec package under ${engine_temp_path}
+#param : N/A (reads engine_temp_path, engine_job_scrpit, engine_submit_qconf, engine_package_python, engine_package_paddlerec)
+#return : 0 -- success; not 0 -- failure (the first failing cp/mkdir aborts packaging)
+#-----------------------------------------------------------------------------------------------------------------
+function package() {
+    g_run_stage="package"
+
+    temp=${engine_temp_path}
+    echo "package temp dir: " "${temp}"
+
+    cp "${engine_job_scrpit}" "${temp}" || return 1
+    cp "${engine_submit_qconf}" "${temp}" || return 1
+    echo "copy job.sh from " "${engine_job_scrpit}" " to " "${temp}"
+
+    mkdir -p "${temp}/package" || return 1
+    cp -r "${engine_package_python}" "${temp}/package/" || return 1
+    echo "copy python from " "${engine_package_python}" " to " "${temp}/package/"
+
+    mkdir -p "${temp}/package/whl" || return 1
+    cp "${engine_package_paddlerec}" "${temp}/package/whl/" || return 1
+    echo "copy " "${engine_package_paddlerec}" " to " "${temp}/package/whl/"
+}
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : hook run before the job is submitted to the cluster; currently only prints its own name
+#param : N/A
+#return : 0 -- success; not 0 -- failure
+#-----------------------------------------------------------------------------------------------------------------
+function before_submit() {
+    echo "before_submit"
+}
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : hook run after the job is submitted to the cluster; currently only prints its own name
+#param : N/A
+#return : 0 -- success; not 0 -- failure
+#-----------------------------------------------------------------------------------------------------------------
+function after_submit() {
+    echo "after_submit"
+}
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : submit the staged job with qsub_f, using a timestamped sub-directory of engine_hdfs_output
+#param : N/A (reads engine_temp_path, engine_submit_hpc, engine_submit_qconf, engine_submit_nodes, engine_hdfs_name, engine_hdfs_ugi, engine_hdfs_output)
+#return : 0 -- success; not 0 -- failure (exit status of qsub_f, or 1 if the temp dir cannot be entered)
+#-----------------------------------------------------------------------------------------------------------------
+function submit() {
+    g_run_stage="submit"
+    g_job_name="paddle_rec_mpi"
+    g_job_entry="worker.sh"
+
+    # Strip one trailing "/" so appending the timestamp does not produce "//".
+    engine_hdfs_output=${engine_hdfs_output%/}/$(date +%Y%m%d%H%M%S)
+
+    cd "${engine_temp_path}" || return 1
+
+    "${engine_submit_hpc}/bin/qsub_f" \
+        -N "${g_job_name}" \
+        --conf "${engine_submit_qconf}" \
+        --hdfs "${engine_hdfs_name}" \
+        --ugi "${engine_hdfs_ugi}" \
+        --hout "${engine_hdfs_output}" \
+        --files ./package \
+        -l nodes="${engine_submit_nodes}",walltime=1000:00:00,resource=full "${g_job_entry}"
+}
+
+function main() {
+    # Stop at the first failing stage. NOTE(review): nothing in this file invokes main -- confirm the caller does.
+    package || return 1
+    before_submit || return 1
+    submit || return 1
+    after_submit
+}
diff --git a/example/mpi/worker.sh b/example/mpi/worker.sh
new file mode 100644
index 00000000..9daf7488
--- /dev/null
+++ 
b/example/mpi/worker.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+###################################################
+# Usage: worker.sh
+# Description: run job on mpi per node
+###################################################
+
+# ---------------------------------------------------------------------------- #
+#                               variable define                                #
+# ---------------------------------------------------------------------------- #
+declare g_curPath=""
+declare g_scriptName=""
+declare g_workPath=""
+declare g_run_stage=""
+
+# ---------------------------------------------------------------------------- #
+#                                 const define                                 #
+# ---------------------------------------------------------------------------- #
+# Communicator flags exported to child processes; thread_pool_size is set once (a shadowed earlier "=5" was dropped).
+export FLAGS_communicator_send_queue_size=18
+export FLAGS_communicator_thread_pool_size=20
+export FLAGS_communicator_max_merge_var_num=18
+################################################################################
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : abort the script when the command run immediately before this call failed
+#param : N/A (inspects $? of the preceding command; reports the stage stored in g_run_stage)
+#return : 0 when the preceding command succeeded; otherwise the script exits with status 1
+#-----------------------------------------------------------------------------------------------------------------
+function check_error() {
+    if [ ${?} -ne 0 ]; then
+        echo "execute ${g_run_stage} raise exception! please check ..."
+        exit 1
+    fi
+}
+
+#-----------------------------------------------------------------------------------------------------------------
+#fun : unpack the shipped python runtime, export its paths and install the bundled wheel (mpirun -npernode 1)
+#param : N/A (expects ./package/ with python.tar.gz and whl/ in the current directory)
+#return : 0 -- success; the script exits with status 1 when the wheel install fails
+#-----------------------------------------------------------------------------------------------------------------
+function env_prepare() {
+    g_run_stage="env_prepare"
+    WORKDIR=$(pwd)
+    mpirun -npernode 1 mv package/* ./
+    echo "current:${WORKDIR}"
+
+    mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null
+
+    export PYTHONPATH=$WORKDIR/python/
+    export PYTHONROOT=$WORKDIR/python/
+    # PYTHONPATH and PYTHONROOT name the same directory, so its lib/ is prepended to LIBRARY_PATH only once.
+    export LIBRARY_PATH=$PYTHONROOT/lib:$LIBRARY_PATH
+    export LD_LIBRARY_PATH=$PYTHONPATH/lib:$LD_LIBRARY_PATH
+    export PATH=$PYTHONPATH/bin:$PATH
+
+    python -c "print('heheda')"
+    # NOTE(review): uninstalls "paddle-rec" but installs the fleet_rec wheel -- confirm the package names match.
+    mpirun -npernode 1 python/bin/python -m pip uninstall -y paddle-rec
+    mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
+    check_error
+}
+# Launch paddlerec.run as cluster workers via mpirun (-npernode 2) on the hosts listed in ${PBS_NODEFILE}.
+function run() {
+    echo "run"
+    g_run_stage="run"
+    mpirun -npernode 2 -timestamp-output -tag-output -machinefile "${PBS_NODEFILE}" python/bin/python -u -m paddlerec.run -m paddlerec.models.rank.dnn --engine cluster --role worker
+}
+# Entry point: prepare the environment, then launch the workers; the script's exit status is that of run.
+function main() {
+    env_prepare
+    run
+}
+
+main
-- 
GitLab