diff --git a/core/utils/envs.py b/core/utils/envs.py index 1c3b6c45ef30b62b0cda65c721b94cd9f312baf7..7f6cfb61d3997ce6a673bae10919779ef696b054 100755 --- a/core/utils/envs.py +++ b/core/utils/envs.py @@ -17,6 +17,7 @@ import copy import sys import socket from contextlib import closing + global_envs = {} @@ -176,17 +177,12 @@ def get_platform(): return "WINDOWS" -<< << << < HEAD: fleet_rec/core/utils/envs.py - -== == == = ->>>>>> > upstream/develop: core/utils/envs.py - - def find_free_port(): def __free_port(): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(('', 0)) return s.getsockname()[1] + new_port = __free_port() return new_port diff --git a/models/rank/dnn/backend.yaml b/models/rank/dnn/backend.yaml deleted file mode 100755 index 77384db41d4ab9163ec7b3dcc196358341f7a51a..0000000000000000000000000000000000000000 --- a/models/rank/dnn/backend.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -engine: - workspace: "paddlerec.models.rank.dnn" - backend: "MPI" - - hdfs: - name: "hdfs://nmg01-taihang-hdfs.dmop.baidu.com:54310" - ugi: "fcr,SaK2VqfEDeXzKPor" - output: "/app/ecom/fcr/fanyabo/wadstyleimageq/tangwei12/output_1/" - - package: - build_script: "{workspace}/package.sh" - python: "/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4" - paddlerec: "/home/tangwei/fleet_rec_env/PaddleRec" - - submit: - hpc: "/home/tangwei/Plines/client/smart_client_khan/" - qconf: "/home/tangwei/Plines/imageq/qsub_f.conf" - nodes: 10 - - submit_scrpit: "{workspace}/submit.sh" - job_scrpit: "{workspace}/worker.sh" diff --git a/models/rank/dnn/submit.sh b/models/rank/dnn/submit.sh deleted file mode 100644 index 56b5f8798f0e4181dfd54d9e831078e4b1533d39..0000000000000000000000000000000000000000 --- a/models/rank/dnn/submit.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: submit.sh -# Description: run mpi submit clinet implement -################################################### - -#----------------------------------------------------------------------------------------------------------------- -#fun : get argument from env, set it into variables -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function vars_get_from_env() { - echo "xx" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : package -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function package() { - g_run_stage="package" - - temp=${engine_temp_path} - echo "package temp dir: " ${temp} - - cp ${engine_job_scrpit} ${temp} - cp ${engine_submit_qconf} ${temp} - echo "copy job.sh from " ${engine_worker} " to " ${temp} - - mkdir -p ${temp}/package - cp -r ${engine_package_python} ${temp}/package/ - echo "copy python from " ${engine_package_python} " to " ${temp} - - mkdir ${temp}/package/whl - cp ${engine_package_paddlerec} ${temp}/package/whl/ - echo "copy " ${engine_package_paddlerec} " to " ${temp}"/whl/" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : before hook submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function before_submit() { - echo "before_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : after hook submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function after_submit() { - echo "after_submit" -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : submit to cluster -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function submit() { - g_run_stage="submit" - g_job_name="paddle_rec_mpi" - g_hdfs_path=$g_hdfs_path - g_job_entry="worker.sh" - - engine_hdfs_output=${engine_hdfs_output}/$(date +%Y%m%d%H%M%S) - - cd ${engine_temp_path} - - ${engine_submit_hpc}/bin/qsub_f \ - -N ${g_job_name} \ - --conf ${engine_submit_qconf} \ - --hdfs ${engine_hdfs_name} \ - --ugi ${engine_hdfs_ugi} \ - --hout ${engine_hdfs_output} \ - --files ./package \ - -l nodes=${engine_submit_nodes},walltime=1000:00:00,resource=full ${g_job_entry} -} - -function main() { - package - - before_submit - submit - after_submit -} diff --git a/models/rank/dnn/worker.sh b/models/rank/dnn/worker.sh deleted file mode 100644 index 9daf7488d86116460ec4857aa7a752b6af07abd0..0000000000000000000000000000000000000000 --- a/models/rank/dnn/worker.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -################################################### -# Usage: job.sh -# Description: run job on mpi per node -################################################### - -# ---------------------------------------------------------------------------- # -# variable define # -# ---------------------------------------------------------------------------- # -declare g_curPath="" -declare g_scriptName="" -declare g_workPath="" -declare g_run_stage="" - -# ---------------------------------------------------------------------------- # -# const define # -# ---------------------------------------------------------------------------- # -export FLAGS_communicator_thread_pool_size=5 -export FLAGS_communicator_send_queue_size=18 -export FLAGS_communicator_thread_pool_size=20 -export FLAGS_communicator_max_merge_var_num=18 -################################################################################ - -#----------------------------------------------------------------------------------------------------------------- -#fun : check function return code -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function check_error() { - if [ ${?} -ne 0 ]; then - echo "execute " + $g_run_stage + " raise exception! please check ..." - exit 1 - fi -} - -#----------------------------------------------------------------------------------------------------------------- -#fun : check function return code -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function env_prepare() { - g_run_stage="env_prepare" - WORKDIR=$(pwd) - mpirun -npernode 1 mv package/* ./ - echo "current:"$WORKDIR - - mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null - - export PYTHONPATH=$WORKDIR/python/ - export PYTHONROOT=$WORKDIR/python/ - export LIBRARY_PATH=$PYTHONPATH/lib:$LIBRARY_PATH - export LD_LIBRARY_PATH=$PYTHONPATH/lib:$LD_LIBRARY_PATH - export PATH=$PYTHONPATH/bin:$PATH - export LIBRARY_PATH=$PYTHONROOT/lib:$LIBRARY_PATH - - python -c "print('heheda')" - - mpirun -npernode 1 python/bin/python -m pip uninstall -y paddle-rec - mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com - check_error -} - -function run() { - echo "run" - g_run_stage="run" - mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python/bin/python -u -m paddlerec.run -m paddlerec.models.rank.dnn --engine cluster --role worker -} - -function main() { - env_prepare - run -} - -main