From 9438cb36d9f7842f397568b1fd5f726b0a4ea075 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 13 May 2020 11:52:55 +0800 Subject: [PATCH] bug fix --- fleet_rec/core/trainers/cluster_trainer.py | 4 +++- fleet_rec/core/trainers/transpiler_trainer.py | 3 ++- fleet_rec/run.py | 8 +++---- models/rank/dnn/submit.sh | 4 ++-- models/rank/dnn/worker.sh | 23 ++++++++++++++----- 5 files changed, 28 insertions(+), 14 deletions(-) diff --git a/fleet_rec/core/trainers/cluster_trainer.py b/fleet_rec/core/trainers/cluster_trainer.py index 4635d910..bc5a9173 100755 --- a/fleet_rec/core/trainers/cluster_trainer.py +++ b/fleet_rec/core/trainers/cluster_trainer.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from paddle.fluid.incubate.fleet.base.role_maker import PaddleCloudRoleMaker +from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker from fleetrec.core.utils import envs from fleetrec.core.trainers.transpiler_trainer import TranspileTrainer @@ -30,7 +31,8 @@ from fleetrec.core.trainers.transpiler_trainer import TranspileTrainer class ClusterTrainer(TranspileTrainer): def processor_register(self): - role = PaddleCloudRoleMaker() + #role = PaddleCloudRoleMaker() + role = MPISymetricRoleMaker() fleet.init(role) if fleet.is_server(): diff --git a/fleet_rec/core/trainers/transpiler_trainer.py b/fleet_rec/core/trainers/transpiler_trainer.py index 96afab21..683b921e 100755 --- a/fleet_rec/core/trainers/transpiler_trainer.py +++ b/fleet_rec/core/trainers/transpiler_trainer.py @@ -72,7 +72,8 @@ class TranspileTrainer(Trainer): train_data_path = envs.get_global_env( "test_data_path", None, namespace) - threads = int(envs.get_runtime_environ("train.trainer.threads")) + #threads = int(envs.get_runtime_environ("train.trainer.threads")) + threads = 2 batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/fleet_rec/run.py b/fleet_rec/run.py index 34adbcd7..aa8d5965 100755 --- a/fleet_rec/run.py +++ b/fleet_rec/run.py @@ -110,7 +110,6 @@ def single_engine(args): def cluster_engine(args): - from fleetrec.core.engine.cluster.cluster import ClusterEngine def update_workspace(cluster_envs): workspace = cluster_envs.get("engine_workspace", None) @@ -131,6 +130,7 @@ def cluster_engine(args): cluster_envs[name] = value def master(): + from fleetrec.core.engine.cluster.cluster import ClusterEngine with open(args.backend, 'r') as rb: _envs = yaml.load(rb.read(), Loader=yaml.FullLoader) @@ -155,10 +155,10 @@ def cluster_engine(args): print("launch {} engine with cluster to with model: {}".format(trainer, args.model)) set_runtime_envs(cluster_envs, args.model) - launch = LocalClusterEngine(cluster_envs, args.model) - return launch + trainer = TrainerFactory.create(args.model) + return trainer - if args.role == "worker": + if args.role == "WORKER": return worker() else: return master() diff --git a/models/rank/dnn/submit.sh b/models/rank/dnn/submit.sh index 1a11360c..56b5f879 100644 --- a/models/rank/dnn/submit.sh +++ b/models/rank/dnn/submit.sh @@ -29,8 +29,8 @@ function package() { cp ${engine_submit_qconf} ${temp} echo "copy job.sh from " ${engine_worker} " to " ${temp} - mkdir -p ${temp}/package/python - cp -r ${engine_package_python}/* ${temp}/package/python/ + mkdir -p ${temp}/package + cp -r ${engine_package_python} ${temp}/package/ echo "copy python from " ${engine_package_python} " to " ${temp} mkdir ${temp}/package/whl diff --git a/models/rank/dnn/worker.sh b/models/rank/dnn/worker.sh index 57cf12c9..3ca2a1f0 100644 --- a/models/rank/dnn/worker.sh +++ b/models/rank/dnn/worker.sh @@ -16,10 +16,10 @@ declare g_run_stage="" # ---------------------------------------------------------------------------- # # const define # # ---------------------------------------------------------------------------- # -declare -r FLAGS_communicator_thread_pool_size=5 -declare -r FLAGS_communicator_send_queue_size=18 -declare -r FLAGS_communicator_thread_pool_size=20 -declare -r FLAGS_communicator_max_merge_var_num=18 +export FLAGS_communicator_thread_pool_size=5 +export FLAGS_communicator_send_queue_size=18 +export FLAGS_communicator_thread_pool_size=20 +export FLAGS_communicator_max_merge_var_num=18 ################################################################################ #----------------------------------------------------------------------------------------------------------------- @@ -44,9 +44,20 @@ function env_prepare() { WORKDIR=$(pwd) mpirun -npernode 1 mv package/* ./ echo "current:"$WORKDIR - export LIBRARY_PATH=$WORKDIR/python/lib:$LIBRARY_PATH - mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com >/dev/null + mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null + + export PYTHONPATH=$WORKDIR/python/ + export PYTHONROOT=$WORKDIR/python/ + export LIBRARY_PATH=$PYTHONPATH/lib:$LIBRARY_PATH + export LD_LIBRARY_PATH=$PYTHONPATH/lib:$LD_LIBRARY_PATH + export PATH=$PYTHONPATH/bin:$PATH + export LIBRARY_PATH=$PYTHONROOT/lib:$LIBRARY_PATH + + python -c "print('heheda')" + + mpirun -npernode 1 python/bin/python -m pip uninstall -y fleet-rec + mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com check_error } -- GitLab