Commit 4cdab80b authored by tangwei

remove cluster example

Parent 077e682f
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
engine:
  workspace: "/home/tangwei/fleet_rec_env/paddlerec/example/cloud"
  backend: "PaddleCloud"

  submit:
    sk: "2907445932295c57bffab28676fc5a66"
    ak: "55cb2027d7a05b9f8b213de483698cc8"
    version: "paddle-fluid-v1.7.1"
    priority: "high"
    jobname: "paddlerec_ctr_dnn"
    group: "paddle_benchmark"
    cluster: "SZWG-SUNWAY"

    config: "{workspace}/config.ini"
    nodes: 8

    submit_scrpit: "{workspace}/submit.sh"
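
# Note: "{workspace}" placeholders are expanded by PaddleRec when this file is
# loaded, e.g. config: "{workspace}/config.ini" resolves to
# /home/tangwei/fleet_rec_env/paddlerec/example/cloud/config.ini. The keys under
# "engine:" surface in the submit scripts as environment variables prefixed with
# "engine_" (e.g. ${engine_workspace}, ${engine_temp_path} in submit.sh below);
# this mapping is inferred from the scripts in this commit, not from separate
# documentation.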
echo "Run before_hook.sh ..."
../python27-gcc482/bin/python ../python27-gcc482/bin/pip install ./thirdparty/paddle_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
echo "End before_hook.sh ..."
# type of storage cluster
storage_type="hdfs"

# attention: files for training should be put on hdfs
force_reuse_output_path="True"

fs_name="afs://yinglong.afs.baidu.com:9902"
fs_ugi="paddle,paddle"

thirdparty_path="/user/paddle/benchmark/ctr/thirdparty"

# train data path on hdfs
train_data_path="/user/paddle/benchmark/ctr/train_data_paddle/part_1*"

# test data path on hdfs
#test_data_path="/user/paddle/benchmark/ctr/test_data"

# the output directory on hdfs
output_path="/user/paddle/ly"
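
# A hedged sanity check for the paths above, assuming a Hadoop-compatible
# client for this AFS cluster is on PATH (the -D property names follow the
# classic Hadoop generic-option syntax; adjust to your client):
#
#   hadoop fs -D fs.default.name=${fs_name} -D hadoop.job.ugi=${fs_ugi} \
#       -ls ${train_data_path}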
train:
  trainer:
    # for cluster training
    strategy: "async"

  epochs: 10
  workspace: "paddlerec.models.rank.dnn"

  reader:
    batch_size: 512
    class: "{workspace}/../criteo_reader.py"
    train_data_path: "train_data"
    reader_debug_mode: False

  model:
    models: "{workspace}/model.py"
    hyper_parameters:
      sparse_inputs_slots: 27
      sparse_feature_number: 1000001
      sparse_feature_dim: 10
      dense_input_dim: 13
      fc_sizes: [400, 400, 400]
      learning_rate: 0.0001
      optimizer: adam

  save:
    increment:
      dirname: "increment"
      epoch_interval: 2
      save_last: True
    inference:
      dirname: "inference"
      epoch_interval: 4
      save_last: True
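
# Reading the hyper_parameters: Criteo has 13 integer (dense) features and 26
# categorical (sparse) features, so dense_input_dim is 13 and
# sparse_inputs_slots is 27 -- presumably the label slot plus the 26
# categorical slots (an inference from the reader layout, not stated here).
# The shared embedding table is sparse_feature_number x sparse_feature_dim =
# 1000001 x 10 floats, roughly 40 MB in FP32.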
#!/bin/bash
###################################################
# Usage: job.sh
# Description: run the MPI job client implementation
###################################################
# ---------------------------------------------------------------------------- #
# variable define #
# ---------------------------------------------------------------------------- #
export CPU_NUM=16
export GLOG_v=0
export FLAGS_rpc_deadline=300000
# ---------------------------------------------------------------------------- #
python -m paddlerec.run -m paddle_rec_config.yaml -e cluster -r worker
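
# Flag reading (inferred from usage in this commit): -m points paddlerec.run
# at the run config, -e selects the engine ("cluster"), and -r sets this
# node's role ("worker"); `python -m paddlerec.run --help` should list the
# authoritative options.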
#!/bin/bash
###################################################
# Usage: submit.sh
# Description: run the MPI submit client implementation
###################################################
g_package_files=""
#-----------------------------------------------------------------------------------------------------------------
#fun : before hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function before_submit_hook() {
    echo "before_submit"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : after hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function after_submit_hook() {
    echo "after_submit"
}

#-----------------------------------------------------------------------------------------------------------------
#fun : package to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function package() {
    echo "package"
    temp=${engine_temp_path}

    cp ${engine_workspace}/job.sh ${temp}
    cp ${engine_workspace}/before_hook.sh ${temp}
    cp ${engine_run_config} ${temp}/paddle_rec_config.yaml

    g_submitfiles="job.sh before_hook.sh paddle_rec_config.yaml"
    g_run_cmd="sh job.sh"
}
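
# These hooks appear to be sourced by the PaddleCloud backend, which is
# expected to provide ${engine_workspace}, ${engine_run_config} and
# ${engine_temp_path}, then read back g_submitfiles and g_run_cmd (inferred
# from this script, not documented here). A hedged local smoke test of
# package():
#
#   engine_workspace=$PWD engine_run_config=./config.yaml \
#   engine_temp_path=$(mktemp -d) bash -c 'source submit.sh && package'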
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
engine:
  workspace: "paddlerec.models.rank.dnn"
  backend: "MPI"

  hdfs:
    name: "hdfs://nmg01-taihang-hdfs.dmop.baidu.com:54310"
    ugi: "fcr,SaK2VqfEDeXzKPor"
    output: "/app/ecom/fcr/fanyabo/wadstyleimageq/tangwei12/output_1/"

  package:
    build_script: "{workspace}/package.sh"
    python: "/home/tangwei/fleet_rec_env/cpython-2.7.11-ucs4"
    paddlerec: "/home/tangwei/fleet_rec_env/PaddleRec"

  submit:
    hpc: "/home/tangwei/Plines/client/smart_client_khan/"
    qconf: "/home/tangwei/Plines/imageq/qsub_f.conf"
    nodes: 10

    submit_scrpit: "{workspace}/submit.sh"
    job_scrpit: "{workspace}/worker.sh"
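
# The package section lists what gets shipped to the cluster: a relocatable
# CPython under "python" and the PaddleRec wheel, which the job script unpacks
# and installs on each node. submit.hpc points at the HPC client whose
# ${engine_submit_hpc}/bin/qsub_f binary submit.sh invokes below.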
#!/bin/bash
###################################################
# Usage: job.sh
# Description: run the job on each MPI node
###################################################
# ---------------------------------------------------------------------------- #
# variable define #
# ---------------------------------------------------------------------------- #
declare g_curPath=""
declare g_scriptName=""
declare g_workPath=""
declare g_run_stage=""
# ---------------------------------------------------------------------------- #
# const define #
# ---------------------------------------------------------------------------- #
export FLAGS_communicator_send_queue_size=18
export FLAGS_communicator_thread_pool_size=20
export FLAGS_communicator_max_merge_var_num=18
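# The FLAGS_communicator_* variables tune Paddle's distributed communicator
# (send queue depth, thread pool size, and how many variables are merged per
# send); the values here look like benchmark-specific tuning rather than
# defaults.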
################################################################################
#-----------------------------------------------------------------------------------------------------------------
#fun : check function return code
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function check_error() {
    if [ ${?} -ne 0 ]; then
        echo "execute ${g_run_stage} raised an exception! please check ..."
        exit 1
    fi
}
#-----------------------------------------------------------------------------------------------------------------
#fun : prepare the python environment and install paddlerec on each node
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function env_prepare() {
    g_run_stage="env_prepare"
    WORKDIR=$(pwd)
    mpirun -npernode 1 mv package/* ./
    echo "current:"$WORKDIR

    mpirun -npernode 1 tar -zxvf python.tar.gz > /dev/null

    export PYTHONPATH=$WORKDIR/python/
    export PYTHONROOT=$WORKDIR/python/
    export LIBRARY_PATH=$PYTHONPATH/lib:$LIBRARY_PATH
    export LD_LIBRARY_PATH=$PYTHONPATH/lib:$LD_LIBRARY_PATH
    export PATH=$PYTHONPATH/bin:$PATH
    export LIBRARY_PATH=$PYTHONROOT/lib:$LIBRARY_PATH

    mpirun -npernode 1 python/bin/python -m pip uninstall -y paddle-rec
    mpirun -npernode 1 python/bin/python -m pip install whl/fleet_rec-0.0.2-py2-none-any.whl --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
    check_error
}
function run() {
    echo "run"
    g_run_stage="run"
    mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python/bin/python -u -m paddlerec.run -m paddlerec.models.rank.dnn --engine cluster --role worker
}

function main() {
    env_prepare
    run
}
main
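
# Flow of this script: env_prepare runs once per node (-npernode 1) to unpack
# the shipped Python and install the fleet_rec wheel, then run launches two
# trainer processes per node (-npernode 2) across the hosts in
# ${PBS_NODEFILE}, each executing paddlerec.run with --engine cluster
# --role worker.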
#!/bin/bash
###################################################
# Usage: submit.sh
# Description: run the MPI submit client implementation
###################################################
#-----------------------------------------------------------------------------------------------------------------
#fun : get argument from env, set it into variables
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function vars_get_from_env() {
    echo "xx"
}
#-----------------------------------------------------------------------------------------------------------------
#fun : package
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function package() {
    g_run_stage="package"
    temp=${engine_temp_path}
    echo "package temp dir: " ${temp}

    cp ${engine_job_scrpit} ${temp}
    cp ${engine_submit_qconf} ${temp}
    echo "copy job.sh from " ${engine_worker} " to " ${temp}

    mkdir -p ${temp}/package
    cp -r ${engine_package_python} ${temp}/package/
    echo "copy python from " ${engine_package_python} " to " ${temp}

    mkdir ${temp}/package/whl
    cp ${engine_package_paddlerec} ${temp}/package/whl/
    echo "copy " ${engine_package_paddlerec} " to " ${temp}"/whl/"
}
#-----------------------------------------------------------------------------------------------------------------
#fun : before hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function before_submit() {
    echo "before_submit"
}
#-----------------------------------------------------------------------------------------------------------------
#fun : after hook submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function after_submit() {
    echo "after_submit"
}
#-----------------------------------------------------------------------------------------------------------------
#fun : submit to cluster
#param : N/A
#return : 0 -- success; not 0 -- failure
#-----------------------------------------------------------------------------------------------------------------
function submit() {
    g_run_stage="submit"

    g_job_name="paddle_rec_mpi"
    g_job_entry="job.sh"
    engine_hdfs_output=${engine_hdfs_output}/$(date +%Y%m%d%H%M%S)

    cd ${engine_temp_path}

    ${engine_submit_hpc}/bin/qsub_f \
        -N ${g_job_name} \
        --conf ${engine_submit_qconf} \
        --hdfs ${engine_hdfs_name} \
        --ugi ${engine_hdfs_ugi} \
        --hout ${engine_hdfs_output} \
        --files ./package \
        -l nodes=${engine_submit_nodes},walltime=1000:00:00,resource=full ${g_job_entry}
}
function main() {
    package
    before_submit
    submit
    after_submit
}
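
# Submission flow: package() stages job.sh, the qsub_f config, the relocatable
# Python and the wheel into ${engine_temp_path}; submit() then calls qsub_f
# with a timestamped HDFS output directory and ships ./package via --files.
# main is presumably invoked by the engine after this script is sourced; it is
# never called here (an inference, not documented in this commit).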