From d0d14e9d09b8b7e8dae9f42276202af53e1bf297 Mon Sep 17 00:00:00 2001 From: linan17 Date: Tue, 17 Dec 2019 13:48:38 +0800 Subject: [PATCH] update package Change-Id: I6063fd83d5d0a2f641858e965ac373eb82ca974f --- cmake/external/pslib.cmake | 2 +- .../feed/feed_deploy/news_jingpai/job.sh | 9 +---- .../news_jingpai/package/my_nets/clear_ssd.sh | 38 ++++++++++++++++++ .../news_jingpai/package/my_nets/config.py | 10 ++--- .../package/my_nets/reqi_fleet_desc | 4 +- .../package/my_nets/trainer_online.py | 40 ++++++++++++++----- .../feed/feed_deploy/news_jingpai/submit.sh | 4 +- 7 files changed, 81 insertions(+), 26 deletions(-) mode change 100644 => 100755 cmake/external/pslib.cmake mode change 100755 => 100644 paddle/fluid/feed/feed_deploy/news_jingpai/job.sh create mode 100644 paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh mode change 100755 => 100644 paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake old mode 100644 new mode 100755 index 99a1c23b..983d13e3 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -31,7 +31,7 @@ IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE) SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) - SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE) + SET(PSLIB_URL "ftp://yq01-ps-201704-m12-tianqi026.yq01.baidu.com/home/work/pslib_online/pslib.tar.gz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/job.sh b/paddle/fluid/feed/feed_deploy/news_jingpai/job.sh old mode 100755 new mode 100644 index c63d9fec..58aa0428 --- a/paddle/fluid/feed/feed_deploy/news_jingpai/job.sh +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/job.sh @@ -6,16 +6,11 @@ echo 
"current:"$WORKDIR mpirun -npernode 1 mv package/* ./ export LIBRARY_PATH=$WORKDIR/python/lib:$LIBRARY_PATH +export HADOOP_HOME="${WORKDIR}/hadoop-client/hadoop" ulimit -c unlimited -#export FLAGS_check_nan_inf=True -#export check_nan_inf=True - -#FLAGS_check_nan_inf=True check_nan_inf=True - -#mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python/bin/python -u trainer_online.py - +mpirun -npernode 1 bash clear_ssd.sh "${WORKDIR}" mpirun -npernode 2 -timestamp-output -tag-output python/bin/python -u trainer_online.py if [[ $? -ne 0 ]]; then diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh new file mode 100644 index 00000000..a26c21a0 --- /dev/null +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/clear_ssd.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Remove 'database' directories of app ids that no longer have a running process. +function check_appid_valid() { + appid="$1" + num=`echo "${appid}" |awk -F '-' '{print NF}'` + if [ $num -ne 4 ];then + return 1 + fi + return 0 +} +# Status 0 means no process mentions the app id; any other status means keep the directory. +function appid_running_num() { + appid="$1" + proc_num=`ps -ef |grep "${appid}"|grep -v grep|wc -l` + if [ $? -ne 0 ];then + #if the process listing failed, return 1 so the directory is kept + return 1 + fi + return $(( proc_num > 0 )) +} +# $1 is this job's work dir; the search root is its path prefix before 'app-user-'. +work_dir="${1:?usage: clear_ssd.sh <work_dir>}" +base_dir=`echo "${work_dir}" |awk -F 'app-user-' '{print $1}'` +database_list=`find "${base_dir}" -type d -name 'database'` +for element in ${database_list[@]} +do + app_id=`echo "$element"|awk -F 'app-user-' '{print $2}' |awk -F '/' '{print "app-user-"$1}'` + check_appid_valid "${app_id}" + if [ $? -ne 0 ];then + continue + fi + appid_running_num "${app_id}" + if [ $? 
-eq 0 ];then + echo "remove ${element}" + rm -rf -- "${element}" + fi +done + diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py index a73bb4c7..907fb4c1 100644 --- a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/config.py @@ -1,5 +1,4 @@ dataset_type="InMemoryDataset" -sparse_table_storage="ssd" batch_size=32 thread_num=12 shuffle_thread=12 @@ -10,13 +9,13 @@ fs_name="afs://xingtian.afs.baidu.com:9902" fs_ugi="mlarch_pro,proisvip" train_data_path=["afs:/user/feed/mlarch/samplejoin/mondr_shoubai_dnn_master/feasign"] init_model_path="" -days="{20190915..20190930} {20191001..20191031} {20191101..20191130} {20191201..20191231} {20200101..20200131}" +days="{20191201..20191231} {20200101..20200131} {20200201..20200228} {20200301..20200331}" hours="{0..23}" split_interval=5 split_per_pass=2 is_data_hourly_placed=False save_first_base=False -output_path="afs:/user/feed/mlarch/model/feed_muye_ln_paddle" +output_path="afs:/user/feed/mlarch/model/feed_muye_news_paddle" pipe_command="./read_feasign | python/bin/python ins_weight.py | awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict" save_xbox_before_update=True check_exist_seconds=30 @@ -36,5 +35,6 @@ node_memory=100000 mpi_server=yq01-hpc-lvliang01-smart-master.dmop.baidu.com mpi_queue=feed5 mpi_priority=very_high -smart_client_home=/home/work/online_model/news_fsort/submit_jingpai_xiaoliuliang_paddlef50e701_pslibf7995_compile02255_reqi/smart_client/ -local_hadoop_home=/home/work/online_model/news_fsort/submit_jingpai_xiaoliuliang_paddlef50e701_pslibf7995_compile02255_reqi/hadoop-client/hadoop +smart_client_home=/home/work/xiexionghang/news_paddle_online/smart_client/ +local_hadoop_home=/home/work/xiexionghang/news_paddle_online/hadoop-client/hadoop +sparse_table_storage="ssd" diff --git 
a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc index 580da7c4..c0d3ab82 100644 --- a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/reqi_fleet_desc @@ -2,9 +2,9 @@ server_param { downpour_server_param { downpour_table_param { table_id: 0 - table_class: "DownpourSparseSSDTable" + table_class: "DownpourSparseTable" shard_num: 1950 - sparse_table_cache_rate: 0.0035 + sparse_table_cache_rate: 0.00055 accessor { accessor_class: "DownpourCtrAccessor" sparse_sgd_param { diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py index 0ee4ccdf..8f29b42c 100644 --- a/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/package/my_nets/trainer_online.py @@ -19,6 +19,21 @@ fleet_util = FleetUtil() def time_prefix_str(): return "\n" + time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) + "[0]:" +auc_record = {} +def check_auc_ok(auc_label, auc_log, auc_alarm): + auc_datas = auc_log.split(' AUC=') + if len(auc_datas) < 2: + return True + if auc_label not in auc_record: + auc_record[auc_label] = 0.0 + auc = float(auc_datas[1].split(' ')[0]) + if auc < auc_record[auc_label] and auc < auc_alarm: + fleet_util.rank0_print("label:%s, auc:%s, check bad" % (auc_label, auc)) + return False + auc_record[auc_label] = auc + fleet_util.rank0_print("label:%s, auc:%s, check ok" % (auc_label, auc)) + return True + def create_model(slot_file, slot_common_file, all_slot_file): join_common_model = ModelJoinCommon(slot_file, slot_common_file, all_slot_file, 20) update_model = Model(slot_file, all_slot_file, False, 0, True) @@ -444,6 +459,7 @@ if __name__ == "__main__": join_common_model.join_prob.name, 
join_common_model.join_q.name, join_common_model.join_pos.name, join_common_model.join_total.name, "joining pass:")#"join pass:") + check_auc_ok("joining pass:", log_str, 0.79) monitor_data += log_str stdout_str += time_prefix_str() + "joining pass:" stdout_str += time_prefix_str() + log_str @@ -453,6 +469,7 @@ if __name__ == "__main__": join_common_model.common_prob.name, join_common_model.common_q.name, join_common_model.common_pos.name, join_common_model.common_total.name, "common pass:") + check_auc_ok("common pass:", log_str, 0.70) monitor_data += " " + log_str stdout_str += time_prefix_str() + "common pass:" stdout_str += time_prefix_str() + log_str @@ -491,6 +508,7 @@ if __name__ == "__main__": update_model.sqrerr.name, update_model.abserr.name, update_model.prob.name, update_model.q.name, update_model.pos.name, update_model.total.name, "updating pass:")#"update pass:") + check_auc_ok("updating pass:", log_str, 0.79) stdout_str += time_prefix_str() + "updating pass:" stdout_str += time_prefix_str() + log_str fleet_util.rank0_print("End update pass") @@ -540,16 +558,6 @@ if __name__ == "__main__": stdout_str = "" continue - fleet_util.rank0_print("shrink table") - begin = time.time() - fleet.shrink_sparse_table() - fleet.shrink_dense_table(0.98, scope=scope2, table_id=1) - fleet.shrink_dense_table(0.98, scope=scope2, table_id=2) - fleet.shrink_dense_table(0.98, scope=scope3, table_id=3) - end = time.time() - log_str = "shrink table done, cost %s min" % ((end - begin) / 60.0) - fleet_util.rank0_print(log_str) - stdout_str += time_prefix_str() + log_str fleet_util.rank0_print("going to save batch model/base xbox model") last_base_day, last_base_path, _ = fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi) @@ -562,6 +570,18 @@ if __name__ == "__main__": stdout_str += save_delta(nextday, -1, xbox_base_key, cur_path, exe, scope2, scope2, scope3, join_common_model, join_common_model, update_model, join_save_params, 
common_save_params, update_save_params, monitor_data) + + fleet_util.rank0_print("shrink table") + begin = time.time() + fleet.shrink_sparse_table() + fleet.shrink_dense_table(0.98, scope=scope2, table_id=1) + fleet.shrink_dense_table(0.98, scope=scope2, table_id=2) + fleet.shrink_dense_table(0.98, scope=scope3, table_id=3) + end = time.time() + log_str = "shrink table done, cost %s min" % ((end - begin) / 60.0) + fleet_util.rank0_print(log_str) + stdout_str += time_prefix_str() + log_str + begin = time.time() fleet_util.save_batch_model(config.output_path, nextday) fleet_util.write_model_donefile(config.output_path, nextday, -1, xbox_base_key, config.fs_name, config.fs_ugi) diff --git a/paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh b/paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh old mode 100755 new mode 100644 index 062265d2..26715ec6 --- a/paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh +++ b/paddle/fluid/feed/feed_deploy/news_jingpai/submit.sh @@ -9,12 +9,14 @@ cd tmp mkdir ./package cp -r ../package/python ./package cp -r ../package/my_nets/* ./package +cp -r ../hadoop-client_mpi ./package/hadoop-client cp ../qsub_f.conf ./ cp ../job.sh ./ cp ../job.sh ./package if [ "a${sparse_table_storage}" = "assd" ];then - sed -i 's/DownpourSparseTable/DownpourSparseSSDTable' ./package/my_nets/reqi_fleet_desc + sed -i 's/DownpourSparseTable/DownpourSparseSSDTable/g' ./package/reqi_fleet_desc + sed -i 's/sparse_table_cache_rate: 0.00055/sparse_table_cache_rate: 0.0025/g' ./package/reqi_fleet_desc fi current=`date "+%Y-%m-%d %H:%M:%S"` -- GitLab