Commit d0d14e9d authored by linan17

update package

Change-Id: I6063fd83d5d0a2f641858e965ac373eb82ca974f
Parent 0e6740a4
......@@ -31,7 +31,7 @@ IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
MESSAGE(STATUS "use pre defined download url")
SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE)
SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE)
SET(PSLIB_URL "ftp://yq01-ps-201704-m12-tianqi026.yq01.baidu.com/home/work/pslib_online/pslib.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib")
......
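Note: PSLIB_VER, PSLIB_NAME and PSLIB_URL are CACHE variables sitting behind the NOT DEFINED guard above, so the predefined download URL can still be bypassed at configure time without editing this file. A minimal sketch, assuming an out-of-source build directory:

    cmake .. -DPSLIB_VER=0.1.1 -DPSLIB_NAME=pslib \
        -DPSLIB_URL=https://pslib.bj.bcebos.com/pslib.tar.gz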
......@@ -6,16 +6,11 @@ echo "current:"$WORKDIR
mpirun -npernode 1 mv package/* ./
export LIBRARY_PATH=$WORKDIR/python/lib:$LIBRARY_PATH
export HADOOP_HOME="${WORKDIR}/hadoop-client/hadoop"
-ulimit -c unlimited
-#export FLAGS_check_nan_inf=True
-#export check_nan_inf=True
-#FLAGS_check_nan_inf=True check_nan_inf=True
-#mpirun -npernode 2 -timestamp-output -tag-output -machinefile ${PBS_NODEFILE} python/bin/python -u trainer_online.py
+mpirun -npernode 1 sh clear_ssd.sh
mpirun -npernode 2 -timestamp-output -tag-output python/bin/python -u trainer_online.py
if [[ $? -ne 0 ]]; then
......
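The deleted lines were debugging aids: core dumps plus Paddle's NaN/Inf check. To chase down a bad pass they can be restored for a one-off run; a sketch, assuming the bundled Paddle build honors FLAGS_check_nan_inf:

    ulimit -c unlimited
    export FLAGS_check_nan_inf=True
    mpirun -npernode 2 -timestamp-output -tag-output python/bin/python -u trainer_online.py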
#!/bin/bash
function check_appid_valid() {
    # a valid app id has exactly four '-'-separated fields (e.g. app-user-xxxx-yyyy)
    appid="$1"
    num=`echo "${appid}" |awk -F '-' '{print NF}'`
    if [ $num -ne 4 ];then
        return 1
    fi
    return 0
}
function appid_running_num() {
    # count live processes whose command line mentions the app id
    appid="$1"
    proc_num=`ps -ef |grep "${appid}"|grep -v grep|wc -l`
    if [ $? -ne 0 ];then
        # if the ps pipeline itself failed, report 1 so the caller skips deletion
        return 1
    fi
    # note: a function's return value is an 8-bit status, so counts above 255 wrap
    return ${proc_num}
}
work_dir="$1"
base_dir=`echo "${work_dir}" |awk -F 'app-user-' '{print $1}'`
database_list=`find ${base_dir} -type d -name 'database'`
for element in ${database_list[@]}
do
app_id=`echo "$element"|awk -F 'app-user-' '{print $2}' |awk -F '/' '{print "app-user-"$1}'`
check_appid_valid "${app_id}"
if [ $? -ne 0 ];then
continue
fi
appid_running_num "${app_id}"
if [ $? -eq 0 ];then
echo "remove ${element}"
rm -rf ${element}
fi
done
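clear_ssd.sh is invoked once per node by the launcher above (mpirun -npernode 1 sh clear_ssd.sh). It takes a work_dir whose path contains an app-user-<id> segment, treats everything before that segment as the shared cache root, and removes any database directory whose owning app id no longer appears in the process table. A hypothetical invocation, path purely illustrative:

    sh clear_ssd.sh /home/work/ssd_cache/app-user-20200301-1234/workdir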
dataset_type="InMemoryDataset"
-sparse_table_storage="ssd"
batch_size=32
thread_num=12
shuffle_thread=12
......@@ -10,13 +9,13 @@ fs_name="afs://xingtian.afs.baidu.com:9902"
fs_ugi="mlarch_pro,proisvip"
train_data_path=["afs:/user/feed/mlarch/samplejoin/mondr_shoubai_dnn_master/feasign"]
init_model_path=""
days="{20190915..20190930} {20191001..20191031} {20191101..20191130} {20191201..20191231} {20200101..20200131}"
days="{20191201..20191231} {20200101..20200131} {20200201..20200228} {20200301..20200331}"
hours="{0..23}"
split_interval=5
split_per_pass=2
is_data_hourly_placed=False
save_first_base=False
output_path="afs:/user/feed/mlarch/model/feed_muye_ln_paddle"
output_path="afs:/user/feed/mlarch/model/feed_muye_news_paddle"
pipe_command="./read_feasign | python/bin/python ins_weight.py | awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict"
save_xbox_before_update=True
check_exist_seconds=30
......@@ -36,5 +35,6 @@ node_memory=100000
mpi_server=yq01-hpc-lvliang01-smart-master.dmop.baidu.com
mpi_queue=feed5
mpi_priority=very_high
-smart_client_home=/home/work/online_model/news_fsort/submit_jingpai_xiaoliuliang_paddlef50e701_pslibf7995_compile02255_reqi/smart_client/
-local_hadoop_home=/home/work/online_model/news_fsort/submit_jingpai_xiaoliuliang_paddlef50e701_pslibf7995_compile02255_reqi/hadoop-client/hadoop
+smart_client_home=/home/work/xiexionghang/news_paddle_online/smart_client/
+local_hadoop_home=/home/work/xiexionghang/news_paddle_online/hadoop-client/hadoop
+sparse_table_storage="ssd"
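The days/hours values above are bash brace ranges that get expanded numerically; listing one range per month is what keeps impossible dates such as 20191232 out of the schedule. For example:

    echo {20191229..20191231} {20200101..20200102}
    # prints: 20191229 20191230 20191231 20200101 20200102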
......@@ -2,9 +2,9 @@ server_param {
downpour_server_param {
downpour_table_param {
table_id: 0
table_class: "DownpourSparseSSDTable"
table_class: "DownpourSparseTable"
shard_num: 1950
-sparse_table_cache_rate: 0.0035
+sparse_table_cache_rate: 0.00055
accessor {
accessor_class: "DownpourCtrAccessor"
sparse_sgd_param {
......
......@@ -19,6 +19,21 @@ fleet_util = FleetUtil()
def time_prefix_str():
    return "\n" + time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) + "[0]:"
+auc_record = {}
+def check_auc_ok(auc_label, auc_log, auc_alarm):
+    # parse the first "AUC=<value>" field out of the monitor log line
+    auc_datas = auc_log.split(' AUC=')
+    if len(auc_datas) < 2:
+        return True
+    if auc_label not in auc_record:
+        auc_record[auc_label] = 0.0
+    auc = float(auc_datas[1].split(' ')[0])
+    # alarm only when the AUC both dropped versus the previous pass and fell below auc_alarm
+    if auc < auc_record[auc_label] and auc < auc_alarm:
+        fleet_util.rank0_print("label:%s, auc:%s, check bad" % (auc_label, auc))
+        return False
+    auc_record[auc_label] = auc
+    fleet_util.rank0_print("label:%s, auc:%s, check ok" % (auc_label, auc))
+    return True
def create_model(slot_file, slot_common_file, all_slot_file):
    join_common_model = ModelJoinCommon(slot_file, slot_common_file, all_slot_file, 20)
    update_model = Model(slot_file, all_slot_file, False, 0, True)
......@@ -444,6 +459,7 @@ if __name__ == "__main__":
join_common_model.join_prob.name,
join_common_model.join_q.name, join_common_model.join_pos.name,
join_common_model.join_total.name, "joining pass:")#"join pass:")
check_auc_ok("joining pass:", log_str, 0.79)
monitor_data += log_str
stdout_str += time_prefix_str() + "joining pass:"
stdout_str += time_prefix_str() + log_str
......@@ -453,6 +469,7 @@ if __name__ == "__main__":
join_common_model.common_prob.name,
join_common_model.common_q.name, join_common_model.common_pos.name,
join_common_model.common_total.name, "common pass:")
check_auc_ok("common pass:", log_str, 0.70)
monitor_data += " " + log_str
stdout_str += time_prefix_str() + "common pass:"
stdout_str += time_prefix_str() + log_str
......@@ -491,6 +508,7 @@ if __name__ == "__main__":
update_model.sqrerr.name, update_model.abserr.name, update_model.prob.name,
update_model.q.name, update_model.pos.name, update_model.total.name,
"updating pass:")#"update pass:")
check_auc_ok("updating pass:", log_str, 0.79)
stdout_str += time_prefix_str() + "updating pass:"
stdout_str += time_prefix_str() + log_str
fleet_util.rank0_print("End update pass")
......@@ -540,16 +558,6 @@ if __name__ == "__main__":
stdout_str = ""
continue
fleet_util.rank0_print("shrink table")
begin = time.time()
fleet.shrink_sparse_table()
fleet.shrink_dense_table(0.98, scope=scope2, table_id=1)
fleet.shrink_dense_table(0.98, scope=scope2, table_id=2)
fleet.shrink_dense_table(0.98, scope=scope3, table_id=3)
end = time.time()
log_str = "shrink table done, cost %s min" % ((end - begin) / 60.0)
fleet_util.rank0_print(log_str)
stdout_str += time_prefix_str() + log_str
fleet_util.rank0_print("going to save batch model/base xbox model")
last_base_day, last_base_path, _ = fleet_util.get_last_save_xbox_base(config.output_path, config.fs_name, config.fs_ugi)
......@@ -562,6 +570,18 @@ if __name__ == "__main__":
stdout_str += save_delta(nextday, -1, xbox_base_key, cur_path, exe, scope2, scope2, scope3,
join_common_model, join_common_model, update_model,
join_save_params, common_save_params, update_save_params, monitor_data)
fleet_util.rank0_print("shrink table")
begin = time.time()
fleet.shrink_sparse_table()
fleet.shrink_dense_table(0.98, scope=scope2, table_id=1)
fleet.shrink_dense_table(0.98, scope=scope2, table_id=2)
fleet.shrink_dense_table(0.98, scope=scope3, table_id=3)
end = time.time()
log_str = "shrink table done, cost %s min" % ((end - begin) / 60.0)
fleet_util.rank0_print(log_str)
stdout_str += time_prefix_str() + log_str
begin = time.time()
fleet_util.save_batch_model(config.output_path, nextday)
fleet_util.write_model_donefile(config.output_path, nextday, -1, xbox_base_key, config.fs_name, config.fs_ugi)
......
......@@ -9,12 +9,14 @@ cd tmp
mkdir ./package
cp -r ../package/python ./package
cp -r ../package/my_nets/* ./package
cp -r ../hadoop-client_mpi ./package/hadoop-client
cp ../qsub_f.conf ./
cp ../job.sh ./
cp ../job.sh ./package
if [ "a${sparse_table_storage}" = "assd" ];then
-sed -i 's/DownpourSparseTable/DownpourSparseSSDTable' ./package/my_nets/reqi_fleet_desc
+sed -i 's/DownpourSparseTable/DownpourSparseSSDTable/g' ./package/reqi_fleet_desc
+sed -i 's/sparse_table_cache_rate: 0.00055/sparse_table_cache_rate: 0.0025/g' ./package/reqi_fleet_desc
fi
current=`date "+%Y-%m-%d %H:%M:%S"`
......
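When sparse_table_storage is "ssd", the two new sed calls switch the packaged fleet descriptor from the in-memory defaults (DownpourSparseTable, cache rate 0.00055) to the SSD variant; the replacement commands also supply the closing delimiter the old sed expression was missing and target the descriptor at its packaged path. A quick sanity check, not part of the original script:

    grep -n 'table_class\|sparse_table_cache_rate' ./package/reqi_fleet_desc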