diff --git a/README.md b/README.md index 51ed968def32662605d2bfb2292ef31b8ab7c3b4..c9a38e222ed9db7be2d545e2b74c3d222f0e581c 100644 --- a/README.md +++ b/README.md @@ -1,102 +1,110 @@ -([简体中文](./README_CN.md)|English) +(简体中文|[English](./README_EN.md)) +
- + +
+
+
-
- +
-- Recommendation system helps users quickly find useful and interesting information from massive data. - -- Recommendation system is also a silver bullet to attract users, retain users, increase users' stickness or conversionn. - - > Who can better use the recommendation system, who can gain more advantage in the fierce competition. - > - > At the same time, there are many problems in the process of using the recommendation system, such as: huge data, complex model, inefficient distributed training, and so on. - -
@@ -160,22 +169,22 @@ python -m paddlerec.run -m paddlerec.models.rank.dnn
-### Version history +### 版本历史 - 2020.06.17 - PaddleRec v0.1.0 - 2020.06.03 - PaddleRec v0.0.2 - 2020.05.14 - PaddleRec v0.0.1 -### License -[Apache 2.0 license](LICENSE) +### 许可证书 +本项目的发布受[Apache 2.0 license](LICENSE)许可认证。 -### Contact us +### 联系我们 -For any feedback, please propose a [GitHub Issue](https://github.com/PaddlePaddle/PaddleRec/issues) +如有意见、建议及使用中的BUG,欢迎在[GitHub Issue](https://github.com/PaddlePaddle/PaddleRec/issues)提交 -You can also communicate with us in the following ways: +亦可通过以下方式与我们沟通交流: -- QQ group id:`861717190` -- Wechat account:`paddlerec2020` +- QQ群号码:`861717190` +- 微信小助手微信号:`paddlerec2020`
-
PaddleRec QQ Group PaddleRec Wechat account
+PaddleRec交流QQ群 PaddleRec微信小助手
diff --git a/README_CN.md b/README_CN.md deleted file mode 100644 index 81a872e90af08b237ee3ad4bdc29568e8cc0f514..0000000000000000000000000000000000000000 --- a/README_CN.md +++ /dev/null @@ -1,186 +0,0 @@ -(简体中文|[English](./README.md)) - -- -
-
- -
-
- -
- - -
- -
- -- 推荐系统是在互联网信息爆炸式增长的时代背景下,帮助用户高效获得感兴趣信息的关键; - -- 推荐系统也是帮助产品最大限度吸引用户、留存用户、增加用户粘性、提高用户转化率的银弹。 - -- 有无数优秀的产品依靠用户可感知的推荐系统建立了良好的口碑,也有无数的公司依靠直击用户痛点的推荐系统在行业中占领了一席之地。 - - > 可以说,谁能掌握和利用好推荐系统,谁就能在信息分发的激烈竞争中抢得先机。 - > 但与此同时,有着许多问题困扰着推荐系统的开发者,比如:庞大的数据量,复杂的模型结构,低效的分布式训练环境,波动的在离线一致性,苛刻的上线部署要求,以上种种,不胜枚举。 - -
-
-
-
-
-
-
- -### 版本历史 -- 2020.06.17 - PaddleRec v0.1.0 -- 2020.06.03 - PaddleRec v0.0.2 -- 2020.05.14 - PaddleRec v0.0.1 - -### 许可证书 -本项目的发布受[Apache 2.0 license](LICENSE)许可认证。 - -### 联系我们 - -如有意见、建议及使用中的BUG,欢迎在[GitHub Issue](https://github.com/PaddlePaddle/PaddleRec/issues)提交 - -亦可通过以下方式与我们沟通交流: - -- QQ群号码:`861717190` -- 微信小助手微信号:`paddlerec2020` - -
-
PaddleRec交流QQ群 PaddleRec微信小助手
diff --git a/README_EN.md b/README_EN.md new file mode 100644 index 0000000000000000000000000000000000000000..b0ab5aefe0dacf953fa562b73f05648f5127c769 --- /dev/null +++ b/README_EN.md @@ -0,0 +1,185 @@ +([简体中文](./README.md)|English) ++ +
+
+ +
+ + +
+ +
+ +- Recommendation system helps users quickly find useful and interesting information from massive data. + +- Recommendation system is also a silver bullet to attract users, retain users, increase users' stickiness or conversion. + + > Whoever can better use the recommendation system can gain more advantage in the fierce competition. + > + > At the same time, there are many problems in the process of using the recommendation system, such as: huge data, complex model, inefficient distributed training, and so on. + +<div align="center">
+
+
+
+
+
+
+ +### Version history +- 2020.06.17 - PaddleRec v0.1.0 +- 2020.06.03 - PaddleRec v0.0.2 +- 2020.05.14 - PaddleRec v0.0.1 + +### License +[Apache 2.0 license](LICENSE) + +### Contact us + +For any feedback, please propose a [GitHub Issue](https://github.com/PaddlePaddle/PaddleRec/issues) + +You can also communicate with us in the following ways: + +- QQ group id:`861717190` +- Wechat account:`paddlerec2020` + +
+
PaddleRec QQ Group PaddleRec Wechat account
diff --git a/core/engine/cluster/cloud/before_hook_cpu.sh.template b/core/engine/cluster/cloud/before_hook_cpu.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..d0bd67b2fbe60221ad51e99073d097675286eac7 --- /dev/null +++ b/core/engine/cluster/cloud/before_hook_cpu.sh.template @@ -0,0 +1,15 @@ +echo "Run before_hook.sh ..." + +wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate + +tar -xf PaddleRec.tar.gz + +cd PaddleRec + +python setup.py install + +pip uninstall -y paddlepaddle + +pip install paddlepaddle==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com + +echo "End before_hook.sh ..." diff --git a/core/engine/cluster/cloud/before_hook_gpu.sh.template b/core/engine/cluster/cloud/before_hook_gpu.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..1a9d5e189870e84670e60571dfbeadd48e1245b0 --- /dev/null +++ b/core/engine/cluster/cloud/before_hook_gpu.sh.template @@ -0,0 +1,15 @@ +echo "Run before_hook.sh ..." + +wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz --no-check-certificate + +tar -xf PaddleRec.tar.gz + +cd PaddleRec + +python setup.py install + +pip uninstall -y paddlepaddle + +pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $>.post107 --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com + +echo "End before_hook.sh ..." 
diff --git a/core/engine/cluster/cloud/cluster.sh b/core/engine/cluster/cloud/cluster.sh index 1a0605fd9aeefbf87542e5e5156470eb1d81b836..8f8c5479df508dfc5e74ee936b665ba08d4647b1 100644 --- a/core/engine/cluster/cloud/cluster.sh +++ b/core/engine/cluster/cloud/cluster.sh @@ -16,23 +16,13 @@ ################################################### # Usage: submit.sh -# Description: run mpi submit client implement +# Description: run paddlecloud submit client implement ################################################### # ---------------------------------------------------------------------------- # # variable define # # ---------------------------------------------------------------------------- # -#----------------------------------------------------------------------------------------------------------------- -#fun : package -#param : N/A -#return : 0 -- success; not 0 -- failure -#----------------------------------------------------------------------------------------------------------------- -function package_hook() { - g_run_stage="package" - package -} - #----------------------------------------------------------------------------------------------------------------- #fun : before hook submit to cluster #param : N/A @@ -40,17 +30,128 @@ function package_hook() { #----------------------------------------------------------------------------------------------------------------- function _before_submit() { echo "before_submit" - before_submit_hook + + if [ ${DISTRIBUTE_MODE} == "PS_CPU_MPI" ]; then + _gen_cpu_before_hook + _gen_mpi_config + _gen_mpi_job + _gen_end_hook + elif [ ${DISTRIBUTE_MODE} == "COLLECTIVE_GPU_K8S" ]; then + _gen_gpu_before_hook + _gen_k8s_config + _gen_k8s_gpu_job + _gen_end_hook + elif [ ${DISTRIBUTE_MODE} == "PS_CPU_K8S" ]; then + _gen_cpu_before_hook + _gen_k8s_config + _gen_k8s_cpu_job + _gen_end_hook + fi + } +function _gen_mpi_config() { + echo "gen mpi_config.ini" + sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \ + -e "s#<$ FS_UGI $>#$FS_UGI#g" \ + 
-e "s#<$ TRAIN_DATA_PATH $>#$TRAIN_DATA_PATH#g" \ + -e "s#<$ TEST_DATA_PATH $>#$TEST_DATA_PATH#g" \ + -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \ + -e "s#<$ THIRDPARTY_PATH $>#$THIRDPARTY_PATH#g" \ + -e "s#<$ CPU_NUM $>#$max_thread_num#g" \ + -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \ + -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \ + -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \ + -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \ + -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \ + -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \ + -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \ + ${abs_dir}/cloud/mpi_config.ini.template >${PWD}/config.ini +} + +function _gen_k8s_config() { + echo "gen k8s_config.ini" + sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \ + -e "s#<$ FS_UGI $>#$FS_UGI#g" \ + -e "s#<$ AFS_REMOTE_MOUNT_POINT $>#$AFS_REMOTE_MOUNT_POINT#g" \ + -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \ + -e "s#<$ CPU_NUM $>#$max_thread_num#g" \ + -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \ + -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \ + -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \ + -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \ + -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \ + -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \ + -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \ + ${abs_dir}/cloud/k8s_config.ini.template >${PWD}/config.ini +} + +function _gen_cpu_before_hook() { + echo "gen cpu before_hook.sh" + sed -e "s#<$ 
PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \ + ${abs_dir}/cloud/before_hook_cpu.sh.template >${PWD}/before_hook.sh +} + +function _gen_gpu_before_hook() { + echo "gen gpu before_hook.sh" + sed -e "s#<$ PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \ + ${abs_dir}/cloud/before_hook_gpu.sh.template >${PWD}/before_hook.sh +} + +function _gen_end_hook() { + echo "gen end_hook.sh" + cp ${abs_dir}/cloud/end_hook.sh.template ${PWD}/end_hook.sh +} + +function _gen_mpi_job() { + echo "gen mpi_job.sh" + sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \ + -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \ + -e "s#<$ AK $>#$AK#g" \ + -e "s#<$ SK $>#$SK#g" \ + -e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \ + -e "s#<$ MPI_NODES $>#$MPI_NODES#g" \ + -e "s#<$ START_CMD $>#$START_CMD#g" \ + ${abs_dir}/cloud/mpi_job.sh.template >${PWD}/job.sh +} + +function _gen_k8s_gpu_job() { + echo "gen k8s_job.sh" + sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \ + -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \ + -e "s#<$ AK $>#$AK#g" \ + -e "s#<$ SK $>#$SK#g" \ + -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \ + -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \ + -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \ + -e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \ + -e "s#<$ START_CMD $>#$START_CMD#g" \ + ${abs_dir}/cloud/k8s_job.sh.template >${PWD}/job.sh +} + +function _gen_k8s_cpu_job() { + echo "gen k8s_job.sh" + sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \ + -e "s#<$ JOB_NAME $>#$OLD_JOB_NAME#g" \ + -e "s#<$ AK $>#$AK#g" \ + -e "s#<$ SK $>#$SK#g" \ + -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \ + -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \ + -e "s#<$ K8S_PS_NUM $>#$K8S_PS_NUM#g" \ + -e "s#<$ K8S_PS_CORES $>#$K8S_PS_CORES#g" \ + -e "s#<$ K8S_CPU_CORES $>#$K8S_CPU_CORES#g" \ + -e "s#<$ START_CMD $>#$START_CMD#g" \ + ${abs_dir}/cloud/k8s_cpu_job.sh.template >${PWD}/job.sh +} + + #----------------------------------------------------------------------------------------------------------------- #fun : after hook submit to cluster #param : N/A #return : 0 -- 
success; not 0 -- failure #----------------------------------------------------------------------------------------------------------------- function _after_submit() { - echo "after_submit" - after_submit_hook + echo "end submit" } #----------------------------------------------------------------------------------------------------------------- @@ -60,23 +161,19 @@ function _after_submit() { #----------------------------------------------------------------------------------------------------------------- function _submit() { g_run_stage="submit" + sh job.sh +} - cd ${engine_temp_path} - - paddlecloud job --ak ${engine_submit_ak} --sk ${engine_submit_sk} train --cluster-name ${engine_submit_cluster} \ - --job-version ${engine_submit_version} \ - --mpi-priority ${engine_submit_priority} \ - --mpi-wall-time 300:59:00 \ - --mpi-nodes ${engine_submit_nodes} --is-standalone 0 \ - --mpi-memory 110Gi \ - --job-name ${engine_submit_jobname} \ - --start-cmd "${g_run_cmd}" \ - --group-name ${engine_submit_group} \ - --job-conf ${engine_submit_config} \ - --files ${g_submitfiles} \ - --json - - cd - +function package_hook() { + cur_time=`date +"%Y%m%d%H%M"` + new_job_name="${JOB_NAME}_${cur_time}" + export OLD_JOB_NAME=${JOB_NAME} + export JOB_NAME=${new_job_name} + export job_file_path="${PWD}/${new_job_name}" + mkdir ${job_file_path} + cp $FILES ${job_file_path}/ + cd ${job_file_path} + echo "The task submission folder is generated at ${job_file_path}" } function submit_hook() { @@ -86,8 +183,6 @@ function submit_hook() { } function main() { - source ${engine_submit_scrpit} - package_hook submit_hook } diff --git a/core/engine/cluster/cloud/end_hook.sh.template b/core/engine/cluster/cloud/end_hook.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..9abf8dd019e42d69c72366dde08cfbcc3f63a000 --- /dev/null +++ b/core/engine/cluster/cloud/end_hook.sh.template @@ -0,0 +1 @@ +echo "Run before_hook.sh ..." 
\ No newline at end of file diff --git a/core/engine/cluster/cloud/k8s_config.ini.template b/core/engine/cluster/cloud/k8s_config.ini.template new file mode 100644 index 0000000000000000000000000000000000000000..904bfbc5e1453f90ec1163d1681d554b52dae45f --- /dev/null +++ b/core/engine/cluster/cloud/k8s_config.ini.template @@ -0,0 +1,31 @@ +# 必须涵盖的参数 +fs_name=<$ FS_NAME $> +fs_ugi=<$ FS_UGI $> + +# 模型输出目录 +output_path=<$ OUTPUT_PATH $> +# =================== +# 以下是新增参数 +# =================== +# 挂载 afs 的开关 +mount_afs="true" + +# afs 路径的远端挂载点 +AFS_REMOTE_MOUNT_POINT=<$ AFS_REMOTE_MOUNT_POINT $> + +# 作业运行环境的本地挂载点,/root/paddlejob/workspace/env_run/是一个固定路径,是平台运行时workspace的路径 +afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/" +# 可以访问运行时默认文件夹下的 ./afs/ 目录拿到挂载目录的文件 +# 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193 + +PADDLE_PADDLEREC_ROLE=WORKER +CPU_NUM=<$ CPU_NUM $> +GLOG_v=0 + +FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $> +FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $> +FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $> +FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $> +FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $> +FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $> +FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $> \ No newline at end of file diff --git a/core/engine/cluster/cloud/k8s_cpu_job.sh.template b/core/engine/cluster/cloud/k8s_cpu_job.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..c5203fcad76b28b5a48de62067b46f4ed5bf1696 --- /dev/null +++ b/core/engine/cluster/cloud/k8s_cpu_job.sh.template @@ -0,0 +1,40 @@ +#!/bin/bash +############################################################### +## 注意-- 注意--注意 ## +## K8S PS-CPU多机作业作业示例 ## +############################################################### 
+job_name=<$ JOB_NAME $> + +# 作业参数 +group_name="<$ GROUP_NAME $>" +job_version="paddle-fluid-v1.7.1" +start_cmd="<$ START_CMD $>" +wall_time="10:00:00" + +k8s_priority=<$ K8S_PRIORITY $> +k8s_trainers=<$ K8S_TRAINERS $> +k8s_cpu_cores=<$ K8S_CPU_CORES $> +k8s_ps_num=<$ K8S_PS_NUM $> +k8s_ps_cores=<$ K8S_PS_CORES $> + +# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取) +ak=<$ AK $> +sk=<$ SK $> + +paddlecloud job --ak ${ak} --sk ${sk} \ + train --job-name ${job_name} \ + --group-name ${group_name} \ + --job-conf config.ini \ + --start-cmd "${start_cmd}" \ + --files ./* \ + --job-version ${job_version} \ + --k8s-priority ${k8s_priority} \ + --wall-time ${wall_time} \ + --k8s-trainers ${k8s_trainers} \ + --k8s-cpu-cores ${k8s_cpu_cores} \ + --k8s-ps-num ${k8s_ps_num} \ + --k8s-ps-cores ${k8s_ps_cores} \ + --is-standalone 0 \ + --distribute-job-type "PSERVER" \ + --json + \ No newline at end of file diff --git a/core/engine/cluster/cloud/k8s_job.sh.template b/core/engine/cluster/cloud/k8s_job.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..9886f11aebbbe547ed1fb433a35c653e2a77f6f3 --- /dev/null +++ b/core/engine/cluster/cloud/k8s_job.sh.template @@ -0,0 +1,49 @@ +#!/bin/bash +############################################################### +## 注意-- 注意--注意 ## +## K8S NCCL2多机作业作业示例 ## +############################################################### +job_name=<$ JOB_NAME $> + +# 作业参数 +group_name="<$ GROUP_NAME $>" +job_version="paddle-fluid-v1.7.1" +start_cmd="<$ START_CMD $>" +wall_time="10:00:00" + +k8s_priority=<$ K8S_PRIORITY $> +k8s_trainers=<$ K8S_TRAINERS $> +k8s_cpu_cores=<$ K8S_CPU_CORES $> +k8s_gpu_cards=<$ K8S_GPU_CARD $> + +is_stand_alone=0 +nccl="--distribute-job-type "NCCL2"" +if [ ${k8s_trainers} == 1 ];then + is_stand_alone=1 + nccl="--job-remark single-trainer" + if [ ${k8s_gpu_cards} == 1];then + nccl="--job-remark single-gpu" + echo "Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 
'cluster_train' to 'train' in config.yaml." + fi +fi + +# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取) +ak=<$ AK $> +sk=<$ SK $> + +paddlecloud job --ak ${ak} --sk ${sk} \ + train --job-name ${job_name} \ + --group-name ${group_name} \ + --job-conf config.ini \ + --start-cmd "${start_cmd}" \ + --files ./* \ + --job-version ${job_version} \ + --k8s-trainers ${k8s_trainers} \ + --k8s-cpu-cores ${k8s_cpu_cores} \ + --k8s-gpu-cards ${k8s_gpu_cards} \ + --k8s-priority ${k8s_priority} \ + --wall-time ${wall_time} \ + --is-standalone ${is_stand_alone} \ + --json \ + ${nccl} + \ No newline at end of file diff --git a/core/engine/cluster/cloud/mpi_config.ini.template b/core/engine/cluster/cloud/mpi_config.ini.template new file mode 100644 index 0000000000000000000000000000000000000000..8312d46a01449b3d6eac322b098d5b029bb67f86 --- /dev/null +++ b/core/engine/cluster/cloud/mpi_config.ini.template @@ -0,0 +1,29 @@ +#type of storage cluster +storage_type="hdfs" + +#attention: files for training should be put on hdfs +force_reuse_output_path="True" + +# 可以替换成自己的hdfs集群 +fs_name=<$ FS_NAME $> +fs_ugi=<$ FS_UGI $> + +FLAGS_rpc_deadline=300000 + +##train data path on hdfs +train_data_path=<$ TRAIN_DATA_PATH $> +test_data_path=<$ TEST_DATA_PATH $> +output_path=<$ OUTPUT_PATH $> +thirdparty_path=<$ THIRDPARTY_PATH $> + +PADDLE_PADDLEREC_ROLE=WORKER +CPU_NUM=<$ CPU_NUM $> +GLOG_v=0 + +FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $> +FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $> +FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $> +FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $> +FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $> +FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $> +FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $> diff --git a/core/engine/cluster/cloud/mpi_job.sh.template 
b/core/engine/cluster/cloud/mpi_job.sh.template new file mode 100644 index 0000000000000000000000000000000000000000..46d68d2130d591c86f4a587000498c139c1e74aa --- /dev/null +++ b/core/engine/cluster/cloud/mpi_job.sh.template @@ -0,0 +1,31 @@ +#!/bin/bash +############################################################### +## 注意--注意--注意 ## +## MPI 类型作业演示 ## +############################################################### +job_name=<$ JOB_NAME $> + +# 作业参数 +group_name=<$ GROUP_NAME $> +job_version="paddle-fluid-v1.7.1" +start_cmd="<$ START_CMD $>" +wall_time="2:00:00" + +# 你的ak/sk(可在paddlecloud web页面【个人中心】处获取) +ak=<$ AK $> +sk=<$ SK $> + +paddlecloud job --ak ${ak} --sk ${sk} \ + train \ + --job-name ${job_name} \ + --mpi-priority <$ MPI_PRIORITY $> \ + --group-name ${group_name} \ + --mpi-wall-time ${wall_time} \ + --mpi-nodes <$ MPI_NODES $> \ + --is-standalone 0 \ + --permission group \ + --job-version ${job_version} \ + --job-conf config.ini \ + --start-cmd "${start_cmd}" \ + --files ./* \ + --json diff --git a/core/engine/cluster/cluster.py b/core/engine/cluster/cluster.py index 4c392e5470c58f213562d49a3f78f7d870462981..7dbb5708e572340c37265972e541bb00ef2ee195 100644 --- a/core/engine/cluster/cluster.py +++ b/core/engine/cluster/cluster.py @@ -18,6 +18,7 @@ from __future__ import unicode_literals import copy import os import subprocess +import warnings from paddlerec.core.engine.engine import Engine from paddlerec.core.factory import TrainerFactory @@ -26,24 +27,35 @@ from paddlerec.core.utils import envs class ClusterEngine(Engine): def __init_impl__(self): + self.role = envs.get_runtime_environ("engine_role") + if self.role == "WORKER": + return + abs_dir = os.path.dirname(os.path.abspath(__file__)) + os.environ["abs_dir"] = str(abs_dir) - backend = envs.get_runtime_environ("engine_backend") - if not backend: - backend = "" - backend = backend.upper() - if backend == "PADDLECLOUD": + self.backend = envs.get_runtime_environ("backend") + if not self.backend: + 
self.backend = "" + self.backend = self.backend.upper() + if self.backend == "PADDLECLOUD": self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh") - elif backend == "KUBERNETES": + elif self.backend == "KUBERNETES": self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh") else: - raise ValueError("{} can not be supported now".format(backend)) + raise ValueError("{} can not be supported now".format( + self.backend)) def start_worker_procs(self): trainer = TrainerFactory.create(self.trainer) trainer.run() def start_master_procs(self): + if self.backend == "PADDLECLOUD": + self.paddlecloud_env_check() + elif self.backend == "KUBERNETES": + self.kubernetes_env_check() + default_env = os.environ.copy() current_env = copy.copy(default_env) current_env.pop("http_proxy", None) @@ -55,21 +67,245 @@ class ClusterEngine(Engine): @staticmethod def workspace_replace(): - workspace = envs.get_runtime_environ("engine_workspace") + remote_workspace = envs.get_runtime_environ("remote_workspace") for k, v in os.environ.items(): - v = v.replace("{workspace}", workspace) + v = v.replace("{workspace}", remote_workspace) os.environ[k] = str(v) def run(self): - role = envs.get_runtime_environ("engine_role") - - if role == "MASTER": + if self.role == "MASTER": self.start_master_procs() - elif role == "WORKER": + elif self.role == "WORKER": self.start_worker_procs() else: raise ValueError("role {} error, must in MASTER/WORKER".format( - role)) + self.role)) + + def paddlecloud_env_check(self): + # get fleet mode + fleet_mode = envs.get_runtime_environ("fleet_mode") + # get device + device = envs.get_runtime_environ("device") + # get cluster type + cluster_type = envs.get_runtime_environ("cluster_type") + + cluster_env_check_tool = None + if cluster_type.upper() == "MPI": + if device == "CPU" and fleet_mode == "PS": + cluster_env_check_tool = PaddleCloudMpiEnv() + else: + raise ValueError( + "Paddlecloud with Mpi don't support GPU training, check your config.yaml & backend.yaml" 
+ ) + elif cluster_type.upper() == "K8S": + if fleet_mode == "PS": + if device == "CPU": + cluster_env_check_tool = CloudPsCpuEnv() + elif device == "GPU": + raise ValueError( + "PS-GPU on paddlecloud is not supported at this time, comming soon" + ) + if fleet_mode == "COLLECTIVE": + if device == "GPU": + cluster_env_check_tool = CloudCollectiveEnv() + elif device == "CPU": + raise ValueError( + "Unexpected config -> device: CPU with fleet_mode: Collective, check your config.yaml" + ) + else: + raise ValueError("cluster_type {} error, must in MPI/K8S".format( + cluster_type)) + + cluster_env_check_tool.env_check() + cluster_env_check_tool.env_set() + + def kubernetes_env_check(self): + pass + + +class ClusterEnvBase(object): + def __init__(self): + # get backend env + backend_yaml = envs.get_runtime_environ("backend_yaml") + _env = envs.load_yaml(backend_yaml) + self.backend_env = envs.flatten_environs(_env, ".") + self.cluster_env = {} + + def env_check(self): + # check common env + # fs_name & fs_ugi + self.cluster_env["FS_NAME"] = self.backend_env.get("config.fs_name", + "") + self.cluster_env["FS_UGI"] = self.backend_env.get("config.fs_ugi", "") + if self.cluster_env["FS_NAME"] == "" or self.cluster_env[ + "FS_UGI"] == "": + raise ValueError( + "No -- FS_UGI or FS_NAME -- found in your backend.yaml, please check." + ) + + # output_path + self.cluster_env["OUTPUT_PATH"] = self.backend_env.get( + "config.output_path", "") + if self.cluster_env["OUTPUT_PATH"] == "": + warnings.warn( + "Job output_path not set! 
Please check your backend yaml.", + category=UserWarning, + stacklevel=2) + + # paddle_version + self.cluster_env["PADDLE_VERSION"] = self.backend_env.get( + "config.paddle_version", "1.7.2") + + # communicator + self.cluster_env[ + "FLAGS_communicator_is_sgd_optimizer"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_is_sgd_optimizer", 0) + self.cluster_env[ + "FLAGS_communicator_send_queue_size"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_send_queue_size", 5) + self.cluster_env[ + "FLAGS_communicator_thread_pool_size"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_thread_pool_size", 32) + self.cluster_env[ + "FLAGS_communicator_max_merge_var_num"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_max_merge_var_num", 5) + self.cluster_env[ + "FLAGS_communicator_max_send_grad_num_before_recv"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_max_send_grad_num_before_recv", + 5) + self.cluster_env["FLAGS_communicator_fake_rpc"] = self.backend_env.get( + "config.communicator.FLAGS_communicator_fake_rpc", 0) + self.cluster_env["FLAGS_rpc_retry_times"] = self.backend_env.get( + "config.communicator.FLAGS_rpc_retry_times", 3) + + # ak & sk + self.cluster_env["AK"] = self.backend_env.get("submit.ak", "") + self.cluster_env["SK"] = self.backend_env.get("submit.sk", "") + if self.cluster_env["AK"] == "" or self.cluster_env["SK"] == "": + raise ValueError( + "No -- AK or SK -- found in your backend.yaml, please check.") + + # priority + self.cluster_env["PRIORITY"] = self.backend_env.get("submit.priority", + "high") + + # job name + self.cluster_env["JOB_NAME"] = self.backend_env.get( + "submit.job_name", "PaddleRecClusterJob") + + # group + self.cluster_env["GROUP_NAME"] = self.backend_env.get("submit.group", + "paddle") + + # start_cmd + self.cluster_env["START_CMD"] = self.backend_env.get( + "submit.start_cmd", "python -m paddlerec.run -m config.yaml") + + # files + 
self.cluster_env["FILES"] = self.backend_env.get("submit.files", "") + if self.cluster_env["FILES"] == "": + raise ValueError( + "No -- files -- found in your backend.yaml, please check.") + + def env_set(self): + envs.set_runtime_environs(self.cluster_env) + flattens = envs.flatten_environs(self.cluster_env) + print(envs.pretty_print_envs(flattens, ("Cluster Envs", "Value"))) + + +class PaddleCloudMpiEnv(ClusterEnvBase): + def __init__(self): + super(PaddleCloudMpiEnv, self).__init__() + + def env_check(self): + super(PaddleCloudMpiEnv, self).env_check() + + # check mpi env + + self.cluster_env["DISTRIBUTE_MODE"] = "PS_CPU_MPI" + + # train_data_path + self.cluster_env["TRAIN_DATA_PATH"] = self.backend_env.get( + "config.train_data_path", "") + if self.cluster_env["TRAIN_DATA_PATH"] == "": + raise ValueError( + "No -- TRAIN_DATA_PATH -- found in your backend.yaml, please add train_data_path in your backend yaml." + ) + # test_data_path + self.cluster_env["TEST_DATA_PATH"] = self.backend_env.get( + "config.test_data_path", "") + if self.cluster_env["TEST_DATA_PATH"] == "": + warnings.warn( + "Job test_data_path not set! Please check your backend yaml.", + category=UserWarning, + stacklevel=2) + + # thirdparty_path + self.cluster_env["THIRDPARTY_PATH"] = self.backend_env.get( + "config.thirdparty_path", "") + if self.cluster_env["THIRDPARTY_PATH"] == "": + warnings.warn( + "Job thirdparty_path not set! 
Please check your backend yaml.", + category=UserWarning, + stacklevel=2) + + # nodes + self.cluster_env["MPI_NODES"] = self.backend_env.get("submit.nodes", 1) + + +class PaddleCloudK8sEnv(ClusterEnvBase): + def __init__(self): + super(PaddleCloudK8sEnv, self).__init__() + + def env_check(self): + super(PaddleCloudK8sEnv, self).env_check() + + # check afs_remote_mount_point + self.cluster_env["AFS_REMOTE_MOUNT_POINT"] = self.backend_env.get( + "config.afs_remote_mount_point", "") + if self.cluster_env["AFS_REMOTE_MOUNT_POINT"] == "": + warnings.warn( + "Job afs_remote_mount_point not set! Please check your backend yaml.", + category=UserWarning, + stacklevel=2) + warnings.warn( + "The remote afs path will be mounted to the ./afs/", + category=UserWarning, + stacklevel=2) + + +class CloudCollectiveEnv(PaddleCloudK8sEnv): + def __init__(self): + super(CloudCollectiveEnv, self).__init__() + + def env_check(self): + super(CloudCollectiveEnv, self).env_check() + + self.cluster_env["DISTRIBUTE_MODE"] = "COLLECTIVE_GPU_K8S" + self.cluster_env["K8S_TRAINERS"] = self.backend_env.get( + "submit.k8s_trainers", 1) + self.cluster_env["K8S_GPU_CARD"] = self.backend_env.get( + "submit.k8s_gpu_card", 1) + self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get( + "submit.k8s_cpu_cores", 1) + + +class CloudPsCpuEnv(PaddleCloudK8sEnv): + def __init__(self): + super(CloudPsCpuEnv, self).__init__() + + def env_check(self): + super(CloudPsCpuEnv, self).env_check() + + self.cluster_env["DISTRIBUTE_MODE"] = "PS_CPU_K8S" + self.cluster_env["K8S_TRAINERS"] = self.backend_env.get( + "submit.k8s_trainers", 1) + self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get( + "submit.k8s_cpu_cores", 2) + self.cluster_env["K8S_PS_NUM"] = self.backend_env.get( + "submit.k8s_ps_num", 1) + self.cluster_env["K8S_PS_CORES"] = self.backend_env.get( + "submit.k8s_ps_cores", 2) diff --git a/core/trainers/framework/dataset.py b/core/trainers/framework/dataset.py index 
273e3a2ab4823fb5dd3ee1adcb5eb2b50e2f4bd2..8059eeb09a482671b8329fb88f5b52cfd64f163b 100644 --- a/core/trainers/framework/dataset.py +++ b/core/trainers/framework/dataset.py @@ -118,6 +118,7 @@ class QueueDataset(DatasetBase): dataset.set_batch_size(batch_size) dataset.set_pipe_command(pipe_cmd) train_data_path = envs.get_global_env(name + "data_path") + file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) @@ -125,7 +126,7 @@ class QueueDataset(DatasetBase): if context["engine"] == EngineMode.LOCAL_CLUSTER: file_list = split_files(file_list, context["fleet"].worker_index(), context["fleet"].worker_num()) - + print("File_list: {}".format(file_list)) dataset.set_filelist(file_list) for model_dict in context["phases"]: if model_dict["dataset_name"] == dataset_name: diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index c66d1b36571df0331b8319798cdc692fa825a481..2461473aa79a51133db8aa319f4ee7d45981d815 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -42,7 +42,7 @@ def dataloader_by_name(readerclass, if context["engine"] == EngineMode.LOCAL_CLUSTER: files = split_files(files, context["fleet"].worker_index(), context["fleet"].worker_num()) - print("file_list : {}".format(files)) + print("file_list : {}".format(files)) reader = reader_class(yaml_file) reader.init() diff --git a/doc/distributed_train.md b/doc/distributed_train.md index 339c5a83ffd26f9416a67a02390a11ba4c87c29d..9e7dbf1bd903e459d78f18f66e5893cb3d3ced1b 100644 --- a/doc/distributed_train.md +++ b/doc/distributed_train.md @@ -1,9 +1,548 @@ -# PaddleRec 分布式训练 +目录 +================= -## PaddleRec分布式运行 -> 占位 -### 本地模拟分布式 -> 占位 +- [目录](#目录) +- [基于PaddleCloud的分布式训练启动方法](#基于paddlecloud的分布式训练启动方法) + - [使用PaddleRec提交](#使用paddlerec提交) + - [第一步:运行环境下安装PaddleCloud的Client](#第一步运行环境下安装paddlecloud的client) + - [第二步:更改模型运行`config.yaml`配置](#第二步更改模型运行configyaml配置) + - [第三步:增加集群运行`backend.yaml`配置](#第三步增加集群运行backendyaml配置) + - 
[MPI集群的Parameter Server模式配置](#mpi集群的parameter-server模式配置) + - [K8S集群的Collective模式配置](#k8s集群的collective模式配置) + - [K8S集群的PS-CPU模式配置](#k8s集群的ps-cpu模式配置) + - [第四步:任务提交](#第四步任务提交) + - [使用PaddleCloud Client提交](#使用paddlecloud-client提交) + - [第一步:在`before_hook.sh`里手动安装PaddleRec](#第一步在before_hooksh里手动安装paddlerec) + - [第二步:在`config.ini`中调整超参](#第二步在configini中调整超参) + - [第三步:在`job.sh`中上传文件及修改启动命令](#第三步在jobsh中上传文件及修改启动命令) + - [第四步: 提交任务](#第四步-提交任务) -### K8S集群运行分布式 -> 占位 +# 基于PaddleCloud的分布式训练启动方法 + +> PaddleCloud目前处于百度内部测试推广阶段,将适时推出面向广大用户的公有云版本,欢迎持续关注 + +## 使用PaddleRec提交 + +### 第一步:运行环境下安装PaddleCloud的Client + +- 环境要求:python > 2.7.5 +- 首先在PaddleCloud平台申请`group`的权限,获得计算资源 +- 然后在[PaddleCloud client使用手册](http://wiki.baidu.com/pages/viewpage.action?pageId=1017488941#1.%20安装PaddleCloud客户端)下载安装`PaddleCloud-Cli` +- 在PaddleCloud的个人中心获取`AK`及`SK` + + +### 第二步:更改模型运行`config.yaml`配置 + +分布式运行首先需要更改`config.yaml`,主要调整以下内容: + +- workspace: 调整为在远程点运行时的工作目录,一般设置为`"./"`即可 +- runner_class: 从单机的"train"调整为"cluster_train",单机训练->分布式训练(例外情况,k8s上单机单卡训练仍然为train) +- fleet_mode: 选则参数服务器模式(ps),抑或GPU的all-reduce模式(collective) +- distribute_strategy: 可选项,选择分布式训练的策略,目前只在参数服务器模式下生效,可选项:`sync、asycn、half_async、geo` + +配置选项具体参数,可以参考[yaml配置说明](./yaml.md) + +以Rank/dnn模型为例 + +单机训练配置: + +```yaml +# workspace +workspace: "paddlerec.models.rank.dnn" + +mode: [single_cpu_train] +runner: +- name: single_cpu_train + class: train + epochs: 4 + device: cpu + save_checkpoint_interval: 2 + save_checkpoint_path: "increment_dnn" + init_model_path: "" + print_interval: 10 + phases: [phase1] + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/data/sample_data/train" + sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" + dense_slots: "dense_var:13" +``` + +分布式的训练配置可以改为: +```yaml +# 改变一:代码上传至节点后,在默认目录下 +workspace: "./" + +mode: [ps_cluster] +runner: +- name: ps_cluster + # 改变二:调整runner的class + class: cluster_train + epochs: 4 + device: cpu + # 改变三 & 
四: 指定fleet_mode 与 distribute_strategy + fleet_mode: ps + distribute_strategy: async + save_checkpoint_interval: 2 + save_checkpoint_path: "increment_dnn" + init_model_path: "" + print_interval: 10 + phases: [phase1] + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + # 改变五: 改变数据的读取目录 + # 通常而言,mpi模式下,数据会下载到远程节点执行目录的'./train_data'下, k8s则与挂载位置有关 + data_path: "{workspace}/train_data" + sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" + dense_slots: "dense_var:13" +``` + +除此之外,还需关注数据及模型加载的路径,一般而言: +- PaddleCloud MPI集群下,训练数据会下载到节点运行目录的`./train_data/`,测试数据位于`./test_data/`,其他数据及文件可以通过上传到hdfs配置的`thirdparty`后,自动下载到节点运行目录的`./thirdparty/`文件夹下。 +- PaddleCloud K8S集群下,hdfs的指定目录会挂载到节点工作目录的`./afs/` + +### 第三步:增加集群运行`backend.yaml`配置 + +分布式训练除了模型的部分调整外,更重要的是加入集群的配置选项,我们通过另一个yaml文件来指定分布式的运行配置,将分布式配置与模型超参解耦。 + +下面给出一个完整的`backend.yaml`示例: + +```yaml +backend: "PaddleCloud" +cluster_type: mpi # k8s 可选 + +config: + # 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2 + paddle_version: "1.7.2" + + # hdfs/afs的配置信息填写 + fs_name: "afs://xxx.com" + fs_ugi: "usr,pwd" + + # 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + output_path: "" + + # for mpi + # 填远程数据及地址,如afs:/user/your/path/ 则此处填 /user/your/path + train_data_path: "" + test_data_path: "" + thirdparty_path: "" + + # for k8s + # 填远程挂载地址,如afs:/user/your/path/ 则此处填 /user/your/path + afs_remote_mount_point: "" + + # paddle参数服务器分布式底层超参,无特殊需求不理不改 + communicator: + FLAGS_communicator_is_sgd_optimizer: 0 + FLAGS_communicator_send_queue_size: 5 + FLAGS_communicator_thread_pool_size: 32 + FLAGS_communicator_max_merge_var_num: 5 + FLAGS_communicator_max_send_grad_num_before_recv: 5 + FLAGS_communicator_fake_rpc: 0 + FLAGS_rpc_retry_times: 3 + +submit: + # PaddleCloud 个人信息 AK 及 SK + ak: "" + sk: "" + + # 任务运行优先级,默认high + priority: "high" + + # 任务名称 + job_name: "PaddleRec_CTR" + + # 训练资源所在组 + group: "" + + # 节点上的任务启动命令 + start_cmd: "python -m paddlerec.run -m ./config.yaml" + + # 
本地需要上传到节点工作目录的文件 + files: ./*.py ./*.yaml + + # for mpi ps-cpu + # mpi 参数服务器模式下,任务的节点数 + nodes: 2 + + # for k8s gpu + # k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数 + k8s_trainers: 2 + k8s-cpu-cores: 4 + k8s_gpu_card: 1 + + # for k8s ps-cpu + k8s_trainers: 2 + k8s-cpu-cores: 4 + k8s_ps_num: 2 + k8s_ps_cores: 4 + +``` + +更多backend.yaml配置选项信息,可以查看[yaml配置说明](./yaml.md) + +除此之外,我们还需要关注上传到工作目录的文件(`files选项`)的路径问题,在示例中是`./*.py`,说明我们执行任务提交时,与这些py文件在同一目录。若不在同一目录,则需要适当调整files路径,或改为这些文件的绝对路径。 + +不建议利用`files`上传过大的数据文件,可以通过指定`train_data_path`自动下载,或在k8s模式下指定`afs_remote_mount_point`挂载实现数据到节点的转移。 + +#### MPI集群的Parameter Server模式配置 + +下面是一个利用PaddleCloud提交MPI参数服务器模式任务的`backend.yaml`示例 + +首先调整`config.yaml`: +```yaml +workspace: "./" +mode: [ps_cluster] + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/train_data" + sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" + dense_slots: "dense_var:13" + +runner: +- name: ps_cluster + class: cluster_train + epochs: 2 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + init_model_path: "" + print_interval: 1 + phases: [phase1] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 +``` + + +再新增`backend.yaml` +```yaml +backend: "PaddleCloud" +cluster_type: mpi + +config: + paddle_version: "1.7.2" + + # hdfs/afs的配置信息填写 + fs_name: "afs://xxx.com" + fs_ugi: "usr,pwd" + + # 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + output_path: "" + + # for mpi + # 填远程数据及地址,如afs:/user/your/path/ 则此处填 /user/your/path + train_data_path: "" + test_data_path: "" + thirdparty_path: "" + +submit: + # PaddleCloud 个人信息 AK 及 SK + ak: "" + sk: "" + + # 任务运行优先级,默认high + priority: "high" + + # 任务名称 + job_name: "PaddleRec_CTR" + + # 训练资源所在组 + group: "" + + # 节点上的任务启动命令 + start_cmd: "python -m paddlerec.run -m ./config.yaml" + + # 本地需要上传到节点工作目录的文件 + files: ./*.py ./*.yaml + + # for mpi 
ps-cpu + # mpi 参数服务器模式下,任务的节点数 + nodes: 2 +``` + +#### K8S集群的Collective模式配置 + +下面是一个利用PaddleCloud提交K8S集群进行GPU训练的`backend.yaml`示例 + +首先调整`config.yaml` + +```yaml +workspace: "./" +mode: [collective_cluster] + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/train_data" + sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" + dense_slots: "dense_var:13" + +runner: +- name: collective_cluster + class: cluster_train + epochs: 2 + device: gpu + fleet_mode: collective + save_checkpoint_interval: 1 # save model interval of epochs + save_checkpoint_path: "increment_dnn" # save checkpoint path + init_model_path: "" # load model path + print_interval: 1 + phases: [phase1] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 +``` + + +再增加`backend.yaml` + +```yaml +backend: "PaddleCloud" +cluster_type: k8s # k8s 可选 + +config: + # 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2 + paddle_version: "1.7.2" + + # hdfs/afs的配置信息填写 + fs_name: "afs://xxx.com" + fs_ugi: "usr,pwd" + + # 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + output_path: "" + + # for k8s + # 填远程挂载地址,如afs:/user/your/path/ 则此处填 /user/your/path + afs_remote_mount_point: "" + +submit: + # PaddleCloud 个人信息 AK 及 SK + ak: "" + sk: "" + + # 任务运行优先级,默认high + priority: "high" + + # 任务名称 + job_name: "PaddleRec_CTR" + + # 训练资源所在组 + group: "" + + # 节点上的任务启动命令 + start_cmd: "python -m paddlerec.run -m ./config.yaml" + + # 本地需要上传到节点工作目录的文件 + files: ./*.py ./*.yaml + + # for k8s gpu + # k8s gpu 模式下,训练节点数,及每个节点上的GPU卡数 + k8s_trainers: 2 + k8s-cpu-cores: 4 + k8s_gpu_card: 1 +``` + +#### K8S集群的PS-CPU模式配置 +下面是一个利用PaddleCloud提交K8S集群进行参数服务器CPU训练的`backend.yaml`示例 + +首先调整`config.yaml`: +```yaml +workspace: "./" +mode: [ps_cluster] + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/train_data" + sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
17 18 19 20 21 22 23 24 25 26" + dense_slots: "dense_var:13" + +runner: +- name: ps_cluster + class: cluster_train + epochs: 2 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + init_model_path: "" + print_interval: 1 + phases: [phase1] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 +``` + +再新增`backend.yaml` +```yaml +backend: "PaddleCloud" +cluster_type: k8s # k8s 可选 + +config: + # 填写任务运行的paddle官方版本号 >= 1.7.2, 默认1.7.2 + paddle_version: "1.7.2" + + # hdfs/afs的配置信息填写 + fs_name: "afs://xxx.com" + fs_ugi: "usr,pwd" + + # 填任务输出目录的远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + output_path: "" + + # for k8s + # 填远程挂载地址,如afs:/user/your/path/ 则此处填 /user/your/path + afs_remote_mount_point: "" + +submit: + # PaddleCloud 个人信息 AK 及 SK + ak: "" + sk: "" + + # 任务运行优先级,默认high + priority: "high" + + # 任务名称 + job_name: "PaddleRec_CTR" + + # 训练资源所在组 + group: "" + + # 节点上的任务启动命令 + start_cmd: "python -m paddlerec.run -m ./config.yaml" + + # 本地需要上传到节点工作目录的文件 + files: ./*.py ./*.yaml + + # for k8s gpu + # k8s ps-cpu 模式下,训练节点数,参数服务器节点数,及每个节点上的cpu核心数及内存限制 + k8s_trainers: 2 + k8s-cpu-cores: 4 + k8s_ps_num: 2 + k8s_ps_cores: 4 +``` + +### 第四步:任务提交 + +当我们准备好`config.yaml`与`backend.yaml`,便可以进行一键任务提交,命令为: + +```shell +python -m paddlerec.run -m config.yaml -b backend.yaml +``` + +执行过程中会进行配置的若干check,并给出错误提示。键入提交命令后,会有以下提交信息打印在屏幕上: + +```shell +The task submission folder is generated at /home/PaddleRec/models/rank/dnn/PaddleRec_CTR_202007091308 +before_submit +gen gpu before_hook.sh +gen k8s_config.ini +gen k8s_job.sh +gen end_hook.sh +Start checking your job configuration, please be patient. +Congratulations! Job configuration check passed! +Congratulations! The new job is ready for training. 
+{ + "groupName": "xxxxxxx", + "jobId": "job-xxxxxx", + "userId": "x-x-x-x-x" +} +end submit +``` + +则代表任务已顺利提交PaddleCloud,恭喜。 + +同时,我们还可以进入`/home/PaddleRec/models/rank/dnn/PaddleRec_CTR_202007091308`这个目录检查我们的提交环境,该目录下有以下文件: + +```shell +. +├── backend.yaml # 用户定义的分布式配置backend.yaml +├── config.yaml # 用户定义的模型执行config.yaml +├── before_hook.sh # PaddleRec生成的训练前执行的脚本 +├── config.ini # PaddleRec生成的PaddleCloud环境配置 +├── end_hook.sh # PaddleRec生成的训练后执行的脚本 +├── job.sh # PaddleRec生成的PaddleCloud任务提交脚本 +└── model.py # CTR模型的组网.py文件 +``` + +该目录下的文件会被打平上传到节点的工作目录,用户可以复查PaddleRec生成的配置文件是否符合预期,如不符合预期,既可以调整backend.yaml,亦可以直接修改生成的文件,并执行: + +```shell +sh job.sh +``` +再次提交任务。 + + +## 使用PaddleCloud Client提交 + +假如你已经很熟悉PaddleCloud的使用,并且之前是用PaddleCloud-Client提交过任务,熟悉`before_hook.sh`、`config.ini`、`job.sh`,希望通过之前的方式提交PaddleCloud任务,PaddleRec也支持。 + + +我们可以不添加`backend.yaml`,直接用PaddleCloud-Client的提交要求提交任务,除了为分布式训练[修改config.yaml](#第二步更改模型运行configyaml配置)以外,有以下几个额外的步骤: + +### 第一步:在`before_hook.sh`里手动安装PaddleRec + +```shell +# before_hook.sh +echo "Run before_hook.sh ..." + +wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz + +tar -xf PaddleRec.tar.gz + +cd PaddleRec + +python setup.py install + +echo "End before_hook.sh ..." 
+``` + +### 第二步:在`config.ini`中调整超参 + +```shell +# config.ini +# 设置PADDLE_PADDLEREC_ROLE环境变量为WORKER +# 告诉PaddleRec当前运行环境在节点中,无需执行提交流程,直接执行分布式训练 +PADDLE_PADDLEREC_ROLE=WORKER +``` + +### 第三步:在`job.sh`中上传文件及修改启动命令 + +我们需要在`job.sh`中上传运行PaddleRec所需的必要文件,如运行该模型的`model.py`、`config.yaml`以及`reader.py`等,PaddleRec的框架代码无需上传,已在before_hook中安装。 + +同时还需调整启动命令(start_cmd),调整为 +```shell +python -m paddlerec.run -m config.yaml +``` + +### 第四步: 提交任务 + +直接运行: + +```shell +sh job.sh +``` + +复用之前的提交脚本执行任务的提交。 diff --git a/doc/imgs/flen.png b/doc/imgs/flen.png new file mode 100644 index 0000000000000000000000000000000000000000..b8f6cbbe5833237b7a54c60801a142182970fa9b Binary files /dev/null and b/doc/imgs/flen.png differ diff --git a/models/rank/AutoInt/__init__.py b/models/rank/AutoInt/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/rank/AutoInt/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/rank/AutoInt/config.yaml b/models/rank/AutoInt/config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..942f98c81f0eefa30bf41991d83c2fe10f0dac91 --- /dev/null +++ b/models/rank/AutoInt/config.yaml @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# global settings +debug: false +workspace: "paddlerec.models.rank.AutoInt" + + +dataset: + - name: train_sample + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/../dataset/Criteo_data/sample_data/train" + sparse_slots: "label feat_idx" + dense_slots: "feat_value:39" + - name: infer_sample + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/../dataset/Criteo_data/sample_data/train" + sparse_slots: "label feat_idx" + dense_slots: "feat_value:39" + +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + sparse_feature_number: 1086460 + sparse_feature_dim: 96 + num_field: 39 + d_model: 96 + d_key: 16 + d_value: 16 + n_head: 6 + dropout_rate: 0 + n_interacting_layers: 1 + + + +mode: train_runner +# if infer, change mode to "infer_runner" and change phase to "infer_phase" + +runner: + - name: train_runner + class: train + epochs: 2 + device: cpu + init_model_path: "" + save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment" + save_inference_path: "inference" + print_interval: 1 + - name: infer_runner + class: infer + device: cpu + init_model_path: "increment/0" + print_interval: 1 + + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: train_sample + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/AutoInt/model.py 
b/models/rank/AutoInt/model.py new file mode 100755 index 0000000000000000000000000000000000000000..77af923bfdc0963c637b3fabd4119294c69dacb5 --- /dev/null +++ b/models/rank/AutoInt/model.py @@ -0,0 +1,223 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle.fluid as fluid + +from paddlerec.core.utils import envs +from paddlerec.core.model import ModelBase + + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + + def _init_hyper_parameters(self): + self.sparse_feature_number = envs.get_global_env( + "hyper_parameters.sparse_feature_number", None) + self.sparse_feature_dim = envs.get_global_env( + "hyper_parameters.sparse_feature_dim", None) + self.num_field = envs.get_global_env("hyper_parameters.num_field", + None) + self.d_model = envs.get_global_env("hyper_parameters.d_model", None) + self.d_key = envs.get_global_env("hyper_parameters.d_key", None) + self.d_value = envs.get_global_env("hyper_parameters.d_value", None) + self.n_head = envs.get_global_env("hyper_parameters.n_head", None) + self.dropout_rate = envs.get_global_env( + "hyper_parameters.dropout_rate", 0) + self.n_interacting_layers = envs.get_global_env( + "hyper_parameters.n_interacting_layers", 1) + + def multi_head_attention(self, queries, keys, values, d_key, d_value, + d_model, n_head, dropout_rate): + keys = queries if keys is None else keys + values = keys if values 
is None else values
+        if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3
+                ):
+            raise ValueError(
+                "Inputs: queries, keys and values should all be 3-D tensors.")
+
+        def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+            """
+            Add linear projection to queries, keys, and values.
+            """
+            q = fluid.layers.fc(input=queries,
+                                size=d_key * n_head,
+                                bias_attr=False,
+                                num_flatten_dims=2)
+            k = fluid.layers.fc(input=keys,
+                                size=d_key * n_head,
+                                bias_attr=False,
+                                num_flatten_dims=2)
+            v = fluid.layers.fc(input=values,
+                                size=d_value * n_head,
+                                bias_attr=False,
+                                num_flatten_dims=2)
+            return q, k, v
+
+        def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
+            """
+            Reshape input tensors at the last dimension to split multi-heads
+            and then transpose. Specifically, transform the input tensor with shape
+            [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
+            with shape [bs, n_head, max_sequence_length, hidden_dim].
+            """
+            # The value 0 in shape attr means copying the corresponding dimension
+            # size of the input as the output dimension size.
+            reshaped_q = fluid.layers.reshape(
+                x=queries, shape=[0, 0, n_head, d_key], inplace=True)
+            # permute the dimensions into:
+            # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+            q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
+            # For encoder-decoder attention in inference, insert the ops and vars
+            # into global block to use as cache among beam search.
+            reshaped_k = fluid.layers.reshape(
+                x=keys, shape=[0, 0, n_head, d_key], inplace=True)
+            k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
+            reshaped_v = fluid.layers.reshape(
+                x=values, shape=[0, 0, n_head, d_value], inplace=True)
+            v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
+
+            return q, k, v
+
+        def scaled_dot_product_attention(q, k, v, d_key, dropout_rate):
+            """
+            Scaled Dot-Product Attention
+            """
+            product = fluid.layers.matmul(
+                x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
+
+            weights = fluid.layers.softmax(product)
+            if dropout_rate:
+                weights = fluid.layers.dropout(
+                    weights,
+                    dropout_prob=dropout_rate,
+                    seed=None,
+                    is_test=False)
+            out = fluid.layers.matmul(weights, v)
+            return out
+
+        def __combine_heads(x):
+            """
+            Transpose and then reshape the last two dimensions of input tensor x
+            so that it becomes one dimension, which is reverse to __split_heads.
+            """
+            if len(x.shape) != 4:
+                raise ValueError("Input(x) should be a 4-D Tensor.")
+
+            trans_x = fluid.layers.transpose(x, perm=[0, 2, 1, 3])
+            # The value 0 in shape attr means copying the corresponding dimension
+            # size of the input as the output dimension size.
+ return fluid.layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=True) + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, self.d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + return out + + def interacting_layer(self, x): + attention_out = self.multi_head_attention( + x, None, None, self.d_key, self.d_value, self.d_model, self.n_head, + self.dropout_rate) + W_0_x = fluid.layers.fc(input=x, + size=self.d_model, + bias_attr=False, + num_flatten_dims=2) + res_out = fluid.layers.relu(attention_out + W_0_x) + + return res_out + + def net(self, inputs, is_infer=False): + init_value_ = 0.1 + is_distributed = True if envs.get_trainer() == "CtrTrainer" else False + + # ------------------------- network input -------------------------- + + raw_feat_idx = self._sparse_data_var[1] + raw_feat_value = self._dense_data_var[0] + self.label = self._sparse_data_var[0] + + feat_idx = raw_feat_idx + feat_value = fluid.layers.reshape( + raw_feat_value, [-1, self.num_field, 1]) # None * num_field * 1 + + # ------------------------- Embedding -------------------------- + + feat_embeddings_re = fluid.embedding( + input=feat_idx, + is_sparse=True, + is_distributed=is_distributed, + dtype='float32', + size=[self.sparse_feature_number + 1, self.sparse_feature_dim], + padding_idx=0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormalInitializer( + loc=0.0, + scale=init_value_ / + math.sqrt(float(self.sparse_feature_dim))))) + feat_embeddings = fluid.layers.reshape( + feat_embeddings_re, + shape=[-1, self.num_field, self.sparse_feature_dim + ]) # None * num_field * embedding_size + # None * num_field * embedding_size + feat_embeddings = feat_embeddings * feat_value + + inter_input = feat_embeddings + + # ------------------------- interacting layer 
-------------------------- + + for _ in range(self.n_interacting_layers): + interacting_layer_out = self.interacting_layer(inter_input) + inter_input = interacting_layer_out + + # ------------------------- DNN -------------------------- + + dnn_input = fluid.layers.flatten(interacting_layer_out, axis=1) + + y_dnn = fluid.layers.fc( + input=dnn_input, + size=1, + act=None, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormalInitializer( + loc=0.0, scale=init_value_)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.TruncatedNormalInitializer( + loc=0.0, scale=init_value_))) + + self.predict = fluid.layers.sigmoid(y_dnn) + cost = fluid.layers.log_loss( + input=self.predict, label=fluid.layers.cast(self.label, "float32")) + avg_cost = fluid.layers.reduce_sum(cost) + + self._cost = avg_cost + + predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) + label_int = fluid.layers.cast(self.label, 'int64') + auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, + label=label_int, + slide_steps=0) + self._metrics["AUC"] = auc_var + self._metrics["BATCH_AUC"] = batch_auc_var + if is_infer: + self._infer_results["AUC"] = auc_var diff --git a/models/rank/BST/__init__.py b/models/rank/BST/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/rank/BST/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/rank/BST/config.yaml b/models/rank/BST/config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..73e39f19576f617dd83b813a7a12d626446c6f27 --- /dev/null +++ b/models/rank/BST/config.yaml @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# global settings +debug: false +workspace: "paddlerec.models.rank.BST" + +dataset: +- name: sample_1 + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + sparse_slots: "label history cate position target target_cate target_position" +- name: infer_sample + type: DataLoader + batch_size: 5 + data_path: "{workspace}/data/train_data" + sparse_slots: "label history cate position target target_cate target_position" + +hyper_parameters: + optimizer: + class: SGD + learning_rate: 0.0001 + use_DataLoader: True + item_emb_size: 96 + cat_emb_size: 96 + position_emb_size: 96 + is_sparse: False + item_count: 63001 + cat_count: 801 + position_count: 5001 + n_encoder_layers: 1 + d_model: 288 + d_key: 48 + d_value: 48 + n_head: 6 + dropout_rate: 0 + postprocess_cmd: "da" + prepostprocess_dropout: 0 + d_inner_hid: 512 + relu_dropout: 0.0 + act: "relu" + fc_sizes: [1024, 512, 256] + + +mode: train_runner + +runner: + - name: train_runner + class: train + epochs: 1 + device: cpu + init_model_path: "" 
+ save_checkpoint_interval: 1 + save_inference_interval: 1 + save_checkpoint_path: "increment_BST" + save_inference_path: "inference_BST" + print_interval: 1 + - name: infer_runner + class: infer + device: cpu + init_model_path: "increment_BST/0" + print_interval: 1 + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: sample_1 + thread_num: 1 +#- name: infer_phase +# model: "{workspace}/model.py" +# dataset_name: infer_sample +# thread_num: 1 diff --git a/models/rank/BST/data/build_dataset.py b/models/rank/BST/data/build_dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..137d8652d61cc7be9eb074e15942de8e5cce19d9 --- /dev/null +++ b/models/rank/BST/data/build_dataset.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import random +import pickle + +random.seed(1234) + +print("read and process data") + +with open('./raw_data/remap.pkl', 'rb') as f: + reviews_df = pickle.load(f) + cate_list = pickle.load(f) + user_count, item_count, cate_count, example_count = pickle.load(f) + +train_set = [] +test_set = [] +for reviewerID, hist in reviews_df.groupby('reviewerID'): + pos_list = hist['asin'].tolist() + time_list = hist['unixReviewTime'].tolist() + + def gen_neg(): + neg = pos_list[0] + while neg in pos_list: + neg = random.randint(0, item_count - 1) + return neg + + neg_list = [gen_neg() for i in range(len(pos_list))] + + for i in range(1, len(pos_list)): + hist = pos_list[:i] + # set maximum position value + time_seq = [ + min(int((time_list[i] - time_list[j]) / (3600 * 24)), 5000) + for j in range(i) + ] + if i != len(pos_list) - 1: + train_set.append((reviewerID, hist, pos_list[i], 1, time_seq)) + train_set.append((reviewerID, hist, neg_list[i], 0, time_seq)) + else: + label = (pos_list[i], neg_list[i]) + test_set.append((reviewerID, hist, label, time_seq)) + +random.shuffle(train_set) +random.shuffle(test_set) + +assert len(test_set) == user_count + + +def print_to_file(data, fout, slot): + if not isinstance(data, list): + data = [data] + for i in range(len(data)): + fout.write(slot + ":" + str(data[i])) + fout.write(' ') + + +print("make train data") +with open("paddle_train.txt", "w") as fout: + for line in train_set: + history = line[1] + target = line[2] + label = line[3] + position = line[4] + cate = [cate_list[x] for x in history] + print_to_file(history, fout, "history") + print_to_file(cate, fout, "cate") + print_to_file(position, fout, "position") + print_to_file(target, fout, "target") + print_to_file(cate_list[target], fout, "target_cate") + print_to_file(0, fout, "target_position") + print_to_file(label, fout, "label") + fout.write("\n") + +print("make test data") +with open("paddle_test.txt", "w") as fout: + for line in 
test_set:
+        history = line[1]
+        target = line[2]
+        position = line[3]
+        cate = [cate_list[x] for x in history]
+
+        print_to_file(history, fout, "history")
+        print_to_file(cate, fout, "cate")
+        print_to_file(position, fout, "position")
+        print_to_file(target[0], fout, "target")
+        print_to_file(cate_list[target[0]], fout, "target_cate")
+        print_to_file(0, fout, "target_position")
+        fout.write("label:1\n")
+
+        print_to_file(history, fout, "history")
+        print_to_file(cate, fout, "cate")
+        print_to_file(position, fout, "position")
+        print_to_file(target[0], fout, "target")
+        print_to_file(cate_list[target[1]], fout, "target_cate")
+        print_to_file(0, fout, "target_position")
+        fout.write("label:0\n")
+
+print("make config data")
+with open('config.txt', 'w') as f:
+    f.write(str(user_count) + "\n")
+    f.write(str(item_count) + "\n")
+    f.write(str(cate_count) + "\n")
+    f.write(str(50000) + "\n")
diff --git a/models/rank/BST/data/convert_pd.py b/models/rank/BST/data/convert_pd.py
new file mode 100755
index 0000000000000000000000000000000000000000..a66290e1561084a10756ab98c3d70b9a5ac5a6ed
--- /dev/null
+++ b/models/rank/BST/data/convert_pd.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function +import pickle +import pandas as pd + + +def to_df(file_path): + with open(file_path, 'r') as fin: + df = {} + i = 0 + for line in fin: + df[i] = eval(line) + i += 1 + df = pd.DataFrame.from_dict(df, orient='index') + return df + + +print("start to analyse reviews_Electronics_5.json") +reviews_df = to_df('./raw_data/reviews_Electronics_5.json') +with open('./raw_data/reviews.pkl', 'wb') as f: + pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) + +print("start to analyse meta_Electronics.json") +meta_df = to_df('./raw_data/meta_Electronics.json') +meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())] +meta_df = meta_df.reset_index(drop=True) +with open('./raw_data/meta.pkl', 'wb') as f: + pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL) diff --git a/models/rank/BST/data/data_process.sh b/models/rank/BST/data/data_process.sh new file mode 100755 index 0000000000000000000000000000000000000000..7bcfc55f43119315d543e06f16fe0ebc0fecb9fc --- /dev/null +++ b/models/rank/BST/data/data_process.sh @@ -0,0 +1,15 @@ +#! /bin/bash + +set -e +echo "begin download data" +mkdir raw_data +cd raw_data +wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +gzip -d reviews_Electronics_5.json.gz +wget -c http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz +gzip -d meta_Electronics.json.gz +echo "download data successfully" + +cd .. +python convert_pd.py +python remap_id.py diff --git a/models/rank/BST/data/remap_id.py b/models/rank/BST/data/remap_id.py new file mode 100755 index 0000000000000000000000000000000000000000..ee6983d7f0769a58352f61a0a05bbd81c6ccbc13 --- /dev/null +++ b/models/rank/BST/data/remap_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import random +import pickle +import numpy as np + +random.seed(1234) + +with open('./raw_data/reviews.pkl', 'rb') as f: + reviews_df = pickle.load(f) + reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] +with open('./raw_data/meta.pkl', 'rb') as f: + meta_df = pickle.load(f) + meta_df = meta_df[['asin', 'categories']] + meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1]) + + +def build_map(df, col_name): + key = sorted(df[col_name].unique().tolist()) + m = dict(zip(key, range(len(key)))) + df[col_name] = df[col_name].map(lambda x: m[x]) + return m, key + + +asin_map, asin_key = build_map(meta_df, 'asin') +cate_map, cate_key = build_map(meta_df, 'categories') +revi_map, revi_key = build_map(reviews_df, 'reviewerID') + +user_count, item_count, cate_count, example_count =\ + len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0] +print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' % + (user_count, item_count, cate_count, example_count)) + +meta_df = meta_df.sort_values('asin') +meta_df = meta_df.reset_index(drop=True) +reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x]) +reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime']) +reviews_df = reviews_df.reset_index(drop=True) +reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']] + +cate_list = [meta_df['categories'][i] for i in range(len(asin_map))] +cate_list = np.array(cate_list, dtype=np.int32) + +with open('./raw_data/remap.pkl', 'wb') as f: + 
pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) # uid, iid + pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL) # cid of iid line + pickle.dump((user_count, item_count, cate_count, example_count), f, + pickle.HIGHEST_PROTOCOL) + pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL) diff --git a/models/rank/BST/data/train_data/paddle_train.100.txt b/models/rank/BST/data/train_data/paddle_train.100.txt new file mode 100755 index 0000000000000000000000000000000000000000..a65d9341d9ae2150d7ff55a7da8a7f4304b85d08 --- /dev/null +++ b/models/rank/BST/data/train_data/paddle_train.100.txt @@ -0,0 +1,100 @@ +history:3737 history:19450 cate:288 cate:196 position:518 position:158 target:18486 target_cate:674 label:1 +history:3647 history:4342 history:6855 history:3805 cate:281 cate:463 cate:558 cate:674 position:242 position:216 position:17 position:5 target:4206 target_cate:463 label:1 +history:1805 history:4309 cate:87 cate:87 position:61 position:0 target:21354 target_cate:556 label:1 +history:18209 history:20753 cate:649 cate:241 position:0 position:0 target:51924 target_cate:610 label:0 +history:13150 cate:351 position:505 target:41455 target_cate:792 label:1 +history:35120 history:40418 cate:157 cate:714 position:0 position:0 target:52035 target_cate:724 label:0 +history:13515 history:20363 history:25356 history:26891 history:24200 history:11694 history:33378 history:34483 history:35370 history:27311 history:40689 history:33319 history:28819 cate:558 cate:123 cate:61 cate:110 cate:738 cate:692 cate:110 cate:629 cate:714 cate:463 cate:281 cate:142 cate:382 position:1612 position:991 position:815 position:668 position:639 position:508 position:456 position:431 position:409 position:222 position:221 position:74 position:34 target:45554 target_cate:558 label:1 +history:19254 history:9021 history:28156 history:19193 history:24602 history:31171 cate:189 cate:462 cate:140 cate:474 cate:157 cate:614 position:375 position:144 position:141 position:0 
position:0 position:0 target:48895 target_cate:350 label:1 +history:4716 cate:194 position:2457 target:32497 target_cate:484 label:1 +history:43799 history:47108 cate:368 cate:140 position:181 position:105 target:3503 target_cate:25 label:0 +history:20554 history:41800 history:1582 history:1951 cate:339 cate:776 cate:694 cate:703 position:35 position:35 position:0 position:0 target:4320 target_cate:234 label:0 +history:39713 history:44272 history:45136 history:11687 cate:339 cate:339 cate:339 cate:140 position:40 position:40 position:40 position:0 target:885 target_cate:168 label:0 +history:14398 history:33997 cate:756 cate:347 position:73 position:73 target:20438 target_cate:703 label:1 +history:29341 history:25727 cate:142 cate:616 position:839 position:0 target:4170 target_cate:512 label:0 +history:12197 history:10212 cate:558 cate:694 position:1253 position:677 target:31559 target_cate:24 label:0 +history:11551 cate:351 position:47 target:53485 target_cate:436 label:1 +history:4553 cate:196 position:88 target:7331 target_cate:158 label:1 +history:15190 history:19994 history:33946 history:30716 history:31879 history:45178 history:51598 history:46814 cate:249 cate:498 cate:612 cate:142 cate:746 cate:746 cate:558 cate:174 position:1912 position:1275 position:1170 position:1122 position:773 position:773 position:329 position:291 target:24353 target_cate:251 label:0 +history:4931 history:2200 history:8338 history:23530 cate:785 cate:792 cate:277 cate:523 position:1360 position:975 position:975 position:586 target:3525 target_cate:251 label:0 +history:8881 history:13274 history:12683 history:14696 history:27693 history:1395 history:44373 history:59704 history:27762 history:54268 history:30326 history:11811 history:45371 history:51598 history:55859 history:56039 history:57678 history:47250 history:2073 history:38932 cate:479 cate:558 cate:190 cate:708 cate:335 cate:684 cate:339 cate:725 cate:446 cate:446 cate:44 cate:575 cate:280 cate:558 cate:262 cate:197 cate:368 
cate:111 cate:749 cate:188 position:2065 position:2065 position:1292 position:1108 position:647 position:343 position:343 position:343 position:257 position:257 position:143 position:76 position:76 position:76 position:76 position:76 position:76 position:58 position:6 position:6 target:12361 target_cate:616 label:1 +history:16297 history:16797 history:18629 history:20922 history:16727 history:33946 history:51165 history:36796 cate:281 cate:436 cate:462 cate:339 cate:611 cate:612 cate:288 cate:64 position:1324 position:1324 position:1324 position:1118 position:183 position:133 position:6 position:4 target:34724 target_cate:288 label:1 +history:22237 cate:188 position:339 target:40786 target_cate:637 label:0 +history:5396 history:39993 history:42681 history:49832 history:11208 history:34954 history:36523 history:45523 history:51618 cate:351 cate:339 cate:687 cate:281 cate:708 cate:142 cate:629 cate:656 cate:142 position:1117 position:290 position:276 position:191 position:144 position:144 position:120 position:66 position:66 target:38201 target_cate:571 label:0 +history:8881 history:9029 history:17043 history:16620 history:15021 history:32706 cate:479 cate:110 cate:110 cate:749 cate:598 cate:251 position:1218 position:1218 position:790 position:695 position:264 position:1 target:34941 target_cate:657 label:0 +history:53255 cate:444 position:232 target:37953 target_cate:724 label:1 +history:1010 history:4172 history:8613 history:11562 history:11709 history:13118 history:2027 history:15446 cate:674 cate:606 cate:708 cate:436 cate:179 cate:179 cate:692 cate:436 position:324 position:323 position:323 position:323 position:323 position:308 position:307 position:307 target:36998 target_cate:703 label:0 +history:22357 history:24305 history:15222 history:19254 history:22914 cate:189 cate:504 cate:113 cate:189 cate:714 position:321 position:321 position:232 position:232 position:232 target:18201 target_cate:398 label:1 +history:1905 cate:694 position:0 target:23877 
target_cate:347 label:1 +history:8444 history:17868 cate:765 cate:712 position:454 position:0 target:50732 target_cate:44 label:0 +history:42301 history:26186 history:38086 cate:142 cate:450 cate:744 position:164 position:0 position:0 target:61547 target_cate:714 label:0 +history:18156 history:35717 history:32070 history:45650 history:47208 history:20975 history:36409 history:44856 history:48072 history:15860 history:47043 history:53289 history:53314 history:33470 history:47926 cate:157 cate:281 cate:650 cate:142 cate:749 cate:291 cate:707 cate:714 cate:157 cate:205 cate:388 cate:474 cate:708 cate:498 cate:495 position:546 position:506 position:296 position:296 position:263 position:253 position:253 position:221 position:121 position:26 position:26 position:26 position:26 position:0 position:0 target:48170 target_cate:746 label:1 +history:56219 cate:108 position:0 target:1988 target_cate:389 label:0 +history:22907 cate:83 position:353 target:752 target_cate:175 label:0 +history:22009 history:32410 history:42987 history:48720 history:683 history:1289 history:2731 history:4736 history:6306 history:8442 history:8946 history:9928 history:11536 history:14947 history:15793 history:16694 history:21736 history:25156 history:25797 history:25874 history:26573 history:30318 history:33946 history:35420 history:1492 history:5236 history:5555 history:6625 history:8867 history:9638 history:11443 history:20225 history:25965 history:27273 history:29001 history:35302 history:42336 history:43347 history:36907 history:2012 cate:317 cate:462 cate:291 cate:142 cate:694 cate:10 cate:574 cate:278 cate:708 cate:281 cate:131 cate:142 cate:367 cate:281 cate:258 cate:345 cate:616 cate:708 cate:111 cate:115 cate:339 cate:113 cate:612 cate:24 cate:368 cate:616 cate:39 cate:197 cate:44 cate:214 cate:558 cate:108 cate:616 cate:558 cate:210 cate:210 cate:142 cate:142 cate:262 cate:351 position:390 position:390 position:390 position:390 position:389 position:389 position:389 position:389 
position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:389 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:383 position:366 position:339 position:333 position:167 target:25540 target_cate:701 label:0 +history:20434 cate:196 position:610 target:18056 target_cate:189 label:0 +history:628 history:5461 cate:194 cate:234 position:294 position:74 target:43677 target_cate:351 label:0 +history:16953 history:15149 history:45143 history:23587 history:5094 history:25105 history:51913 history:54645 cate:484 cate:281 cate:449 cate:792 cate:524 cate:395 cate:388 cate:731 position:1134 position:668 position:626 position:409 position:285 position:285 position:285 position:42 target:57655 target_cate:75 label:1 +history:13584 history:7509 cate:234 cate:744 position:1187 position:231 target:33062 target_cate:749 label:1 +history:170 history:208 history:77 history:109 history:738 history:742 history:1118 history:15349 history:255 history:12067 history:21643 history:55453 cate:330 cate:559 cate:744 cate:115 cate:558 cate:674 cate:111 cate:351 cate:694 cate:694 cate:746 cate:111 position:4920 position:4726 position:4585 position:4585 position:4585 position:4584 position:4108 position:1418 position:1326 position:274 position:89 position:88 target:9821 target_cate:694 label:1 +history:4970 history:16672 cate:540 cate:746 position:416 position:120 target:25685 target_cate:666 label:1 +history:17240 history:60546 cate:708 cate:629 position:165 position:41 target:42110 target_cate:142 label:1 +history:31503 history:31226 history:50628 history:22444 cate:142 cate:156 cate:142 cate:203 position:187 position:162 position:109 position:0 target:47812 target_cate:749 label:0 +history:2443 history:1763 history:3403 history:4225 history:8951 cate:25 
cate:707 cate:351 cate:177 cate:351 position:1397 position:1113 position:973 position:637 position:254 target:7954 target_cate:351 label:1 +history:3748 cate:351 position:1086 target:9171 target_cate:657 label:1 +history:1755 history:26204 history:42716 history:32991 cate:446 cate:188 cate:497 cate:746 position:440 position:184 position:91 position:52 target:23910 target_cate:395 label:1 +history:20637 history:27122 cate:558 cate:44 position:1122 position:0 target:19669 target_cate:301 label:0 +history:406 history:872 history:306 history:218 history:883 history:1372 history:1705 history:1709 history:7774 history:2376 history:2879 history:2881 history:13329 history:4992 history:13594 history:11106 history:7131 history:8631 history:1736 history:17585 history:2568 history:16896 history:21971 history:10296 history:22361 history:24108 history:23300 history:11793 history:25351 history:2648 history:24593 history:12692 history:23883 history:25345 history:27129 history:26321 history:21627 history:20738 history:17784 history:28785 history:29281 history:28366 history:24723 history:24319 history:12083 history:29882 history:29974 history:30443 history:30428 history:17072 history:9783 history:16700 history:29421 history:32253 history:28830 history:31299 history:28792 history:33931 history:24973 history:33112 history:21717 history:28339 history:23978 history:18649 history:1841 history:17635 history:19696 history:37448 history:20862 history:30492 history:35736 history:37450 history:2633 history:8675 history:17412 history:25960 history:28389 history:31032 history:37157 history:14555 history:4996 history:33388 history:33393 history:36237 history:38946 history:22793 history:24337 history:34963 history:38819 history:41165 history:39551 history:43019 history:15570 history:25129 history:34593 history:38385 history:42915 history:41407 history:29907 history:31289 history:44229 history:24267 history:34975 history:39462 history:33274 history:43251 history:38302 history:35502 history:44056 
history:44675 history:45233 history:47690 history:33472 history:50149 history:29409 history:47183 history:49188 history:48192 history:50628 history:24103 history:28313 history:28358 history:38882 history:44330 history:44346 history:2019 history:2484 history:2675 history:26396 history:48143 history:46039 history:47722 history:48559 history:41719 history:41720 history:43920 history:41983 history:51235 history:34964 history:27287 history:51915 history:33586 history:43630 history:47258 history:52137 history:40954 history:35120 history:29572 history:42405 history:53559 history:44900 history:45761 cate:241 cate:558 cate:395 cate:368 cate:498 cate:110 cate:463 cate:611 cate:558 cate:106 cate:10 cate:112 cate:251 cate:241 cate:48 cate:112 cate:601 cate:674 cate:241 cate:347 cate:733 cate:502 cate:194 cate:119 cate:179 cate:179 cate:578 cate:692 cate:281 cate:115 cate:523 cate:113 cate:281 cate:35 cate:765 cate:196 cate:339 cate:115 cate:90 cate:164 cate:790 cate:708 cate:142 cate:115 cate:342 cate:351 cate:391 cate:281 cate:48 cate:119 cate:74 cate:505 cate:606 cate:68 cate:239 cate:687 cate:687 cate:281 cate:110 cate:281 cate:449 cate:351 cate:38 cate:351 cate:164 cate:176 cate:449 cate:115 cate:70 cate:25 cate:687 cate:115 cate:39 cate:756 cate:35 cate:175 cate:704 cate:119 cate:38 cate:53 cate:115 cate:38 cate:38 cate:142 cate:262 cate:188 cate:614 cate:277 cate:388 cate:615 cate:49 cate:738 cate:106 cate:733 cate:486 cate:666 cate:571 cate:385 cate:708 cate:119 cate:331 cate:463 cate:578 cate:288 cate:142 cate:106 cate:611 cate:611 cate:39 cate:523 cate:388 cate:142 cate:726 cate:702 cate:498 cate:61 cate:142 cate:714 cate:142 cate:654 cate:277 cate:733 cate:603 cate:498 cate:299 cate:97 cate:726 cate:115 cate:637 cate:703 cate:558 cate:74 cate:629 cate:142 cate:142 cate:347 cate:629 cate:746 cate:277 cate:8 cate:49 cate:389 cate:629 cate:408 cate:733 cate:345 cate:157 cate:704 cate:115 cate:398 cate:611 cate:239 position:3925 position:3925 position:3909 position:3897 
position:3879 position:3644 position:3611 position:3524 position:2264 position:1913 position:1730 position:1730 position:1684 position:1657 position:1643 position:1626 position:1566 position:1430 position:1375 position:1351 position:1298 position:1298 position:1221 position:1217 position:1177 position:1149 position:1142 position:1141 position:1083 position:1079 position:1067 position:1045 position:1031 position:997 position:994 position:993 position:987 position:968 position:946 position:945 position:905 position:904 position:903 position:897 position:856 position:855 position:813 position:813 position:801 position:799 position:798 position:791 position:791 position:767 position:765 position:761 position:756 position:751 position:747 position:730 position:672 position:659 position:652 position:620 position:619 position:619 position:597 position:596 position:582 position:555 position:555 position:532 position:484 position:484 position:484 position:483 position:483 position:468 position:468 position:467 position:454 position:454 position:454 position:441 position:427 position:409 position:409 position:409 position:409 position:409 position:387 position:387 position:381 position:381 position:381 position:360 position:360 position:357 position:355 position:337 position:332 position:317 position:294 position:271 position:213 position:206 position:204 position:202 position:202 position:182 position:182 position:173 position:154 position:142 position:135 position:114 position:110 position:107 position:107 position:95 position:95 position:95 position:95 position:94 position:92 position:90 position:90 position:90 position:90 position:90 position:86 position:86 position:86 position:84 position:84 position:84 position:83 position:83 position:80 position:65 position:51 position:41 position:23 position:23 position:23 position:22 position:18 position:7 position:3 position:3 position:0 position:0 target:49174 target_cate:368 label:0 +history:29206 history:60955 cate:351 cate:684 
position:32 position:32 target:61590 target_cate:76 label:1 +history:8427 history:9692 history:4411 history:3266 history:18234 history:22774 cate:746 cate:281 cate:396 cate:651 cate:446 cate:44 position:1204 position:1129 position:808 position:622 position:134 position:134 target:23393 target_cate:351 label:0 +history:13051 history:15844 history:9347 history:21973 history:18365 history:24220 history:28429 history:4799 history:27488 history:21623 history:13870 history:29346 history:27208 history:31075 history:31635 history:28390 history:30777 history:29334 history:33438 history:16469 history:29423 history:29237 history:25527 history:34808 history:37656 history:21324 history:38263 history:6699 history:33167 history:9295 history:40828 history:18894 cate:339 cate:342 cate:657 cate:194 cate:20 cate:466 cate:179 cate:225 cate:436 cate:364 cate:707 cate:115 cate:36 cate:523 cate:351 cate:674 cate:694 cate:391 cate:674 cate:500 cate:342 cate:216 cate:707 cate:345 cate:616 cate:495 cate:436 cate:363 cate:395 cate:189 cate:203 cate:766 position:1400 position:1032 position:849 position:827 position:804 position:469 position:467 position:463 position:460 position:456 position:455 position:451 position:371 position:371 position:371 position:315 position:315 position:314 position:311 position:287 position:282 position:281 position:239 position:105 position:105 position:70 position:70 position:67 position:56 position:45 position:42 position:6 target:56816 target_cate:396 label:0 +history:5653 history:18042 history:21137 history:17277 history:23847 history:25109 history:21837 history:17163 history:22786 history:27380 history:20789 history:27737 history:30164 history:36402 history:37166 history:38647 history:31746 history:38915 history:38366 history:11151 history:43757 history:38284 history:29817 history:41717 history:41899 history:43279 history:47539 history:37850 history:39789 history:43817 history:11208 history:53361 history:29247 history:51483 history:39940 history:50917 
history:53618 history:44055 history:48997 cate:593 cate:251 cate:616 cate:110 cate:110 cate:110 cate:110 cate:105 cate:436 cate:558 cate:311 cate:142 cate:603 cate:738 cate:398 cate:766 cate:1 cate:351 cate:142 cate:584 cate:674 cate:597 cate:142 cate:483 cate:351 cate:157 cate:373 cate:142 cate:629 cate:39 cate:708 cate:251 cate:339 cate:142 cate:262 cate:1 cate:113 cate:142 cate:462 position:1285 position:1258 position:1252 position:1206 position:1206 position:1206 position:1205 position:1194 position:1187 position:992 position:804 position:791 position:703 position:670 position:640 position:549 position:548 position:542 position:489 position:480 position:479 position:455 position:422 position:393 position:319 position:296 position:274 position:266 position:266 position:266 position:222 position:141 position:127 position:127 position:114 position:88 position:56 position:22 position:8 target:13418 target_cate:558 label:0 +history:8719 history:11172 cate:311 cate:217 position:0 position:0 target:11707 target_cate:179 label:1 +history:14968 history:8297 history:22914 history:5998 history:20253 history:41425 history:42664 history:46745 history:51179 history:33481 history:46814 history:55135 history:53124 history:61559 cate:463 cate:766 cate:714 cate:486 cate:628 cate:444 cate:281 cate:714 cate:142 cate:242 cate:174 cate:118 cate:714 cate:714 position:2006 position:1413 position:1323 position:1148 position:977 position:777 position:589 position:487 position:486 position:403 position:349 position:297 position:78 position:12 target:61908 target_cate:714 label:1 +history:61119 cate:714 position:99 target:22907 target_cate:83 label:0 +history:26172 cate:157 position:258 target:54529 target_cate:44 label:0 +history:13830 history:10377 history:8193 history:16072 history:13543 history:18741 history:24205 history:18281 history:37272 history:27784 history:16658 history:27884 cate:384 cate:739 cate:558 cate:739 cate:135 cate:347 cate:558 cate:687 cate:498 cate:142 cate:197 
cate:746 position:1447 position:1443 position:1380 position:1312 position:936 position:876 position:695 position:523 position:55 position:25 position:24 position:20 target:34463 target_cate:177 label:1 +history:20842 history:11756 history:22110 history:30562 history:30697 cate:189 cate:68 cate:483 cate:776 cate:225 position:516 position:55 position:21 position:21 position:21 target:49113 target_cate:483 label:0 +history:13646 history:46782 history:54138 cate:142 cate:798 cate:142 position:604 position:346 position:200 target:43698 target_cate:347 label:0 +history:36434 cate:241 position:31 target:51537 target_cate:629 label:0 +history:44121 history:35325 cate:397 cate:653 position:809 position:0 target:43399 target_cate:397 label:1 +history:6438 history:11107 history:20073 history:25026 history:24434 history:35533 history:6318 history:25028 history:28352 history:32359 history:25734 history:26280 history:41466 history:25192 history:1909 history:11753 history:17770 history:24301 history:1728 history:9693 history:36444 history:40256 history:17961 history:36780 history:41093 history:8788 history:439 history:46397 history:46269 history:50462 history:40395 history:437 history:2582 history:4455 history:12361 history:14325 history:22294 history:26153 history:26607 history:29205 history:29878 history:33491 history:38795 history:41585 history:45480 history:51567 history:54245 history:19796 history:52446 cate:356 cate:194 cate:389 cate:89 cate:474 cate:330 cate:347 cate:384 cate:330 cate:90 cate:19 cate:385 cate:177 cate:68 cate:624 cate:68 cate:674 cate:463 cate:624 cate:194 cate:177 cate:389 cate:197 cate:642 cate:239 cate:111 cate:115 cate:113 cate:48 cate:251 cate:554 cate:115 cate:36 cate:163 cate:616 cate:524 cate:84 cate:190 cate:465 cate:398 cate:89 cate:166 cate:113 cate:330 cate:616 cate:449 cate:90 cate:140 cate:330 position:971 position:969 position:969 position:969 position:934 position:934 position:921 position:921 position:921 position:921 position:861 
position:794 position:691 position:690 position:689 position:689 position:689 position:686 position:683 position:683 position:681 position:656 position:408 position:341 position:341 position:278 position:276 position:275 position:229 position:226 position:210 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:139 position:119 position:110 position:105 target:15142 target_cate:764 label:0 +history:1573 cate:540 position:0 target:18294 target_cate:463 label:1 +history:9837 history:13438 history:13690 cate:351 cate:629 cate:24 position:287 position:287 position:287 target:26044 target_cate:351 label:0 +history:1708 history:2675 history:4935 history:7401 history:14413 history:22177 history:30319 history:32217 history:34342 history:40235 history:42963 history:43949 history:54816 cate:463 cate:115 cate:474 cate:616 cate:474 cate:44 cate:113 cate:279 cate:164 cate:142 cate:616 cate:649 cate:36 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 position:6 target:31992 target_cate:115 label:0 +history:8025 history:11769 history:36188 history:42006 cate:142 cate:262 cate:714 cate:142 position:1107 position:1107 position:21 position:13 target:8209 target_cate:142 label:0 +history:30266 cate:176 position:0 target:44167 target_cate:692 label:0 +history:13000 history:14769 history:2940 history:27638 history:23158 cate:765 cate:27 cate:736 cate:554 cate:112 position:1155 position:797 position:348 position:348 position:334 target:55050 target_cate:725 label:0 +history:32557 history:18668 history:43441 cate:765 cate:707 cate:396 position:1 position:0 position:0 target:44217 target_cate:681 label:1 +history:5665 history:5964 history:18874 cate:542 cate:746 cate:196 position:1229 position:1202 position:123 target:16747 target_cate:179 label:0 +history:7014 
history:29912 history:42468 cate:194 cate:612 cate:558 position:2424 position:0 position:0 target:20800 target_cate:355 label:0 +history:8320 history:9743 history:1735 history:442 history:5216 history:11568 cate:234 cate:251 cate:241 cate:603 cate:476 cate:649 position:211 position:70 position:61 position:34 position:34 position:27 target:32738 target_cate:153 label:0 +history:533 history:1447 cate:744 cate:744 position:664 position:337 target:17843 target_cate:744 label:1 +history:48390 history:48191 cate:714 cate:714 position:137 position:92 target:48864 target_cate:708 label:1 +history:9312 history:16166 history:12754 history:21433 history:28142 history:7486 cate:215 cate:674 cate:241 cate:115 cate:558 cate:241 position:1910 position:1045 position:414 position:371 position:371 position:347 target:38629 target_cate:48 label:1 +history:10401 history:11665 history:10739 cate:142 cate:364 cate:766 position:363 position:217 position:48 target:5989 target_cate:463 label:0 +history:10408 history:14363 history:8807 history:14947 history:24701 history:44676 history:40914 history:12241 history:14906 history:29247 history:32347 history:5834 history:18291 history:18313 history:23375 history:24075 history:7020 history:14307 history:15891 cate:140 cate:140 cate:749 cate:281 cate:444 cate:388 cate:504 cate:385 cate:196 cate:339 cate:746 cate:351 cate:463 cate:746 cate:197 cate:90 cate:746 cate:576 cate:476 position:1338 position:1336 position:1305 position:1267 position:835 position:88 position:87 position:86 position:86 position:86 position:86 position:84 position:84 position:84 position:84 position:84 position:83 position:83 position:83 target:37949 target_cate:330 label:1 +history:50194 cate:444 position:243 target:15572 target_cate:216 label:0 +history:24021 cate:281 position:718 target:25850 target_cate:140 label:1 +history:22185 history:28726 history:55777 cate:142 cate:766 cate:351 position:923 position:923 position:133 target:17 target_cate:541 label:1 +history:31776 
history:34767 history:28854 history:34769 history:38022 history:38667 history:32917 history:9094 history:40879 history:41634 history:42252 history:19865 history:47983 history:38818 history:40131 history:40690 history:18915 history:48539 history:49619 history:18554 history:24836 cate:70 cate:239 cate:113 cate:48 cate:486 cate:541 cate:352 cate:197 cate:347 cate:385 cate:34 cate:476 cate:704 cate:388 cate:385 cate:281 cate:225 cate:474 cate:157 cate:706 cate:53 position:490 position:490 position:473 position:360 position:360 position:360 position:209 position:199 position:199 position:199 position:199 position:198 position:198 position:196 position:196 position:174 position:93 position:36 position:36 position:0 position:0 target:25602 target_cate:707 label:1 +history:10544 history:15159 history:23606 history:33556 history:46886 history:55061 history:2079 history:27022 history:40345 history:43556 history:3807 history:28732 cate:642 cate:87 cate:641 cate:113 cate:558 cate:157 cate:564 cate:44 cate:194 cate:26 cate:54 cate:113 position:844 position:362 position:362 position:362 position:362 position:362 position:205 position:205 position:205 position:205 position:0 position:0 target:51293 target_cate:272 label:0 +history:19005 history:41469 history:42368 history:5739 history:30169 history:32266 history:54743 history:56959 history:26271 cate:145 cate:482 cate:707 cate:790 cate:101 cate:347 cate:197 cate:368 cate:674 position:365 position:365 position:365 position:258 position:258 position:258 position:258 position:258 position:0 target:5602 target_cate:158 label:0 +history:7166 history:16886 history:21083 history:7328 history:25545 cate:560 cate:213 cate:87 cate:744 cate:87 position:474 position:474 position:474 position:214 position:214 target:32494 target_cate:321 label:1 +history:2306 cate:260 position:51 target:30286 target_cate:179 label:0 +history:57709 history:55115 cate:351 cate:483 position:99 position:50 target:25035 target_cate:142 label:0 +history:16641 
history:35845 cate:153 cate:311 position:0 position:0 target:36985 target_cate:68 label:1 +history:31144 history:4107 cate:189 cate:168 position:1179 position:0 target:50619 target_cate:142 label:0 +history:36331 history:9873 history:10659 history:14382 history:21430 history:28164 cate:680 cate:197 cate:185 cate:11 cate:115 cate:476 position:278 position:0 position:0 position:0 position:0 position:0 target:37887 target_cate:484 label:1 +history:19519 history:3748 history:33772 history:22436 history:38789 history:46337 cate:649 cate:351 cate:210 cate:115 cate:113 cate:115 position:1038 position:517 position:470 position:349 position:150 position:37 target:23980 target_cate:649 label:1 +history:30789 history:37586 history:42354 history:26171 history:15017 history:28654 history:44960 cate:142 cate:714 cate:142 cate:483 cate:484 cate:474 cate:157 position:158 position:158 position:146 position:36 position:26 position:26 position:26 target:41552 target_cate:746 label:1 +history:52662 cate:576 position:0 target:53627 target_cate:776 label:0 +history:12258 history:15133 history:15681 history:5066 history:6420 history:13421 history:6577 history:29202 history:38939 cate:216 cate:558 cate:111 cate:570 cate:447 cate:5 cate:111 cate:281 cate:347 position:1544 position:1359 position:1312 position:743 position:743 position:636 position:560 position:103 position:24 target:7818 target_cate:558 label:0 +history:610 history:1258 history:2332 history:7508 history:10814 history:10797 history:11710 cate:543 cate:611 cate:611 cate:653 cate:110 cate:201 cate:179 position:2452 position:1361 position:935 position:669 position:524 position:55 position:45 target:11495 target_cate:558 label:1 +history:12584 history:2707 history:1664 history:25878 history:25949 cate:790 cate:694 cate:694 cate:142 cate:611 position:768 position:729 position:625 position:236 position:7 target:25286 target_cate:792 label:1 +history:32423 history:24223 cate:135 cate:90 position:421 position:76 target:2323 
target_cate:399 label:0 +history:11959 cate:197 position:0 target:15349 target_cate:351 label:1 +history:44448 history:58138 history:41930 history:57603 history:59009 history:61316 history:61559 history:599 cate:339 cate:629 cate:115 cate:388 cate:1 cate:142 cate:714 cate:297 position:320 position:97 position:23 position:23 position:23 position:23 position:23 position:0 target:54434 target_cate:142 label:0 +history:43441 history:12617 history:47970 history:52144 cate:396 cate:196 cate:142 cate:629 position:213 position:208 position:208 position:208 target:29211 target_cate:351 label:1 +history:25327 history:40258 cate:656 cate:398 position:676 position:3 target:40261 target_cate:142 label:1 +history:4637 cate:474 position:62 target:59864 target_cate:687 label:0 diff --git a/models/rank/BST/model.py b/models/rank/BST/model.py new file mode 100755 index 0000000000000000000000000000000000000000..101cb79270115e543bed3b7d7de06a0f150185dc --- /dev/null +++ b/models/rank/BST/model.py @@ -0,0 +1,347 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from functools import partial + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +from paddlerec.core.utils import envs +from paddlerec.core.model import ModelBase + + +def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): + """ + Position-wise Feed-Forward Networks. 
+
+    This module consists of two linear transformations with a ReLU activation
+    in between, which is applied to each position separately and identically.
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act="relu")
+    if dropout_rate:
+        hidden = layers.dropout(
+            hidden,
+            dropout_prob=dropout_rate,
+            seed=None,
+            is_test=False)
+    out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2)
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
+    """
+    Add residual connection, layer normalization and dropout to the out tensor
+    optionally according to the value of process_cmd.
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            out = out + prev_out if prev_out else out
+        elif cmd == "n":  # add layer normalization
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.initializer.Constant(1.),
+                bias_attr=fluid.initializer.Constant(0.))
+        elif cmd == "d":  # add dropout
+            if dropout_rate:
+                out = layers.dropout(
+                    out,
+                    dropout_prob=dropout_rate,
+                    seed=None,
+                    is_test=False)
+    return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.item_emb_size = envs.get_global_env(
+            "hyper_parameters.item_emb_size", 64)
+        self.cat_emb_size = envs.get_global_env(
+            "hyper_parameters.cat_emb_size", 64)
+        self.position_emb_size = envs.get_global_env(
+            "hyper_parameters.position_emb_size", 64)
+        self.act = envs.get_global_env("hyper_parameters.act", "sigmoid")
+        self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
+                                             False)
+        # significant for speeding up the training process
+        self.use_DataLoader = envs.get_global_env(
+            
"hyper_parameters.use_DataLoader", False) + self.item_count = envs.get_global_env("hyper_parameters.item_count", + 63001) + self.cat_count = envs.get_global_env("hyper_parameters.cat_count", 801) + self.position_count = envs.get_global_env( + "hyper_parameters.position_count", 5001) + self.n_encoder_layers = envs.get_global_env( + "hyper_parameters.n_encoder_layers", 1) + self.d_model = envs.get_global_env("hyper_parameters.d_model", 96) + self.d_key = envs.get_global_env("hyper_parameters.d_key", None) + self.d_value = envs.get_global_env("hyper_parameters.d_value", None) + self.n_head = envs.get_global_env("hyper_parameters.n_head", None) + self.dropout_rate = envs.get_global_env( + "hyper_parameters.dropout_rate", 0.0) + self.postprocess_cmd = envs.get_global_env( + "hyper_parameters.postprocess_cmd", "da") + self.preprocess_cmd = envs.get_global_env( + "hyper_parameters.postprocess_cmd", "n") + self.prepostprocess_dropout = envs.get_global_env( + "hyper_parameters.prepostprocess_dropout", 0.0) + self.d_inner_hid = envs.get_global_env("hyper_parameters.d_inner_hid", + 512) + self.relu_dropout = envs.get_global_env( + "hyper_parameters.relu_dropout", 0.0) + self.layer_sizes = envs.get_global_env("hyper_parameters.fc_sizes", + None) + + def multi_head_attention(self, queries, keys, values, d_key, d_value, + d_model, n_head, dropout_rate): + keys = queries if keys is None else keys + values = keys if values is None else values + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3 + ): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. 
+ """ + q = fluid.layers.fc(input=queries, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + k = fluid.layers.fc(input=keys, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + v = fluid.layers.fc(input=values, + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Reshape input tensors at the last dimension to split multi-heads + and then transpose. Specifically, transform the input tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] to the output tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped_q = fluid.layers.reshape( + x=queries, shape=[0, 0, n_head, d_key], inplace=True) + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. 
+ reshaped_k = fluid.layers.reshape( + x=keys, shape=[0, 0, n_head, d_key], inplace=True) + k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = fluid.layers.reshape( + x=values, shape=[0, 0, n_head, d_value], inplace=True) + v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) + + return q, k, v + + def scaled_dot_product_attention(q, k, v, d_key, dropout_rate): + """ + Scaled Dot-Product Attention + """ + product = fluid.layers.matmul( + x=q, y=k, transpose_y=True, alpha=d_key**-0.5) + + weights = fluid.layers.softmax(product) + if dropout_rate: + weights = fluid.layers.dropout( + weights, + dropout_prob=dropout_rate, + seed=None, + is_test=False) + out = fluid.layers.matmul(weights, v) + return out + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = fluid.layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+            return fluid.layers.reshape(
+                x=trans_x,
+                shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+                inplace=True)
+
+        q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+        q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value)
+
+        # NOTE(review): d_model is passed as the scaling dimension here
+        # (softmax(QK^T * d_model**-0.5)); the canonical transformer scales
+        # by d_key -- confirm this is intentional before changing.
+        ctx_multiheads = scaled_dot_product_attention(q, k, v, d_model,
+                                                      dropout_rate)
+
+        out = __combine_heads(ctx_multiheads)
+
+        proj_out = fluid.layers.fc(input=out,
+                                   size=d_model,
+                                   bias_attr=False,
+                                   num_flatten_dims=2)
+
+        return proj_out
+
+    def encoder_layer(self, x):
+        attention_out = self.multi_head_attention(
+            pre_process_layer(x, self.preprocess_cmd,
+                              self.prepostprocess_dropout), None, None,
+            self.d_key, self.d_value, self.d_model, self.n_head,
+            self.dropout_rate)
+        attn_output = post_process_layer(x, attention_out,
+                                         self.postprocess_cmd,
+                                         self.prepostprocess_dropout)
+        ffd_output = positionwise_feed_forward(
+            pre_process_layer(attn_output, self.preprocess_cmd,
+                              self.prepostprocess_dropout), self.d_inner_hid,
+            self.d_model, self.relu_dropout)
+        return post_process_layer(attn_output, ffd_output,
+                                  self.postprocess_cmd,
+                                  self.prepostprocess_dropout)
+
+    def net(self, inputs, is_infer=False):
+
+        init_value_ = 0.1
+
+        hist_item_seq = self._sparse_data_var[1]
+        hist_cat_seq = self._sparse_data_var[2]
+        position_seq = self._sparse_data_var[3]
+        target_item = self._sparse_data_var[4]
+        target_cat = self._sparse_data_var[5]
+        target_position = self._sparse_data_var[6]
+        self.label = self._sparse_data_var[0]
+
+        item_emb_attr = fluid.ParamAttr(name="item_emb")
+        cat_emb_attr = fluid.ParamAttr(name="cat_emb")
+        position_emb_attr = fluid.ParamAttr(name="position_emb")
+
+        hist_item_emb = fluid.embedding(
+            input=hist_item_seq,
+            size=[self.item_count, self.item_emb_size],
+            param_attr=item_emb_attr,
+            is_sparse=self.is_sparse)
+
+        hist_cat_emb = fluid.embedding(
+            input=hist_cat_seq,
+            size=[self.cat_count, self.cat_emb_size],
+            param_attr=cat_emb_attr,
+            is_sparse=self.is_sparse)
+
+        # NOTE(fix): this lookup previously used `hist_cat_seq`, so
+        # `position_seq` was never consumed and the "position" embedding was
+        # fed category ids (which can also exceed position_count rows).
+        hist_position_emb = fluid.embedding(
+            input=position_seq,
+            size=[self.position_count, self.position_emb_size],
+            param_attr=position_emb_attr,
+            is_sparse=self.is_sparse)
+
+        target_item_emb = fluid.embedding(
+            input=target_item,
+            size=[self.item_count, self.item_emb_size],
+            param_attr=item_emb_attr,
+            is_sparse=self.is_sparse)
+
+        target_cat_emb = fluid.embedding(
+            input=target_cat,
+            size=[self.cat_count, self.cat_emb_size],
+            param_attr=cat_emb_attr,
+            is_sparse=self.is_sparse)
+
+        target_position_emb = fluid.embedding(
+            input=target_position,
+            size=[self.position_count, self.position_emb_size],
+            param_attr=position_emb_attr,
+            is_sparse=self.is_sparse)
+
+        item_sequence_target = fluid.layers.reduce_sum(
+            fluid.layers.sequence_concat([hist_item_emb, target_item_emb]),
+            dim=1)
+        cat_sequence_target = fluid.layers.reduce_sum(
+            fluid.layers.sequence_concat([hist_cat_emb, target_cat_emb]),
+            dim=1)
+        position_sequence_target = fluid.layers.reduce_sum(
+            fluid.layers.sequence_concat(
+                [hist_position_emb, target_position_emb]),
+            dim=1)
+
+        whole_embedding_withlod = fluid.layers.concat(
+            [
+                item_sequence_target, cat_sequence_target,
+                position_sequence_target
+            ],
+            axis=1)
+        pad_value = fluid.layers.assign(input=np.array(
+            [0.0], dtype=np.float32))
+        whole_embedding, _ = fluid.layers.sequence_pad(whole_embedding_withlod,
+                                                       pad_value)
+
+        # NOTE(fix): the loop previously re-encoded `whole_embedding` on every
+        # iteration (`enc_input` was assigned but never read), so setting
+        # n_encoder_layers > 1 did not actually stack encoder layers.
+        enc_input = whole_embedding
+        for _ in range(self.n_encoder_layers):
+            enc_output = self.encoder_layer(enc_input)
+            enc_input = enc_output
+        enc_output = pre_process_layer(enc_output, self.preprocess_cmd,
+                                       self.prepostprocess_dropout)
+
+        dnn_input = fluid.layers.reduce_sum(enc_output, dim=1)
+
+        for s in self.layer_sizes:
+            dnn_input = fluid.layers.fc(
+                input=dnn_input,
+                size=s,
+                act=self.act,
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.TruncatedNormalInitializer(
+                        loc=0.0, scale=init_value_ / math.sqrt(float(10)))),
+                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.TruncatedNormalInitializer(
+                        loc=0.0, scale=init_value_)))
+
+ y_dnn = fluid.layers.fc(input=dnn_input, size=1, act=None) + + self.predict = fluid.layers.sigmoid(y_dnn) + cost = fluid.layers.log_loss( + input=self.predict, label=fluid.layers.cast(self.label, "float32")) + avg_cost = fluid.layers.reduce_sum(cost) + + self._cost = avg_cost + + predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) + label_int = fluid.layers.cast(self.label, 'int64') + auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, + label=label_int, + slide_steps=0) + self._metrics["AUC"] = auc_var + self._metrics["BATCH_AUC"] = batch_auc_var + if is_infer: + self._infer_results["AUC"] = auc_var diff --git a/models/rank/dnn/backend.yaml b/models/rank/dnn/backend.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03b5efe7847ddb4a6cabf0f817a58f686e12fad1 --- /dev/null +++ b/models/rank/dnn/backend.yaml @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+backend: "PaddleCloud" +cluster_type: k8s # mpi 可选 + +config: + fs_name: "afs://xxx.com" + fs_ugi: "usr,pwd" + output_path: "" # 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + + # for mpi + train_data_path: "" # 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + test_data_path: "" # 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + thirdparty_path: "" # 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + paddle_version: "1.7.2" # 填写paddle官方版本号 >= 1.7.2 + + # for k8s + afs_remote_mount_point: "" # 填远程地址,如afs:/user/your/path/ 则此处填 /user/your/path + + # paddle分布式底层超参,无特殊需求不理不改 + communicator: + FLAGS_communicator_is_sgd_optimizer: 0 + FLAGS_communicator_send_queue_size: 5 + FLAGS_communicator_thread_pool_size: 32 + FLAGS_communicator_max_merge_var_num: 5 + FLAGS_communicator_max_send_grad_num_before_recv: 5 + FLAGS_communicator_fake_rpc: 0 + FLAGS_rpc_retry_times: 3 + +submit: + ak: "" + sk: "" + priority: "high" + job_name: "PaddleRec_CTR" + group: "" + start_cmd: "python -m paddlerec.run -m ./config.yaml" + files: ./*.py ./*.yaml + + # for mpi ps-cpu + nodes: 2 + + # for k8s gpu + k8s_trainers: 2 + k8s_cpu_cores: 2 + k8s_gpu_card: 1 + + # for k8s ps-cpu + k8s_trainers: 2 + k8s_cpu_cores: 4 + k8s_ps_num: 2 + k8s_ps_cores: 4 + diff --git a/models/rank/dnn/config.yaml b/models/rank/dnn/config.yaml index a50329705b4c8a2f6ad5327eff587f5953cc5352..38166a55e3bf61ac91af372149be1a07a32ff43a 100755 --- a/models/rank/dnn/config.yaml +++ b/models/rank/dnn/config.yaml @@ -80,6 +80,28 @@ runner: init_model_path: "increment_dnn" # load model path phases: [phase2] +- name: ps_cluster + class: cluster_train + epochs: 2 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 # save model interval of epochs + save_checkpoint_path: "increment_dnn" # save checkpoint path + init_model_path: "" # load model path + print_interval: 1 + phases: [phase1] + +- name: collective_cluster + class: cluster_train + epochs: 2 + device: gpu + fleet_mode: collective + 
save_checkpoint_interval: 1 # save model interval of epochs + save_checkpoint_path: "increment_dnn" # save checkpoint path + init_model_path: "" # load model path + print_interval: 1 + phases: [phase1] + # runner will run all the phase in each epoch phase: - name: phase1 diff --git a/models/rank/fibinet/config.yaml b/models/rank/fibinet/config.yaml index eed0fbe888302298c66128af755fea37a9eb62bf..091915e6a41ec56824557426553c0d062d26127f 100644 --- a/models/rank/fibinet/config.yaml +++ b/models/rank/fibinet/config.yaml @@ -59,8 +59,8 @@ runner: device: cpu save_checkpoint_interval: 2 # save model interval of epochs save_inference_interval: 4 # save inference - save_checkpoint_path: "increment_model" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_model_fibinet" # save checkpoint path + save_inference_path: "inference_fibinet" # save inference path save_inference_feed_varnames: [] # feed vars of save inference save_inference_fetch_varnames: [] # fetch vars of save inference init_model_path: "" # load model path @@ -75,8 +75,8 @@ runner: device: gpu save_checkpoint_interval: 1 # save model interval of epochs save_inference_interval: 4 # save inference - save_checkpoint_path: "increment_model" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_model_fibinet" # save checkpoint path + save_inference_path: "inference_fibinet" # save inference path save_inference_feed_varnames: [] # feed vars of save inference save_inference_fetch_varnames: [] # fetch vars of save inference init_model_path: "" # load model path @@ -87,14 +87,14 @@ runner: class: infer # device to run training or infer device: cpu - init_model_path: "increment_model" # load model path + init_model_path: "increment_model_fibinet" # load model path phases: [phase2] - name: single_gpu_infer class: infer # device to run training or infer device: gpu - init_model_path: 
"increment_model" # load model path + init_model_path: "increment_model_fibinet" # load model path phases: [phase2] # runner will run all the phase in each epoch diff --git a/models/rank/flen/README.md b/models/rank/flen/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9dafeac6958ffb4f51c8f54527976fc4d431bf71 --- /dev/null +++ b/models/rank/flen/README.md @@ -0,0 +1,130 @@ +# FLEN + + 以下是本例的简要目录结构及说明: + +``` +├── data #样例数据 + ├── sample_data + ├── train + ├── sample_train.txt + ├── run.sh + ├── get_slot_data.py +├── __init__.py +├── README.md # 文档 +├── model.py #模型文件 +├── config.yaml #配置文件 +``` + +## 简介 + +[《FLEN: Leveraging Field for Scalable CTR Prediction》](https://arxiv.org/pdf/1911.04690.pdf)文章提出了field-wise bi-interaction pooling技术,解决了在大规模应用特征field信息时存在的时间复杂度和空间复杂度高的困境,同时提出了一种缓解梯度耦合问题的方法dicefactor。该模型已应用于美图的大规模推荐系统中,持续稳定地取得业务效果的全面提升。 + +本项目在avazu数据集上验证模型效果 + +## 数据下载及预处理 + +## 环境 + +PaddlePaddle 1.7.2 + +python3.7 + +PaddleRec + +## 单机训练 + +CPU环境 + +在config.yaml文件中设置好设备,epochs等。 + +``` +# select runner by name +mode: [single_cpu_train, single_cpu_infer] +# config of each runner. +# runner is a kind of paddle training class, which wraps the train/infer process. 
+runner: +- name: single_cpu_train + class: train + # num of epochs + epochs: 4 + # device to run training or infer + device: cpu + save_checkpoint_interval: 2 # save model interval of epochs + save_inference_interval: 4 # save inference + save_checkpoint_path: "increment_model" # save checkpoint path + save_inference_path: "inference" # save inference path + save_inference_feed_varnames: [] # feed vars of save inference + save_inference_fetch_varnames: [] # fetch vars of save inference + init_model_path: "" # load model path + print_interval: 10 + phases: [phase1] +``` + +## 单机预测 + +CPU环境 + +在config.yaml文件中设置好epochs、device等参数。 + +``` +- name: single_cpu_infer + class: infer + # num of epochs + epochs: 1 + # device to run training or infer + device: cpu #选择预测的设备 + init_model_path: "increment_dnn" # load model path + phases: [phase2] +``` + +## 运行 + +``` +python -m paddlerec.run -m paddlerec.models.rank.flen +``` + +## 模型效果 + +在样例数据上测试模型 + +训练: + +``` +0702 13:38:20.903220 7368 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 2 cards are used, so 2 programs are executed in parallel. +I0702 13:38:20.925912 7368 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True +I0702 13:38:20.933356 7368 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0 +batch: 2, AUC: [0.09090909 0. ], BATCH_AUC: [0.09090909 0. 
] +batch: 4, AUC: [0.31578947 0.29411765], BATCH_AUC: [0.31578947 0.29411765] +batch: 6, AUC: [0.41333333 0.33333333], BATCH_AUC: [0.41333333 0.33333333] +batch: 8, AUC: [0.4453125 0.44166667], BATCH_AUC: [0.4453125 0.44166667] +batch: 10, AUC: [0.39473684 0.38888889], BATCH_AUC: [0.44117647 0.41176471] +batch: 12, AUC: [0.41860465 0.45535714], BATCH_AUC: [0.5078125 0.54545455] +batch: 14, AUC: [0.43413729 0.42746615], BATCH_AUC: [0.56666667 0.56 ] +batch: 16, AUC: [0.46433566 0.47460087], BATCH_AUC: [0.53 0.59247649] +batch: 18, AUC: [0.44009217 0.44642857], BATCH_AUC: [0.46 0.47] +batch: 20, AUC: [0.42705314 0.43781095], BATCH_AUC: [0.45878136 0.4874552 ] +batch: 22, AUC: [0.45176471 0.46011281], BATCH_AUC: [0.48046875 0.45878136] +batch: 24, AUC: [0.48375 0.48910256], BATCH_AUC: [0.56630824 0.59856631] +epoch 0 done, use time: 0.21532440185546875 +PaddleRec Finish +``` + +预测 + +``` +PaddleRec: Runner single_cpu_infer Begin +Executor Mode: infer +processor_register begin +Running SingleInstance. +Running SingleNetwork. +QueueDataset can not support PY3, change to DataLoader +QueueDataset can not support PY3, change to DataLoader +Running SingleInferStartup. +Running SingleInferRunner. +load persistables from increment_model/0 +batch: 20, AUC: [0.49121353], BATCH_AUC: [0.66176471] +batch: 40, AUC: [0.51156463], BATCH_AUC: [0.55197133] +Infer phase2 of 0 done, use time: 0.3941819667816162 +PaddleRec Finish +``` + diff --git a/models/rank/flen/__init__.py b/models/rank/flen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/rank/flen/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/rank/flen/config.yaml b/models/rank/flen/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2dad399fd98a2a888fb2d3efbfa40f52f273de2 --- /dev/null +++ b/models/rank/flen/config.yaml @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# workspace +workspace: "paddlerec.models.rank.flen" + +# list of dataset +dataset: +- name: dataloader_train # name of dataset to distinguish different datasets + batch_size: 2 + type: QueueDataset + data_path: "{workspace}/data/sample_data/train" + sparse_slots: "click user_0 user_1 user_2 user_3 user_4 user_5 user_6 user_7 user_8 user_9 user_10 user_11 item_0 item_1 item_2 contex_0 contex_1 contex_2 contex_3 contex_4 contex_5" + dense_slots: "" +- name: dataset_infer # name + batch_size: 2 + type: QueueDataset + data_path: "{workspace}/data/sample_data/train" + sparse_slots: "click user_0 user_1 user_2 user_3 user_4 user_5 user_6 user_7 user_8 user_9 user_10 user_11 item_0 item_1 item_2 contex_0 contex_1 contex_2 contex_3 contex_4 contex_5" + dense_slots: "" + +# hyper parameters of user-defined network +hyper_parameters: + # optimizer config + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + # user-defined
+[FLEN](https://arxiv.org/pdf/1911.04690.pdf): + +
+ +
## 使用教程(快速开始) @@ -87,6 +94,7 @@ | Wide&Deep | 40 | 1 | 40 | | xDeepFM | 100 | 1 | 10 | | Fibinet | 1000 | 8 | 4 | +| Flen | 512 | 8 | 1 | ### 数据处理 参考每个模型目录数据下载&预处理脚本 @@ -127,6 +135,7 @@ python -m paddlerec.run -m ./config.yaml # 以DNN为例 | Census-income Data | Wide&Deep | 0.76195 | 0.90577 | -- | -- | | Amazon Product | DIN | 0.47005 | 0.86379 | -- | -- | | Criteo | Fibinet | -- | 0.86662 | -- | -- | +| Avazu | Flen | -- | -- | -- | -- | ## 分布式 diff --git a/run.py b/run.py index b9e15a50ea40393a1f49c1d1e1c876947bc1ef10..6340adfc1c6026d7c67f5576ba8d0230055ec19d 100755 --- a/run.py +++ b/run.py @@ -38,7 +38,7 @@ def engine_registry(): engines["TRANSPILER"]["TRAIN"] = single_train_engine engines["TRANSPILER"]["INFER"] = single_infer_engine engines["TRANSPILER"]["LOCAL_CLUSTER_TRAIN"] = local_cluster_engine - engines["TRANSPILER"]["CLUSTER"] = cluster_engine + engines["TRANSPILER"]["CLUSTER_TRAIN"] = cluster_engine engines["PSLIB"]["TRAIN"] = local_mpi_engine engines["PSLIB"]["LOCAL_CLUSTER_TRAIN"] = local_mpi_engine engines["PSLIB"]["CLUSTER_TRAIN"] = cluster_mpi_engine @@ -111,8 +111,8 @@ def get_engine(args, running_config, mode): engine = running_config.get(engine_class, None) if engine is None: - raise ValueError("not find {} in yaml, please check".format( - mode, engine_class)) + raise ValueError("not find {} in engine_class , please check".format( + engine)) device = running_config.get(engine_device, None) engine = engine.upper() @@ -262,15 +262,48 @@ def single_infer_engine(args): def cluster_engine(args): def master(): from paddlerec.core.engine.cluster.cluster import ClusterEngine - _envs = envs.load_yaml(args.backend) - flattens = envs.flatten_environs(_envs, "_") + + # Get fleet_mode & device + run_extras = get_all_inters_from_yaml(args.model, ["runner."]) + mode = envs.get_runtime_environ("mode") + fleet_class = ".".join(["runner", mode, "fleet_mode"]) + device_class = ".".join(["runner", mode, "device"]) + fleet_mode = run_extras.get(fleet_class, "ps") + 
device = run_extras.get(device_class, "cpu") + device = device.upper() + fleet_mode = fleet_mode.upper() + + if fleet_mode == "COLLECTIVE" and device != "GPU": + raise ValueError("COLLECTIVE can not be used without GPU") + + # Get Thread nums + model_envs = envs.load_yaml(args.model) + phases_class = ".".join(["runner", mode, "phases"]) + phase_names = run_extras.get(phases_class) + phases = [] + all_phases = model_envs.get("phase") + if phase_names is None: + phases = all_phases + else: + for phase in all_phases: + if phase["name"] in phase_names: + phases.append(phase) + + thread_num = [] + for phase in phases: + thread_num.append(int(phase["thread_num"])) + max_thread_num = max(thread_num) + + backend_envs = envs.load_yaml(args.backend) + flattens = envs.flatten_environs(backend_envs, "_") flattens["engine_role"] = "MASTER" flattens["engine_mode"] = envs.get_runtime_environ("mode") flattens["engine_run_config"] = args.model - flattens["engine_temp_path"] = tempfile.mkdtemp() + flattens["max_thread_num"] = max_thread_num + flattens["fleet_mode"] = fleet_mode + flattens["device"] = device + flattens["backend_yaml"] = args.backend envs.set_runtime_environs(flattens) - ClusterEngine.workspace_replace() - print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value"))) launch = ClusterEngine(None, args.model) return launch @@ -278,40 +311,29 @@ def cluster_engine(args): def worker(mode): if not mode: raise ValueError("mode: {} can not be recognized") + from paddlerec.core.engine.cluster.cluster import ClusterEngine run_extras = get_all_inters_from_yaml(args.model, ["runner."]) trainer_class = ".".join(["runner", mode, "trainer_class"]) fleet_class = ".".join(["runner", mode, "fleet_mode"]) device_class = ".".join(["runner", mode, "device"]) - selected_gpus_class = ".".join(["runner", mode, "selected_gpus"]) strategy_class = ".".join(["runner", mode, "distribute_strategy"]) - worker_class = ".".join(["runner", mode, "worker_num"]) - server_class = ".".join(["runner", 
mode, "server_num"]) - trainer = run_extras.get(trainer_class, "GeneralTrainer") fleet_mode = run_extras.get(fleet_class, "ps") device = run_extras.get(device_class, "cpu") - selected_gpus = run_extras.get(selected_gpus_class, "0") distributed_strategy = run_extras.get(strategy_class, "async") - worker_num = run_extras.get(worker_class, 1) - server_num = run_extras.get(server_class, 1) executor_mode = "train" device = device.upper() fleet_mode = fleet_mode.upper() - if fleet_mode == "COLLECTIVE" and device != "GPU": - raise ValueError("COLLECTIVE can not be used with GPU") + raise ValueError("COLLECTIVE can not be used without GPU") cluster_envs = {} - if device == "GPU": - cluster_envs["selected_gpus"] = selected_gpus - - cluster_envs["server_num"] = server_num - cluster_envs["worker_num"] = worker_num cluster_envs["fleet_mode"] = fleet_mode + cluster_envs["engine_role"] = "WORKER" cluster_envs["train.trainer.trainer"] = trainer cluster_envs["train.trainer.engine"] = "cluster" cluster_envs["train.trainer.executor_mode"] = executor_mode @@ -321,15 +343,15 @@ def cluster_engine(args): cluster_envs["train.trainer.platform"] = envs.get_platform() print("launch {} engine with cluster to with model: {}".format( trainer, args.model)) - set_runtime_envs(cluster_envs, args.model) - trainer = TrainerFactory.create(args.model) - return trainer + set_runtime_envs(cluster_envs, args.model) + launch = ClusterEngine(None, args.model) + return launch role = os.getenv("PADDLE_PADDLEREC_ROLE", "MASTER") if role == "WORKER": - mode = os.getenv("PADDLE_PADDLEREC_MODE", None) + mode = os.getenv("mode", None) return worker(mode) else: return master() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..0f7efd39336b4bf0443da4a8c89b7860ad23efd3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[easy_install] +index_url=http://pip.baidu.com/pypi/simple \ No newline at end of file diff --git a/setup.py b/setup.py index 
2133030a60ed6cc8867fb041243fb29aabe1c6c5..db77dc97be184d9834d7d5d09a71a83b3e28b1b7 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ -# coding=utf8 +# -*- coding: utf-8 -*- + # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -69,7 +70,7 @@ def build(dirname): 'Criteo_data/sample_data/train/*' ] - engine_copy = ['*/*.sh'] + engine_copy = ['*/*.sh', '*/*.template'] for package in packages: if package.startswith("paddlerec.models."): package_data[package] = models_copy