diff --git a/fluid/PaddleRec/din/README.md b/fluid/PaddleRec/din/README.md index 3a1285b420c91e3e9aa00ce56cad23496276bb81..3538ba760ff9b80807a6a56aed4b75400c97ae03 100644 --- a/fluid/PaddleRec/din/README.md +++ b/fluid/PaddleRec/din/README.md @@ -36,6 +36,10 @@ DIN通过一个兴趣激活模块(Activation Unit),用预估目标Candidate AD ``` cd data && sh data_process.sh && cd .. ``` +如果执行过程中遇到找不到某个包(例如pandas包)的报错,使用如下命令安装对应的包即可。 +``` +pip install pandas +``` * Step 2: 产生训练集、测试集和config文件 ``` @@ -103,6 +107,12 @@ model saved in din_amazon/global_step_50000 ... ``` +提示: + +* 在单机条件下,使用代码中默认的超参数运行时,产生最优auc的global step大致在440000到500000之间 + +* 训练超出一定的epoch后会稍稍出现过拟合 + ## 预测 参考如下命令,开始预测. diff --git a/fluid/PaddleRec/din/data/build_dataset.py b/fluid/PaddleRec/din/data/build_dataset.py index d2d93cbf73c5e25ef527c54df7b8b95367edd484..34c053ccdb2686c10875740f72f1e0abf3cb4f10 100644 --- a/fluid/PaddleRec/din/data/build_dataset.py +++ b/fluid/PaddleRec/din/data/build_dataset.py @@ -1,6 +1,6 @@ +from __future__ import print_function import random import pickle -from __future__ import print_function random.seed(1234) diff --git a/fluid/PaddleRec/din/data/convert_pd.py b/fluid/PaddleRec/din/data/convert_pd.py index e019a42a27a03a615cd7c7c8256c6dd864a50649..d7927c7ef1a9da28732cad9c44be24e72095983a 100644 --- a/fluid/PaddleRec/din/data/convert_pd.py +++ b/fluid/PaddleRec/din/data/convert_pd.py @@ -1,3 +1,4 @@ +from __future__ import print_function import pickle import pandas as pd @@ -13,10 +14,12 @@ def to_df(file_path): return df +print("start to analyse reviews_Electronics_5.json") reviews_df = to_df('./raw_data/reviews_Electronics_5.json') with open('./raw_data/reviews.pkl', 'wb') as f: pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL) +print("start to analyse meta_Electronics.json") meta_df = to_df('./raw_data/meta_Electronics.json') meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())] meta_df = meta_df.reset_index(drop=True) diff --git a/fluid/PaddleRec/din/data/remap_id.py b/fluid/PaddleRec/din/data/remap_id.py index bb6d36b7b5a48370d4733c4007442cef28b7274f..b110dac54de8f8d201ede7248d6a2844ac350c90 100644 --- a/fluid/PaddleRec/din/data/remap_id.py +++ b/fluid/PaddleRec/din/data/remap_id.py @@ -1,7 +1,7 @@ +from __future__ import print_function import random import pickle import numpy as np -from __future__ import print_function random.seed(1234) diff --git a/fluid/PaddleRec/din/train.py b/fluid/PaddleRec/din/train.py index a8c2b752a0e8657482c318f39c3e318e145efb94..db3ac0b178fd77db88ee12af6faaeed742ad2bcf 100644 --- a/fluid/PaddleRec/din/train.py +++ b/fluid/PaddleRec/din/train.py @@ -121,7 +121,7 @@ def train(): loss_sum = 0.0 if (global_step > 400000 and global_step % PRINT_STEP == 0) or ( - global_step < 400000 and global_step % 50000 == 0): + global_step <= 400000 and global_step % 50000 == 0): save_dir = args.model_dir + "/global_step_" + str( global_step) feed_var_name = [