diff --git a/models/recall/gnn/config.yaml b/models/recall/gnn/config.yaml
index b488fc656afb4eb548b4da7b013cd0b44a4eab04..ed290b2f81e530def392b2e851a81e1ff74cb8a2 100755
--- a/models/recall/gnn/config.yaml
+++ b/models/recall/gnn/config.yaml
@@ -49,31 +49,31 @@ runner:
 - name: train_runner
   class: train
   # num of epochs
-  epochs: 5
+  epochs: 2
   # device to run training or infer
   device: cpu
   save_checkpoint_interval: 1 # save model interval of epochs
   save_inference_interval: 1 # save inference
-  save_checkpoint_path: "increment_gnn" # save checkpoint path
-  save_inference_path: "inference_gnn" # save inference path
+  save_checkpoint_path: "increment" # save checkpoint path
+  save_inference_path: "inference" # save inference path
   save_inference_feed_varnames: [] # feed vars of save inference
   save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
-  print_interval: 10
+  print_interval: 1
 - name: infer_runner
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment_gnn" # load model path
+  init_model_path: "increment/0" # load model path
 
 # runner will run all the phase in each epoch
 phase:
-- name: phase_train
+- name: phase1
   model: "{workspace}/model.py" # user-defined model
   dataset_name: dataset_train # select dataset by name
   thread_num: 1
-# - name: phase_infer
+# - name: phase2
 #   model: "{workspace}/model.py" # user-defined model
 #   dataset_name: dataset_infer # select dataset by name
 #   thread_num: 1
diff --git a/models/recall/gnn/data/download.py b/models/recall/gnn/data/download.py
index 05fe898b4802257dc93f251cb33c6724aa790f6b..9bebdf1b37e2cd45369c14bb7446c206de8017a0 100644
--- a/models/recall/gnn/data/download.py
+++ b/models/recall/gnn/data/download.py
@@ -57,10 +57,5 @@ def _download_file(url, savepath, print_progress):
         progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True)
 
 
-if sys.argv[1] == "diginetica":
-    _download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv",
-                   "./train-item-views.csv", True)
-elif sys.argv[1] == "yoochoose":
-    _download_file(
-        "https://paddlerec.bj.bcebos.com/gnn%2Fyoochoose-clicks.dat",
-        "./yoochoose-clicks.dat", True)
+_download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv",
+               "./train-item-views.csv", True)
diff --git a/models/recall/gnn/data/preprocess.py b/models/recall/gnn/data/preprocess.py
index 56a9dbff507d68d19a03ac6bf0cd80af31f89f32..3e7f710b221d708183c2f85d2743162c44b863da 100644
--- a/models/recall/gnn/data/preprocess.py
+++ b/models/recall/gnn/data/preprocess.py
@@ -41,29 +41,39 @@ with open(dataset, "r") as f:
     curdate = None
     for data in reader:
         sessid = data['session_id']
-        date = ''
+        if curdate and not curid == sessid:
+            date = ''
+            if opt.dataset == 'yoochoose':
+                date = time.mktime(
+                    time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
+            else:
+                date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
+            sess_date[curid] = date
+        curid = sessid
         if opt.dataset == 'yoochoose':
             item = data['item_id']
-            date = time.mktime(
-                time.strptime(data['timestamp'][:19], '%Y-%m-%dT%H:%M:%S'))
         else:
             item = data['item_id'], int(data['timeframe'])
-            date = time.mktime(time.strptime(data['eventdate'], '%Y-%m-%d'))
-
-        if sessid not in sess_date:
-            sess_date[sessid] = date
-        elif date > sess_date[sessid]:
-            sess_date[sessid] = date
+        curdate = ''
+        if opt.dataset == 'yoochoose':
+            curdate = data['timestamp']
+        else:
+            curdate = data['eventdate']
         if sessid in sess_clicks:
             sess_clicks[sessid] += [item]
         else:
             sess_clicks[sessid] = [item]
         ctr += 1
 
-    if opt.dataset != 'yoochoose':
+    date = ''
+    if opt.dataset == 'yoochoose':
+        date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
+    else:
+        date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
         for i in list(sess_clicks):
             sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1))
             sess_clicks[i] = [c[0] for c in sorted_clicks]
+    sess_date[curid] = date
 print("-- Reading data @ %ss" % datetime.datetime.now())
 
 # Filter out length 1 sessions
@@ -150,7 +160,7 @@ def obtian_tra():
         train_dates += [date]
         train_seqs += [outseq]
     print(item_ctr)  # 43098, 37484
-    with open("./config.txt", "w") as fout:
+    with open("./diginetica/config.txt", "w") as fout:
         fout.write(str(item_ctr) + "\n")
     return train_ids, train_dates, train_seqs
 
diff --git a/models/recall/gnn/data_prepare.sh b/models/recall/gnn/data_prepare.sh
index a97e57ab350d91dedf0bde623d1fd2b97908bf96..00a3dcebb01f33424ed9e9517967e5cb613bee81 100644
--- a/models/recall/gnn/data_prepare.sh
+++ b/models/recall/gnn/data_prepare.sh
@@ -15,31 +15,21 @@
 # limitations under the License.
 
 set -e
-
-dataset=$1
-src=$1
-
-if [[ $src == "yoochoose1_4" || $src == "yoochoose1_64" ]];then
-    src="yoochoose"
-elif [[ $src == "diginetica" ]];then
-    src="diginetica"
-else
-    echo "Usage: sh data_prepare.sh [diginetica|yoochoose1_4|yoochoose1_64]"
-    exit 1
-fi
-
 echo "begin to download data"
-cd data && python download.py $src
-mkdir $dataset
-python preprocess.py --dataset $src
+
+cd data && python download.py
+mkdir diginetica
+python preprocess.py --dataset diginetica
 
 echo "begin to convert data (binary -> txt)"
-python convert_data.py --data_dir $dataset
+python convert_data.py --data_dir diginetica
 
-cat ${dataset}/train.txt | wc -l >> config.txt
+cat diginetica/train.txt | wc -l >> diginetica/config.txt
 
 rm -rf train && mkdir train
-mv ${dataset}/train.txt train
+mv diginetica/train.txt train
 
 rm -rf test && mkdir test
-mv ${dataset}/test.txt test
+mv diginetica/test.txt test
+
+mv diginetica/config.txt ./config.txt
diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py
index 21b884215f80180cea5f3daf6fbf53a4187b1600..948324b484e3a25993bdc9aa4f858d5e0a0d10f9 100755
--- a/models/recall/gnn/model.py
+++ b/models/recall/gnn/model.py
@@ -20,7 +20,6 @@ import paddle.fluid.layers as layers
 
 from paddlerec.core.utils import envs
 from paddlerec.core.model import ModelBase
-from paddlerec.core.metrics import Precision
 
 
 class Model(ModelBase):
@@ -236,16 +235,16 @@ class Model(ModelBase):
         softmax = layers.softmax_with_cross_entropy(
             logits=logits, label=inputs[6])  # [batch_size, 1]
         self.loss = layers.reduce_mean(softmax)  # [1]
-        acc = Precision(input=logits, label=inputs[6], k=20)
-        self._cost = self.loss
+        self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)
+        self._cost = self.loss
         if is_infer:
-            self._infer_results['P@20'] = acc
-            self._infer_results['LOSS'] = self.loss
+            self._infer_results['acc'] = self.acc
+            self._infer_results['loss'] = self.loss
             return
 
         self._metrics["LOSS"] = self.loss
-        self._metrics["Train_P@20"] = acc
+        self._metrics["train_acc"] = self.acc
 
     def optimizer(self):
         step_per_epoch = self.corpus_size // self.train_batch_size