From 1630b6cd91b0eed0b2a29b7a35ae21ccaf2b0e7e Mon Sep 17 00:00:00 2001 From: malin10 Date: Tue, 21 Jul 2020 11:04:05 +0800 Subject: [PATCH] update gnn --- models/recall/gnn/config.yaml | 14 ++++++------ models/recall/gnn/data/download.py | 9 ++++++-- models/recall/gnn/data/preprocess.py | 32 ++++++++++------------------ models/recall/gnn/data_prepare.sh | 30 +++++++++++++++++--------- models/recall/gnn/model.py | 11 +++++----- 5 files changed, 51 insertions(+), 45 deletions(-) diff --git a/models/recall/gnn/config.yaml b/models/recall/gnn/config.yaml index ed290b2f..b488fc65 100755 --- a/models/recall/gnn/config.yaml +++ b/models/recall/gnn/config.yaml @@ -49,31 +49,31 @@ runner: - name: train_runner class: train # num of epochs - epochs: 2 + epochs: 5 # device to run training or infer device: cpu save_checkpoint_interval: 1 # save model interval of epochs save_inference_interval: 1 # save inference - save_checkpoint_path: "increment" # save checkpoint path - save_inference_path: "inference" # save inference path + save_checkpoint_path: "increment_gnn" # save checkpoint path + save_inference_path: "inference_gnn" # save inference path save_inference_feed_varnames: [] # feed vars of save inference save_inference_fetch_varnames: [] # fetch vars of save inference init_model_path: "" # load model path - print_interval: 1 + print_interval: 10 - name: infer_runner class: infer # device to run training or infer device: cpu print_interval: 1 - init_model_path: "increment/0" # load model path + init_model_path: "increment_gnn" # load model path # runner will run all the phase in each epoch phase: -- name: phase1 +- name: phase_train model: "{workspace}/model.py" # user-defined model dataset_name: dataset_train # select dataset by name thread_num: 1 -# - name: phase2 +# - name: phase_infer # model: "{workspace}/model.py" # user-defined model # dataset_name: dataset_infer # select dataset by name # thread_num: 1 diff --git a/models/recall/gnn/data/download.py b/models/recall/gnn/data/download.py index 9bebdf1b..05fe898b 100644 --- a/models/recall/gnn/data/download.py +++ b/models/recall/gnn/data/download.py @@ -57,5 +57,10 @@ def _download_file(url, savepath, print_progress): progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True) -_download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv", - "./train-item-views.csv", True) +if sys.argv[1] == "diginetica": + _download_file("https://sr-gnn.bj.bcebos.com/train-item-views.csv", + "./train-item-views.csv", True) +elif sys.argv[1] == "yoochoose": + _download_file( + "https://paddlerec.bj.bcebos.com/gnn%2Fyoochoose-clicks.dat", + "./yoochoose-clicks.dat", True) diff --git a/models/recall/gnn/data/preprocess.py b/models/recall/gnn/data/preprocess.py index 3e7f710b..56a9dbff 100644 --- a/models/recall/gnn/data/preprocess.py +++ b/models/recall/gnn/data/preprocess.py @@ -41,39 +41,29 @@ with open(dataset, "r") as f: curdate = None for data in reader: sessid = data['session_id'] - if curdate and not curid == sessid: - date = '' - if opt.dataset == 'yoochoose': - date = time.mktime( - time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S')) - else: - date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) - sess_date[curid] = date - curid = sessid + date = '' if opt.dataset == 'yoochoose': item = data['item_id'] + date = time.mktime( + time.strptime(data['timestamp'][:19], '%Y-%m-%dT%H:%M:%S')) else: item = data['item_id'], int(data['timeframe']) - curdate = '' - if opt.dataset == 'yoochoose': - curdate = data['timestamp'] - else: - curdate = data['eventdate'] + date = time.mktime(time.strptime(data['eventdate'], '%Y-%m-%d')) + + if sessid not in sess_date: + sess_date[sessid] = date + elif date > sess_date[sessid]: + sess_date[sessid] = date if sessid in sess_clicks: sess_clicks[sessid] += [item] else: sess_clicks[sessid] = [item] ctr += 1 - date = '' - if opt.dataset == 'yoochoose': - date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S')) - else: - date = time.mktime(time.strptime(curdate, '%Y-%m-%d')) + if opt.dataset != 'yoochoose': for i in list(sess_clicks): sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1)) sess_clicks[i] = [c[0] for c in sorted_clicks] - sess_date[curid] = date print("-- Reading data @ %ss" % datetime.datetime.now()) # Filter out length 1 sessions @@ -160,7 +150,7 @@ def obtian_tra(): train_dates += [date] train_seqs += [outseq] print(item_ctr) # 43098, 37484 - with open("./diginetica/config.txt", "w") as fout: + with open("./config.txt", "w") as fout: fout.write(str(item_ctr) + "\n") return train_ids, train_dates, train_seqs diff --git a/models/recall/gnn/data_prepare.sh b/models/recall/gnn/data_prepare.sh index 00a3dceb..a97e57ab 100644 --- a/models/recall/gnn/data_prepare.sh +++ b/models/recall/gnn/data_prepare.sh @@ -15,21 +15,31 @@ # limitations under the License. set -e -echo "begin to download data" -cd data && python download.py -mkdir diginetica -python preprocess.py --dataset diginetica +dataset=$1 +src=$1 + +if [[ $src == "yoochoose1_4" || $src == "yoochoose1_64" ]];then + src="yoochoose" +elif [[ $src == "diginetica" ]];then + src="diginetica" +else + echo "Usage: sh data_prepare.sh [diginetica|yoochoose1_4|yoochoose1_64]" + exit 1 +fi + +echo "begin to download data" +cd data && python download.py $src +mkdir $dataset +python preprocess.py --dataset $src echo "begin to convert data (binary -> txt)" -python convert_data.py --data_dir diginetica +python convert_data.py --data_dir $dataset -cat diginetica/train.txt | wc -l >> diginetica/config.txt +cat ${dataset}/train.txt | wc -l >> config.txt rm -rf train && mkdir train -mv diginetica/train.txt train +mv ${dataset}/train.txt train rm -rf test && mkdir test -mv diginetica/test.txt test - -mv diginetica/config.txt ./config.txt +mv ${dataset}/test.txt test diff --git a/models/recall/gnn/model.py b/models/recall/gnn/model.py index 948324b4..21b88421 100755 --- a/models/recall/gnn/model.py +++ b/models/recall/gnn/model.py @@ -20,6 +20,7 @@ import paddle.fluid.layers as layers from paddlerec.core.utils import envs from paddlerec.core.model import ModelBase +from paddlerec.core.metrics import Precision class Model(ModelBase): @@ -235,16 +236,16 @@ class Model(ModelBase): softmax = layers.softmax_with_cross_entropy( logits=logits, label=inputs[6]) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] - self.acc = layers.accuracy(input=logits, label=inputs[6], k=20) - + acc = Precision(input=logits, label=inputs[6], k=20) self._cost = self.loss + if is_infer: - self._infer_results['acc'] = self.acc - self._infer_results['loss'] = self.loss + self._infer_results['P@20'] = acc + self._infer_results['LOSS'] = self.loss return self._metrics["LOSS"] = self.loss - self._metrics["train_acc"] = self.acc + self._metrics["Train_P@20"] = acc def optimizer(self): step_per_epoch = self.corpus_size // self.train_batch_size -- GitLab