diff --git a/fleet_rec/core/trainers/single_trainer.py b/fleet_rec/core/trainers/single_trainer.py index 772afbf3a7264844bbc3a2e3660dbab014edafd1..4db2947172a32f0443d0d74975eda44e74accdd3 100755 --- a/fleet_rec/core/trainers/single_trainer.py +++ b/fleet_rec/core/trainers/single_trainer.py @@ -93,7 +93,7 @@ class SingleTrainer(TranspileTrainer): metrics = [epoch, batch_id] metrics.extend(metrics_rets) - if batch_id % 10 == 0 and batch_id != 0: + if batch_id % self.fetch_period == 0 and batch_id != 0: print(metrics_format.format(*metrics)) batch_id += 1 except fluid.core.EOFException: diff --git a/models/recall/multiview-simnet/__init__.py b/models/recall/multiview-simnet/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/recall/multiview-simnet/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/recall/multiview-simnet/config.yaml b/models/recall/multiview-simnet/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0fcc1fc2f18ed2588d4f9eb235a1dd619e99f337 --- /dev/null +++ b/models/recall/multiview-simnet/config.yaml @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
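+
+# Two top-level sections follow: "evaluate" configures the test-time reader,
+# while "train" configures the trainer strategy, reader, model
+# hyper-parameters and checkpoint/inference saving.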
+evaluate: + workspace: "fleetrec.models.recall.multiview-simnet" + reader: + batch_size: 2 + class: "{workspace}/evaluate_reader.py" + test_data_path: "{workspace}/data/test" + +train: + trainer: + # for cluster training + strategy: "async" + + epochs: 2 + workspace: "fleetrec.models.recall.multiview-simnet" + + reader: + batch_size: 2 + class: "{workspace}/reader.py" + train_data_path: "{workspace}/data/train" + dataset_class: "DataLoader" + + model: + models: "{workspace}/model.py" + hyper_parameters: + use_DataLoader: True + query_encoder: "bow" + title_encoder: "bow" + query_encode_dim: 128 + title_encode_dim: 128 + query_slots: 1 + title_slots: 1 + sparse_feature_dim: 1000001 + embedding_dim: 128 + hidden_size: 128 + learning_rate: 0.0001 + optimizer: adam + + save: + increment: + dirname: "increment" + epoch_interval: 1 + save_last: True + inference: + dirname: "inference" + epoch_interval: 1 + save_last: True diff --git a/models/recall/multiview-simnet/data/test/test.txt b/models/recall/multiview-simnet/data/test/test.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfb0675ff714459bc47d1e54bb1c707770238296 --- /dev/null +++ b/models/recall/multiview-simnet/data/test/test.txt @@ -0,0 +1,10 @@ +224289:0 126379:0 284519:0 549329:0 750666:0 393772:0 586898:0 736887:0 48785:0 906517:0 229162:1 483485:1 739835:1 29957:1 694497:1 997508:1 556876:1 717791:1 232176:1 430356:1 +366182:0 82062:0 708883:0 949128:0 798964:0 639103:0 409033:0 79301:0 405607:0 342616:0 61552:1 560547:1 3760:1 754734:1 98496:1 472427:1 979596:1 750283:1 492028:1 801383:1 +969571:0 405187:0 756217:0 563640:0 572168:0 881952:0 446260:0 692177:0 994140:0 485393:0 509081:1 297377:1 465399:1 934708:1 430949:1 135651:1 484531:1 385306:1 463957:1 996004:1 +436320:0 423131:0 963969:0 78345:0 879550:0 458203:0 684397:0 956202:0 989802:0 526101:0 852446:1 182545:1 625656:1 674856:1 422648:1 74100:1 48372:1 850830:1 336087:1 178251:1 +242683:0 118677:0 20731:0 970617:0 355890:0 739613:0 926695:0 963639:0 201043:0 611907:0 115309:1 310984:1 615584:1 638886:1 575934:1 889389:1 974807:1 570987:1 532482:1 911925:1 +954007:0 122623:0 168195:0 348901:0 217880:0 84759:0 925763:0 436382:0 573742:0 942921:0 553377:1 835046:1 137907:1 933870:1 766585:1 48483:1 543079:1 889467:1 521705:1 906676:1 +798690:0 617323:0 553266:0 232924:0 159461:0 404822:0 52992:0 364854:0 913876:0 547974:0 559472:1 748595:1 71793:1 357331:1 606888:1 477051:1 291481:1 89363:1 503881:1 423029:1 +228207:0 785250:0 661149:0 803304:0 478781:0 495202:0 804509:0 273065:0 26123:0 810840:0 801871:1 146772:1 421009:1 752344:1 946358:1 531668:1 5771:1 191294:1 627329:1 434664:1 +984628:0 762075:0 505288:0 48519:0 72492:0 26568:0 684085:0 613095:0 781547:0 895829:0 280541:1 903234:1 708065:1 386658:1 331060:1 3693:1 279760:1 459579:1 423552:1 962594:1 +674172:0 39271:0 646093:0 757969:0 553251:0 734960:0 967186:0 856940:0 617246:0 376452:0 113050:1 472707:1 975057:1 865095:1 155824:1 389921:1 205520:1 513667:1 163588:1 953463:1 diff --git a/models/recall/multiview-simnet/data/train/train.txt b/models/recall/multiview-simnet/data/train/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b26c6c6c2eab44b9f9b29b9fb475c5d018ca92a --- /dev/null +++ b/models/recall/multiview-simnet/data/train/train.txt @@ -0,0 +1,10 @@ +7688:0 589671:0 339543:0 681723:0 339204:0 743067:0 897959:0 897541:0 571340:0 858141:0 68161:1 533957:1 288065:1 755516:1 179906:1 324817:1 116293:1 942079:1 455277:1 787142:1 251765:2 846187:2 586960:2 
781883:2 430436:2 240100:2 686201:2 632045:2 585097:2 61976:2 +187966:0 194147:0 640819:0 283848:0 514875:0 310781:0 760083:0 281096:0 837090:0 928087:0 958908:1 451359:1 456136:1 577231:1 373371:1 651928:1 877106:1 721988:1 342265:1 114942:1 668915:2 502190:2 139044:2 213045:2 36710:2 119509:2 450285:2 165440:2 199495:2 798870:2 +477955:0 598041:0 452166:0 924550:0 152308:0 316225:0 285239:0 7967:0 177143:0 132244:0 391070:1 169561:1 256279:1 563588:1 749753:1 237035:1 550804:1 736257:1 71551:1 61944:1 102132:2 484023:2 82995:2 732704:2 114816:2 413165:2 197504:2 686192:2 253734:2 248157:2 +325819:0 140241:0 365103:0 334185:0 357327:0 613836:0 928004:0 595589:0 506569:0 539067:0 638196:1 729129:1 730912:1 701797:1 571150:1 140054:1 680316:1 889784:1 302584:1 676284:1 671069:2 212989:2 318469:2 732930:2 924564:2 147041:2 572412:2 662673:2 418312:2 382855:2 +839803:0 888881:0 957998:0 906486:0 44377:0 247842:0 994783:0 813449:0 168271:0 493685:0 269703:1 156692:1 686681:1 273684:1 312387:1 462033:1 669631:1 635437:1 74337:1 217677:1 582194:2 992666:2 860610:2 660766:2 24524:2 169856:2 882211:2 291866:2 44494:2 984736:2 +327559:0 627497:0 876526:0 243959:0 532929:0 639919:0 443220:0 952110:0 844723:0 372053:0 196819:1 326005:1 62242:1 774928:1 382727:1 348680:1 946697:1 625998:1 276517:1 251595:1 342204:2 825871:2 407136:2 724114:2 611341:2 517978:2 248341:2 111254:2 836867:2 677297:2 +72451:0 749548:0 283413:0 419402:0 67446:0 341795:0 918120:0 892028:0 113151:0 832663:0 758121:1 500602:1 734935:1 577972:1 205421:1 726739:1 276563:1 611928:1 185486:1 603502:1 633117:2 929300:2 332435:2 216848:2 412769:2 708304:2 800045:2 315869:2 444476:2 332565:2 +675647:0 212558:0 654982:0 321053:0 111172:0 635432:0 298523:0 612182:0 203835:0 288250:0 990034:1 891786:1 188524:1 480757:1 436783:1 874434:1 530090:1 492441:1 32835:1 886415:1 688876:2 626030:2 612348:2 208265:2 355885:2 603938:2 349931:2 86683:2 361956:2 705130:2 +164500:0 332294:0 373155:0 320413:0 801561:0 152827:0 28282:0 435913:0 376758:0 367848:0 285596:1 282674:1 357323:1 257195:1 948061:1 996976:1 300918:1 734644:1 870559:1 924205:1 45095:2 61352:2 242258:2 153354:2 763576:2 133542:2 431079:2 193327:2 655823:2 770159:2 +821764:0 184731:0 888413:0 793536:0 30049:0 533675:0 791254:0 92255:0 74185:0 557758:0 795898:1 15689:1 983592:1 248891:1 64421:1 387642:1 315522:1 526054:1 404172:1 704838:1 537016:2 383828:2 438418:2 885895:2 894698:2 228867:2 343213:2 411377:2 149957:2 810795:2 diff --git a/models/recall/multiview-simnet/data_process.sh b/models/recall/multiview-simnet/data_process.sh new file mode 100644 index 0000000000000000000000000000000000000000..15c6c908477cd3ba6a72a65bad039bb10295bd9c --- /dev/null +++ b/models/recall/multiview-simnet/data_process.sh @@ -0,0 +1,10 @@ +#! /bin/bash + +set -e +echo "begin to prepare data" + +mkdir -p data/train +mkdir -p data/test + +python generate_synthetic_data.py + diff --git a/models/recall/multiview-simnet/evaluate_reader.py b/models/recall/multiview-simnet/evaluate_reader.py new file mode 100755 index 0000000000000000000000000000000000000000..63340ccd003589d6e4411f08ed8ffa554ee170fa --- /dev/null +++ b/models/recall/multiview-simnet/evaluate_reader.py @@ -0,0 +1,57 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import io
+import copy
+import random
+from fleetrec.core.reader import Reader
+from fleetrec.core.utils import envs
+
+
+class EvaluateReader(Reader):
+    def init(self):
+        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
+        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
+
+        self.all_slots = []
+        for i in range(self.query_slots):
+            self.all_slots.append(str(i))
+
+        for i in range(self.title_slots):
+            self.all_slots.append(str(i + self.query_slots))
+
+        self._all_slots_dict = dict()
+        for index, slot in enumerate(self.all_slots):
+            self._all_slots_dict[slot] = [False, index]
+
+    def generate_sample(self, line):
+        def data_iter():
+            elements = line.rstrip().split()
+            padding = 0
+            output = [(slot, []) for slot in self.all_slots]
+            for elem in elements:
+                feasign, slot = elem.split(':')
+                if slot not in self._all_slots_dict:
+                    continue
+                self._all_slots_dict[slot][0] = True
+                index = self._all_slots_dict[slot][1]
+                output[index][1].append(int(feasign))
+            for slot in self._all_slots_dict:
+                visit, index = self._all_slots_dict[slot]
+                if visit:
+                    self._all_slots_dict[slot][0] = False
+                else:
+                    output[index][1].append(padding)
+            yield output
+        return data_iter
diff --git a/models/recall/multiview-simnet/generate_synthetic_data.py b/models/recall/multiview-simnet/generate_synthetic_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ebb3a355f3904dc0a5cccff8e9d0b48b89f18f4
--- /dev/null
+++ b/models/recall/multiview-simnet/generate_synthetic_data.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
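+
+# Each generated sample is a whitespace-separated list of "feasign:slot" pairs:
+# slots [0, query_slot_num) carry the query, the next title_slot_num slots
+# carry the positive title and, for training data only, the following
+# title_slot_num slots carry the negative title.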
+ +import random + +class Dataset: + def __init__(self): + pass + +class SyntheticDataset(Dataset): + def __init__(self, sparse_feature_dim, query_slot_num, title_slot_num, dataset_size=10000): + # ids are randomly generated + self.ids_per_slot = 10 + self.sparse_feature_dim = sparse_feature_dim + self.query_slot_num = query_slot_num + self.title_slot_num = title_slot_num + self.dataset_size = dataset_size + + def _reader_creator(self, is_train): + def generate_ids(num, space): + return [random.randint(0, space - 1) for i in range(num)] + + def reader(): + for i in range(self.dataset_size): + query_slots = [] + pos_title_slots = [] + neg_title_slots = [] + for i in range(self.query_slot_num): + qslot = generate_ids(self.ids_per_slot, + self.sparse_feature_dim) + qslot = [str(fea) + ':' + str(i) for fea in qslot] + query_slots += qslot + for i in range(self.title_slot_num): + pt_slot = generate_ids(self.ids_per_slot, + self.sparse_feature_dim) + pt_slot = [str(fea) + ':' + str(i + self.query_slot_num) for fea in pt_slot] + pos_title_slots += pt_slot + if is_train: + for i in range(self.title_slot_num): + nt_slot = generate_ids(self.ids_per_slot, + self.sparse_feature_dim) + nt_slot = [str(fea) + ':' + str(i + self.query_slot_num + self.title_slot_num) for fea in nt_slot] + neg_title_slots += nt_slot + yield query_slots + pos_title_slots + neg_title_slots + else: + yield query_slots + pos_title_slots + + return reader + + def train(self): + return self._reader_creator(True) + + def valid(self): + return self._reader_creator(True) + + def test(self): + return self._reader_creator(False) + +if __name__ == '__main__': + sparse_feature_dim = 1000001 + query_slots = 1 + title_slots = 1 + dataset_size = 10 + dataset = SyntheticDataset(sparse_feature_dim, query_slots, title_slots, dataset_size) + train_reader = dataset.train() + test_reader = dataset.test() + + with open("data/train/train.txt", 'w') as fout: + for data in train_reader(): + fout.write(' '.join(data)) + fout.write("\n") + + with open("data/test/test.txt", 'w') as fout: + for data in test_reader(): + fout.write(' '.join(data)) + fout.write("\n") diff --git a/models/recall/multiview-simnet/model.py b/models/recall/multiview-simnet/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c33c10033c55a2d95ce14ac7755eceba8a3a7dd1 --- /dev/null +++ b/models/recall/multiview-simnet/model.py @@ -0,0 +1,301 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
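+
+# Multi-view Simnet recall model: each query/title slot is embedded, encoded
+# (bow / cnn / gru), the per-slot encodings are concatenated and projected
+# through a fully connected layer, and training minimizes a pairwise hinge
+# loss on the cosine similarities of (query, pos_title) and (query, neg_title).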
+ +import numpy as np +import math +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.layers.tensor as tensor +import paddle.fluid.layers.control_flow as cf + +from fleetrec.core.utils import envs +from fleetrec.core.model import Model as ModelBase + +class BowEncoder(object): + """ bow-encoder """ + + def __init__(self): + self.param_name = "" + + def forward(self, emb): + return fluid.layers.sequence_pool(input=emb, pool_type='sum') + + +class CNNEncoder(object): + """ cnn-encoder""" + + def __init__(self, + param_name="cnn", + win_size=3, + ksize=128, + act='tanh', + pool_type='max'): + self.param_name = param_name + self.win_size = win_size + self.ksize = ksize + self.act = act + self.pool_type = pool_type + + def forward(self, emb): + return fluid.nets.sequence_conv_pool( + input=emb, + num_filters=self.ksize, + filter_size=self.win_size, + act=self.act, + pool_type=self.pool_type, + param_attr=self.param_name + ".param", + bias_attr=self.param_name + ".bias") + + +class GrnnEncoder(object): + """ grnn-encoder """ + + def __init__(self, param_name="grnn", hidden_size=128): + self.param_name = param_name + self.hidden_size = hidden_size + + def forward(self, emb): + fc0 = fluid.layers.fc(input=emb, + size=self.hidden_size * 3, + param_attr=self.param_name + "_fc.w", + bias_attr=False) + + gru_h = fluid.layers.dynamic_gru( + input=fc0, + size=self.hidden_size, + is_reverse=False, + param_attr=self.param_name + ".param", + bias_attr=self.param_name + ".bias") + return fluid.layers.sequence_pool(input=gru_h, pool_type='max') + + +class SimpleEncoderFactory(object): + def __init__(self): + pass + + ''' create an encoder through create function ''' + + def create(self, enc_type, enc_hid_size): + if enc_type == "bow": + bow_encode = BowEncoder() + return bow_encode + elif enc_type == "cnn": + cnn_encode = CNNEncoder(ksize=enc_hid_size) + return cnn_encode + elif enc_type == "gru": + rnn_encode = GrnnEncoder(hidden_size=enc_hid_size) + return rnn_encode + +class Model(ModelBase): + def __init__(self, config): + ModelBase.__init__(self, config) + self.init_config() + + def init_config(self): + self._fetch_interval = 1 + query_encoder = envs.get_global_env("hyper_parameters.query_encoder", None, self._namespace) + title_encoder = envs.get_global_env("hyper_parameters.title_encoder", None, self._namespace) + query_encode_dim = envs.get_global_env("hyper_parameters.query_encode_dim", None, self._namespace) + title_encode_dim = envs.get_global_env("hyper_parameters.title_encode_dim", None, self._namespace) + query_slots = envs.get_global_env("hyper_parameters.query_slots", None, self._namespace) + title_slots = envs.get_global_env("hyper_parameters.title_slots", None, self._namespace) + factory = SimpleEncoderFactory() + self.query_encoders = [ + factory.create(query_encoder, query_encode_dim) + for i in range(query_slots) + ] + self.title_encoders = [ + factory.create(title_encoder, title_encode_dim) + for i in range(title_slots) + ] + + self.emb_size = envs.get_global_env("hyper_parameters.sparse_feature_dim", None, self._namespace) + self.emb_dim = envs.get_global_env("hyper_parameters.embedding_dim", None, self._namespace) + self.emb_shape = [self.emb_size, self.emb_dim] + self.hidden_size = envs.get_global_env("hyper_parameters.hidden_size", None, self._namespace) + self.margin = 0.1 + + def input(self, is_train=True): + self.q_slots = [ + fluid.data( + name="%d" % i, shape=[None, 1], lod_level=1, dtype='int64') + for i in range(len(self.query_encoders)) 
+ ] + self.pt_slots = [ + fluid.data( + name="%d" % (i + len(self.query_encoders)), shape=[None, 1], lod_level=1, dtype='int64') + for i in range(len(self.title_encoders)) + ] + + if is_train == False: + return self.q_slots + self.pt_slots + + self.nt_slots = [ + fluid.data( + name="%d" % (i + len(self.query_encoders) + len(self.title_encoders)), shape=[None, 1], lod_level=1, dtype='int64') + for i in range(len(self.title_encoders)) + ] + + return self.q_slots + self.pt_slots + self.nt_slots + + def train_input(self): + res = self.input() + self._data_var = res + + use_dataloader = envs.get_global_env("hyper_parameters.use_DataLoader", False, self._namespace) + + if self._platform != "LINUX" or use_dataloader: + self._data_loader = fluid.io.DataLoader.from_generator( + feed_list=self._data_var, capacity=256, use_double_buffer=False, iterable=False) + + def get_acc(self, x, y): + less = tensor.cast(cf.less_than(x, y), dtype='float32') + label_ones = fluid.layers.fill_constant_batch_size_like( + input=x, dtype='float32', shape=[-1, 1], value=1.0) + correct = fluid.layers.reduce_sum(less) + total = fluid.layers.reduce_sum(label_ones) + acc = fluid.layers.elementwise_div(correct, total) + return acc + + def net(self): + q_embs = [ + fluid.embedding( + input=query, size=self.emb_shape, param_attr="emb") + for query in self.q_slots + ] + pt_embs = [ + fluid.embedding( + input=title, size=self.emb_shape, param_attr="emb") + for title in self.pt_slots + ] + nt_embs = [ + fluid.embedding( + input=title, size=self.emb_shape, param_attr="emb") + for title in self.nt_slots + ] + + # encode each embedding field with encoder + q_encodes = [ + self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) + ] + pt_encodes = [ + self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) + ] + nt_encodes = [ + self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs) + ] + + # concat multi view for query, pos_title, neg_title + q_concat = fluid.layers.concat(q_encodes) + pt_concat = fluid.layers.concat(pt_encodes) + nt_concat = fluid.layers.concat(nt_encodes) + + # projection of hidden layer + q_hid = fluid.layers.fc(q_concat, + size=self.hidden_size, + param_attr='q_fc.w', + bias_attr='q_fc.b') + pt_hid = fluid.layers.fc(pt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') + nt_hid = fluid.layers.fc(nt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') + + # cosine of hidden layers + cos_pos = fluid.layers.cos_sim(q_hid, pt_hid) + cos_neg = fluid.layers.cos_sim(q_hid, nt_hid) + + # pairwise hinge_loss + loss_part1 = fluid.layers.elementwise_sub( + tensor.fill_constant_batch_size_like( + input=cos_pos, + shape=[-1, 1], + value=self.margin, + dtype='float32'), + cos_pos) + + loss_part2 = fluid.layers.elementwise_add(loss_part1, cos_neg) + + loss_part3 = fluid.layers.elementwise_max( + tensor.fill_constant_batch_size_like( + input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_part2) + + self.avg_cost = fluid.layers.mean(loss_part3) + self.acc = self.get_acc(cos_neg, cos_pos) + + def avg_loss(self): + self._cost = self.avg_cost + + def metrics(self): + self._metrics["loss"] = self.avg_cost + self._metrics["acc"] = self.acc + + def train_net(self): + self.train_input() + self.net() + self.avg_loss() + self.metrics() + + def optimizer(self): + learning_rate = envs.get_global_env("hyper_parameters.learning_rate", None, self._namespace) + optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) + return 
optimizer + + def infer_input(self): + res = self.input(is_train=False) + self._infer_data_var = res + + self._infer_data_loader = fluid.io.DataLoader.from_generator( + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + + def infer_net(self): + self.infer_input() + # lookup embedding for each slot + q_embs = [ + fluid.embedding( + input=query, size=self.emb_shape, param_attr="emb") + for query in self.q_slots + ] + pt_embs = [ + fluid.embedding( + input=title, size=self.emb_shape, param_attr="emb") + for title in self.pt_slots + ] + # encode each embedding field with encoder + q_encodes = [ + self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs) + ] + pt_encodes = [ + self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs) + ] + # concat multi view for query, pos_title, neg_title + q_concat = fluid.layers.concat(q_encodes) + pt_concat = fluid.layers.concat(pt_encodes) + # projection of hidden layer + q_hid = fluid.layers.fc(q_concat, + size=self.hidden_size, + param_attr='q_fc.w', + bias_attr='q_fc.b') + pt_hid = fluid.layers.fc(pt_concat, + size=self.hidden_size, + param_attr='t_fc.w', + bias_attr='t_fc.b') + + # cosine of hidden layers + cos = fluid.layers.cos_sim(q_hid, pt_hid) + self._infer_results['query_pt_sim'] = cos diff --git a/models/recall/multiview-simnet/reader.py b/models/recall/multiview-simnet/reader.py new file mode 100755 index 0000000000000000000000000000000000000000..34cabd415617bbbc5d4cfc942c5a48406e228d3d --- /dev/null +++ b/models/recall/multiview-simnet/reader.py @@ -0,0 +1,60 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
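+
+# TrainReader mirrors EvaluateReader but also registers the negative-title
+# slots; each line is parsed into per-slot feasign lists, and slots missing
+# from a line are padded with a single 0.
+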
+import numpy as np
+import io
+import copy
+import random
+from fleetrec.core.reader import Reader
+from fleetrec.core.utils import envs
+
+
+class TrainReader(Reader):
+    def init(self):
+        self.query_slots = envs.get_global_env("hyper_parameters.query_slots", None, "train.model")
+        self.title_slots = envs.get_global_env("hyper_parameters.title_slots", None, "train.model")
+
+        self.all_slots = []
+        for i in range(self.query_slots):
+            self.all_slots.append(str(i))
+
+        for i in range(self.title_slots):
+            self.all_slots.append(str(i + self.query_slots))
+
+        for i in range(self.title_slots):
+            self.all_slots.append(str(i + self.query_slots + self.title_slots))
+
+        self._all_slots_dict = dict()
+        for index, slot in enumerate(self.all_slots):
+            self._all_slots_dict[slot] = [False, index]
+
+    def generate_sample(self, line):
+        def data_iter():
+            elements = line.rstrip().split()
+            padding = 0
+            output = [(slot, []) for slot in self.all_slots]
+            for elem in elements:
+                feasign, slot = elem.split(':')
+                if slot not in self._all_slots_dict:
+                    continue
+                self._all_slots_dict[slot][0] = True
+                index = self._all_slots_dict[slot][1]
+                output[index][1].append(int(feasign))
+            for slot in self._all_slots_dict:
+                visit, index = self._all_slots_dict[slot]
+                if visit:
+                    self._all_slots_dict[slot][0] = False
+                else:
+                    output[index][1].append(padding)
+            yield output
+        return data_iter
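
A minimal NumPy sketch of the objective assembled above in model.py (net() together with get_acc()), assuming the cosine similarities of (query, pos_title) and (query, neg_title) have already been computed; the helper name and sample values are illustrative only and are not part of the patch:

    import numpy as np

    def hinge_loss_and_acc(cos_pos, cos_neg, margin=0.1):
        # loss_i = max(0, margin - cos_pos_i + cos_neg_i), averaged over the batch
        cos_pos = np.asarray(cos_pos, dtype=np.float32)
        cos_neg = np.asarray(cos_neg, dtype=np.float32)
        loss = np.maximum(0.0, margin - cos_pos + cos_neg).mean()
        # acc = fraction of pairs where the positive title outscores the negative
        acc = (cos_neg < cos_pos).mean()
        return loss, acc

    # hinge_loss_and_acc([0.9, 0.2], [0.1, 0.3]) -> (~0.1, 0.5)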