diff --git a/models/multitask/esmm/config.yaml b/models/multitask/esmm/config.yaml index 18b47f893089badf28841814d5ef367121b1a46e..f40b967c1c02175debd44bfdc15a6d48c4208de6 100644 --- a/models/multitask/esmm/config.yaml +++ b/models/multitask/esmm/config.yaml @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +evaluate: + reader: + batch_size: 1 + class: "{workspace}/esmm_infer_reader.py" + test_data_path: "{workspace}/data/train" + train: trainer: # for cluster training diff --git a/models/multitask/esmm/esmm_infer_reader.py b/models/multitask/esmm/esmm_infer_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..6e94a1eed07bab82fa80ece0041f9b1e94bb531d --- /dev/null +++ b/models/multitask/esmm/esmm_infer_reader.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class EvaluateReader(Reader):
    """Reader that turns one comma-separated text line into named slots.

    Each input line is split on ``,``. Column 1 is emitted as ``ctr`` and
    column 2 as ``cvr`` (both parsed with ``int``); columns 0 and 3 are
    ignored. Every column from index 4 onwards is a ``field_id:feat_id``
    token. Tokens whose ``field_id`` is not one of the known fields are
    dropped.
    """

    def init(self):
        """Build the ordered field list and the field-id lookup table."""
        all_field_id = [
            '101', '109_14', '110_14', '127_14', '150_14', '121', '122',
            '124', '125', '126', '127', '128', '129', '205', '206', '207',
            '210', '216', '508', '509', '702', '853', '301',
        ]
        # Explicit, ordered sequence of field ids. The output slots are built
        # from this tuple (not from dict iteration) so that slot position and
        # slot name always agree, even on interpreters whose dicts do not
        # preserve insertion order.
        self._field_ids = tuple(all_field_id)
        # field_id -> [visited, position]. The two-element layout is kept for
        # compatibility with any code that inspects this attribute; the
        # `visited` flag is no longer toggled while reading samples.
        # A plain dict is used on purpose: looking up an unknown field must
        # not silently insert a new entry.
        self.all_field_id_dict = {
            field_id: [False, position]
            for position, field_id in enumerate(all_field_id)
        }

    def generate_sample(self, line):
        """Return a generator function that yields the parsed sample.

        Args:
            line: one raw text line in the comma-separated format described
                in the class docstring.

        Returns:
            A zero-argument callable. Iterating its result yields exactly one
            list of ``(name, values)`` tuples: one tuple per known field, in
            the order declared in ``init``, followed by ``('ctr', [ctr])`` and
            ``('cvr', [cvr])``. A field with no feature on this line gets the
            single padding value ``0``.

        Raises:
            IndexError: (on iteration) if the line has fewer than 3 columns.
            ValueError: (on iteration) if a label or feature id is not an
                integer, or a non-empty token is not ``field_id:feat_id``.
        """

        def reader():
            features = line.strip().split(',')
            ctr = int(features[1])
            cvr = int(features[2])

            padding = 0
            output = [(field_id, []) for field_id in self._field_ids]

            for elem in features[4:]:
                elem = elem.strip()
                if not elem:
                    # Tolerate empty tokens (e.g. a trailing comma) instead
                    # of failing on the tuple unpack below.
                    continue
                field_id, feat_id = elem.split(':')
                entry = self.all_field_id_dict.get(field_id)
                if entry is None:
                    # Unknown field: not part of the model input.
                    continue
                output[entry[1]][1].append(int(feat_id))

            # A slot that received no feature is exactly a slot that is still
            # empty, so no shared "visited" bookkeeping is required.
            for _, feat_ids in output:
                if not feat_ids:
                    feat_ids.append(padding)

            output.append(('ctr', [ctr]))
            output.append(('cvr', [cvr]))
            yield output

        return reader
envs.get_global_env("hyper_parameters.embed_size", None, self._namespace) @@ -89,14 +89,21 @@ class Model(ModelBase): ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one) ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1) + + auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk) + auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy) + + if is_infer: + self._infer_results["AUC_ctr"] = auc_ctr + self._infer_results["AUC_ctcvr"] = auc_ctcvr + return + loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) cost = loss_ctr + loss_ctcvr avg_cost = fluid.layers.mean(cost) - auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk) - auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy) self._cost = avg_cost self._metrics["AUC_ctr"] = auc_ctr @@ -111,4 +118,7 @@ class Model(ModelBase): def infer_net(self): - pass + self._infer_data_var = self.input_data() + self._infer_data_loader = fluid.io.DataLoader.from_generator( + feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False) + self.net(self._infer_data_var, is_infer=True)