From 321454931597cc8a18fa645f2028a2911edc4dff Mon Sep 17 00:00:00 2001
From: zhangwenhui03 <zhangwenhui03@baidu.com>
Date: Fri, 15 May 2020 15:59:20 +0800
Subject: [PATCH] add esmm infer

---
 models/multitask/esmm/config.yaml          |  6 +++
 models/multitask/esmm/esmm_infer_reader.py | 63 ++++++++++++++++++++++
 models/multitask/esmm/model.py             | 18 +++++--
 3 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 models/multitask/esmm/esmm_infer_reader.py

diff --git a/models/multitask/esmm/config.yaml b/models/multitask/esmm/config.yaml
index 18b47f89..f40b967c 100644
--- a/models/multitask/esmm/config.yaml
+++ b/models/multitask/esmm/config.yaml
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+evaluate:
+  reader:
+    batch_size: 1
+    class: "{workspace}/esmm_infer_reader.py"
+    test_data_path: "{workspace}/data/train"
+
 train:
   trainer:
     # for cluster training
diff --git a/models/multitask/esmm/esmm_infer_reader.py b/models/multitask/esmm/esmm_infer_reader.py
new file mode 100644
index 00000000..6e94a1ee
--- /dev/null
+++ b/models/multitask/esmm/esmm_infer_reader.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+from collections import defaultdict
+import numpy as np
+
+
+class EvaluateReader(Reader):
+    """Turn comma-separated ESMM sample lines into per-field feature-id slots."""
+
+    def init(self):
+        # A field's position in this list is its slot position in every sample.
+        self.all_field_ids = ['101', '109_14', '110_14', '127_14', '150_14', '121', '122', '124', '125', '126', '127', '128', '129',
+                              '205', '206', '207', '210', '216', '508', '509', '702', '853', '301']
+        # field_id -> slot index; a plain dict so unknown fields are simply absent.
+        self.all_field_id_dict = {field_id: i for i, field_id in enumerate(self.all_field_ids)}
+
+    def generate_sample(self, line):
+        """
+        Return a generator function that yields the single sample parsed from `line`.
+        """
+
+        def reader():
+            """
+            Yield [(field_id, [feat_id, ...]), ..., ('ctr', [ctr]), ('cvr', [cvr])].
+
+            Columns 1 and 2 of the line hold the ctr and cvr labels; columns 4 onward
+            hold `field_id:feat_id` pairs. Fields outside the known list are ignored.
+            """
+            features = line.strip().split(',')
+            ctr = int(features[1])
+            cvr = int(features[2])
+            padding = 0
+            # Build slots from the ordered list, never from dict iteration order
+            # (arbitrary before Python 3.7), so slot names and indexes always agree.
+            output = [(field_id, []) for field_id in self.all_field_ids]
+            for elem in features[4:]:
+                field_id, feat_id = elem.strip().split(':')
+                index = self.all_field_id_dict.get(field_id)
+                if index is not None:
+                    output[index][1].append(int(feat_id))
+            for _, feat_ids in output:
+                if not feat_ids:
+                    feat_ids.append(padding)  # field absent from this line -> padding id
+            output.append(('ctr', [ctr]))
+            output.append(('cvr', [cvr]))
+            yield output
+
+        return reader
diff --git a/models/multitask/esmm/model.py b/models/multitask/esmm/model.py
index 6654f337..1641f72d 100644
--- a/models/multitask/esmm/model.py
+++ b/models/multitask/esmm/model.py
@@ -53,7 +53,7 @@ class Model(ModelBase):
         
         return inputs
     
-    def net(self, inputs):
+    def net(self, inputs, is_infer=False):
         
         vocab_size = envs.get_global_env("hyper_parameters.vocab_size", None, self._namespace)
         embed_size = envs.get_global_env("hyper_parameters.embed_size", None, self._namespace)
@@ -89,14 +89,21 @@ class Model(ModelBase):
         
         ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one)
         ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1)
+
+        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
+        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
+
+        if is_infer:
+            self._infer_results["AUC_ctr"] = auc_ctr
+            self._infer_results["AUC_ctcvr"] = auc_ctcvr
+            return
+
     
         loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk)
         loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy)
         cost = loss_ctr + loss_ctcvr
         avg_cost = fluid.layers.mean(cost)
 
-        auc_ctr, batch_auc_ctr, auc_states_ctr = fluid.layers.auc(input=ctr_out, label=ctr_clk)
-        auc_ctcvr, batch_auc_ctcvr, auc_states_ctcvr = fluid.layers.auc(input=ctcvr_prop, label=ctcvr_buy)
     
         self._cost = avg_cost
         self._metrics["AUC_ctr"] = auc_ctr
@@ -111,4 +118,7 @@ class Model(ModelBase):
 
 
     def infer_net(self):
-        pass
+        self._infer_data_var = self.input_data()
+        self._infer_data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=self._infer_data_var, capacity=64, use_double_buffer=False, iterable=False)
+        self.net(self._infer_data_var, is_infer=True)  # is_infer=True: net() stores AUC_ctr/AUC_ctcvr in _infer_results and returns before the loss is built
-- 
GitLab