model.py 19.7 KB
Newer Older
C
chengmo 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# -*- coding=utf-8 -*-
"""
#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
T
tangwei 已提交
17

C
chengmo 已提交
18 19
import paddle.fluid as fluid

20
from paddlerec.core.utils import envs
C
Chengmo 已提交
21
from paddlerec.core.model import ModelBase
C
chengmo 已提交
22 23 24 25 26


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)
C
Chengmo 已提交
27 28

    def _init_hyper_parameters(self):
C
chengmo 已提交
29
        # tree meta hyper parameters
C
Chengmo 已提交
30 31
        self.max_layers = envs.get_global_env("hyper_parameters.max_layers", 4)
        self.node_nums = envs.get_global_env("hyper_parameters.node_nums", 26)
C
chengmo 已提交
32
        self.leaf_node_nums = envs.get_global_env(
C
Chengmo 已提交
33
            "hyper_parameters.leaf_node_nums", 13)
C
chengmo 已提交
34
        self.output_positive = envs.get_global_env(
C
Chengmo 已提交
35
            "hyper_parameters.output_positive", True)
C
chengmo 已提交
36
        self.layer_node_num_list = envs.get_global_env(
C
Chengmo 已提交
37 38 39 40
            "hyper_parameters.layer_node_num_list", [2, 4, 7, 12])
        self.child_nums = envs.get_global_env("hyper_parameters.child_nums", 2)
        self.tree_layer_path = envs.get_global_env(
            "hyper_parameters.tree.tree_layer_path", None)
C
chengmo 已提交
41 42 43

        # model training hyper parameter
        self.node_emb_size = envs.get_global_env(
C
Chengmo 已提交
44
            "hyper_parameters.node_emb_size", 64)
C
chengmo 已提交
45
        self.input_emb_size = envs.get_global_env(
C
Chengmo 已提交
46 47
            "hyper_parameters.input_emb_size", 768)
        self.act = envs.get_global_env("hyper_parameters.act", "tanh")
C
chengmo 已提交
48
        self.neg_sampling_list = envs.get_global_env(
C
Chengmo 已提交
49
            "hyper_parameters.neg_sampling_list", [1, 2, 3, 4])
C
chengmo 已提交
50 51

        # model infer hyper parameter
C
Chengmo 已提交
52 53 54 55 56 57 58 59 60 61 62 63 64 65
        self.topK = envs.get_global_env(
            "hyper_parameters.topK",
            1, )
        self.batch_size = envs.get_global_env(
            "dataset.dataset_infer.batch_size", 1)

    def net(self, input, is_infer=False):
        if not is_infer:
            return self.train_net(input)
        else:
            return self.infer_net(input)

    def train_net(self, input):
        self.tdm_net(input)
C
chengmo 已提交
66
        self.create_info()
C
chengmo 已提交
67 68 69
        self.avg_loss()
        self.metrics()

C
Chengmo 已提交
70
    def infer_net(self, input):
C
chengmo 已提交
71
        self.create_first_layer()
C
Chengmo 已提交
72 73 74 75 76 77 78
        self.tdm_infer_net(input)

    def input_data(self, is_infer=False, **kwargs):
        if not is_infer:
            return self.train_input()
        else:
            return self.infer_input()
C
chengmo 已提交
79 80 81 82 83 84 85

    """ -------- Train network detail ------- """

    def train_input(self):
        input_emb = fluid.data(
            name="input_emb",
            shape=[None, self.input_emb_size],
T
tangwei 已提交
86
            dtype="float32", )
C
chengmo 已提交
87 88 89 90

        item_label = fluid.data(
            name="item_label",
            shape=[None, 1],
T
tangwei 已提交
91
            dtype="int64", )
C
chengmo 已提交
92

C
Chengmo 已提交
93
        return [input_emb, item_label]
C
chengmo 已提交
94

C
Chengmo 已提交
95
    def tdm_net(self, input):
C
chengmo 已提交
96 97 98 99 100
        """
        tdm训练网络的主要流程部分
        """
        is_distributed = True if envs.get_trainer() == "CtrTrainer" else False

C
Chengmo 已提交
101 102
        input_emb = input[0]
        item_label = input[1]
C
chengmo 已提交
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118

        # 根据输入的item的正样本在给定的树上进行负采样
        # sample_nodes 是采样的node_id的结果,包含正负样本
        # sample_label 是采样的node_id对应的正负标签
        # sample_mask 是为了保持tensor维度一致,padding部分的标签,若为0,则是padding的虚拟node_id
        sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler(
            x=item_label,
            neg_samples_num_list=self.neg_sampling_list,
            layer_node_num_list=self.layer_node_num_list,
            leaf_node_num=self.leaf_node_nums,
            tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"),
            tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"),
            output_positive=self.output_positive,
            output_list=True,
            seed=0,
            tree_dtype='int64',
T
tangwei 已提交
119
            dtype='int64')
C
chengmo 已提交
120 121 122 123 124 125 126

        # 查表得到每个节点的Embedding
        sample_nodes_emb = [
            fluid.embedding(
                input=sample_nodes[i],
                is_sparse=True,
                size=[self.node_nums, self.node_emb_size],
T
tangwei 已提交
127 128
                param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
            for i in range(self.max_layers)
C
chengmo 已提交
129 130 131 132
        ]

        # 此处进行Reshape是为了之后层次化的分类器训练
        sample_nodes_emb = [
T
tangwei 已提交
133 134 135 136
            fluid.layers.reshape(sample_nodes_emb[i], [
                -1, self.neg_sampling_list[i] + self.output_positive,
                self.node_emb_size
            ]) for i in range(self.max_layers)
C
chengmo 已提交
137 138 139 140 141 142
        ]

        # 对输入的input_emb进行转换,使其维度与node_emb维度一致
        input_trans_emb = self.input_trans_layer(input_emb)

        # 分类器的主体网络,分别训练不同层次的分类器
T
tangwei 已提交
143 144
        layer_classifier_res = self.classifier_layer(input_trans_emb,
                                                     sample_nodes_emb)
C
chengmo 已提交
145 146 147

        # 最后的概率判别FC,将所有层次的node分类结果放到一起以相同的标准进行判别
        # 考虑到树极大可能不平衡,有些item不在最后一层,所以需要这样的机制保证每个item都有机会被召回
T
tangwei 已提交
148 149 150 151 152 153 154
        tdm_fc = fluid.layers.fc(
            input=layer_classifier_res,
            size=2,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
            bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
C
chengmo 已提交
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175

        # 将loss打平,放到一起计算整体网络的loss
        tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])

        # 若想对各个层次的loss辅以不同的权重,则在此处无需concat
        # 支持各个层次分别计算loss,再乘相应的权重
        sample_label = fluid.layers.concat(sample_label, axis=1)
        labels_reshape = fluid.layers.reshape(sample_label, [-1, 1])
        labels_reshape.stop_gradient = True

        # 计算整体的loss并得到softmax的输出
        cost, softmax_prob = fluid.layers.softmax_with_cross_entropy(
            logits=tdm_fc_re, label=labels_reshape, return_softmax=True)

        # 通过mask过滤掉虚拟节点的loss
        sample_mask = fluid.layers.concat(sample_mask, axis=1)
        mask_reshape = fluid.layers.reshape(sample_mask, [-1, 1])
        mask_index = fluid.layers.where(mask_reshape != 0)
        mask_index.stop_gradient = True

        self.mask_cost = fluid.layers.gather_nd(cost, mask_index)
C
chengmo 已提交
176 177

        softmax_prob = fluid.layers.unsqueeze(input=softmax_prob, axes=[1])
C
chengmo 已提交
178 179 180 181 182
        self.mask_prob = fluid.layers.gather_nd(softmax_prob, mask_index)
        self.mask_label = fluid.layers.gather_nd(labels_reshape, mask_index)

        self._predict = self.mask_prob

C
chengmo 已提交
183 184 185 186 187 188 189 190 191 192 193 194 195
    def create_info(self):
        fluid.default_startup_program().global_block().create_var(
            name="TDM_Tree_Info",
            dtype=fluid.core.VarDesc.VarType.INT32,
            shape=[self.node_nums, 3 + self.child_nums],
            persistable=True,
            initializer=fluid.initializer.ConstantInitializer(0))
        fluid.default_main_program().global_block().create_var(
            name="TDM_Tree_Info",
            dtype=fluid.core.VarDesc.VarType.INT32,
            shape=[self.node_nums, 3 + self.child_nums],
            persistable=True)

C
chengmo 已提交
196 197 198 199 200 201 202
    def avg_loss(self):
        avg_cost = fluid.layers.reduce_mean(self.mask_cost)
        self._cost = avg_cost

    def metrics(self):
        auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
                                             label=self.mask_label,
T
tangwei 已提交
203
                                             num_thresholds=2**12,
C
chengmo 已提交
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
                                             slide_steps=20)
        self._metrics["AUC"] = auc
        self._metrics["BATCH_AUC"] = batch_auc
        self._metrics["BATCH_LOSS"] = self._cost

    def input_trans_layer(self, input_emb):
        """
        输入侧训练组网
        """
        # 将input映射到与node相同的维度
        input_fc_out = fluid.layers.fc(
            input=input_emb,
            size=self.node_emb_size,
            act=None,
            param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
T
tangwei 已提交
219
            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
C
chengmo 已提交
220 221 222 223 224 225 226 227 228

        # 将input_emb映射到各个不同层次的向量表示空间
        input_layer_fc_out = [
            fluid.layers.fc(
                input=input_fc_out,
                size=self.node_emb_size,
                act=self.act,
                param_attr=fluid.ParamAttr(
                    name="trans.layer_fc.weight." + str(i)),
T
tangwei 已提交
229 230 231
                bias_attr=fluid.ParamAttr(
                    name="trans.layer_fc.bias." + str(i)), )
            for i in range(self.max_layers)
C
chengmo 已提交
232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
        ]

        return input_layer_fc_out

    def _expand_layer(self, input_layer, node, layer_idx):
        # 扩展input的输入,使数量与node一致,
        # 也可以以其他broadcast的操作进行代替
        # 同时兼容了训练组网与预测组网
        input_layer_unsequeeze = fluid.layers.unsqueeze(
            input=input_layer, axes=[1])
        if not isinstance(node, list):
            input_layer_expand = fluid.layers.expand(
                input_layer_unsequeeze, expand_times=[1, node.shape[1], 1])
        else:
            input_layer_expand = fluid.layers.expand(
T
tangwei 已提交
247 248
                input_layer_unsequeeze,
                expand_times=[1, node[layer_idx].shape[1], 1])
C
chengmo 已提交
249 250 251 252 253
        return input_layer_expand

    def classifier_layer(self, input, node):
        # 扩展input,使维度与node匹配
        input_expand = [
T
tangwei 已提交
254 255
            self._expand_layer(input[i], node, i)
            for i in range(self.max_layers)
C
chengmo 已提交
256 257 258 259 260
        ]

        # 将input_emb与node_emb concat到一起过分类器FC
        input_node_concat = [
            fluid.layers.concat(
T
tangwei 已提交
261 262
                input=[input_expand[i], node[i]], axis=2)
            for i in range(self.max_layers)
C
chengmo 已提交
263 264 265 266 267 268 269 270
        ]
        hidden_states_fc = [
            fluid.layers.fc(
                input=input_node_concat[i],
                size=self.node_emb_size,
                num_flatten_dims=2,
                act=self.act,
                param_attr=fluid.ParamAttr(
T
for mat  
tangwei 已提交
271
                    name="cls.concat_fc.weight." + str(i)),
T
tangwei 已提交
272 273
                bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
            for i in range(self.max_layers)
C
chengmo 已提交
274 275 276 277 278 279 280 281 282 283 284 285 286 287
        ]

        # 如果将所有层次的node放到一起计算loss,则需要在此处concat
        # 将分类器结果以batch为准绳concat到一起,而不是layer
        # 维度形如[batch_size, total_node_num, node_emb_size]
        hidden_states_concat = fluid.layers.concat(hidden_states_fc, axis=1)
        return hidden_states_concat

    """ -------- Infer network detail ------- """

    def infer_input(self):
        input_emb = fluid.layers.data(
            name="input_emb",
            shape=[self.input_emb_size],
T
tangwei 已提交
288
            dtype="float32", )
C
chengmo 已提交
289

C
Chengmo 已提交
290
        return [input_emb]
C
chengmo 已提交
291 292 293 294

    def get_layer_list(self):
        """get layer list from layer_list.txt"""
        layer_list = []
C
chengmo 已提交
295
        with open(self.tree_layer_path, 'r') as fin:
C
chengmo 已提交
296 297 298 299 300 301 302
            for line in fin.readlines():
                l = []
                layer = (line.split('\n'))[0].split(',')
                for node in layer:
                    if node:
                        l.append(node)
                layer_list.append(l)
C
chengmo 已提交
303
        self.layer_list = layer_list
C
chengmo 已提交
304 305 306 307 308 309 310 311 312 313 314 315 316

    def create_first_layer(self):
        """decide which layer to start infer"""
        self.get_layer_list()
        first_layer_id = 0
        for idx, layer_node in enumerate(self.layer_node_num_list):
            if layer_node >= self.topK:
                first_layer_id = idx
                break
        first_layer_node = self.layer_list[first_layer_id]
        self.first_layer_idx = first_layer_id
        node_list = []
        mask_list = []
C
chengmo 已提交
317
        for id in first_layer_node:
T
tangwei 已提交
318 319 320 321 322 323
            node_list.append(
                fluid.layers.fill_constant(
                    [self.batch_size, 1], value=int(id), dtype='int64'))
            mask_list.append(
                fluid.layers.fill_constant(
                    [self.batch_size, 1], value=0, dtype='int64'))
C
chengmo 已提交
324 325 326
        self.first_layer_node = fluid.layers.concat(node_list, axis=1)
        self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)

C
Chengmo 已提交
327
    def tdm_infer_net(self, input):
C
chengmo 已提交
328 329 330 331 332 333 334 335
        """
        infer的主要流程
        infer的基本逻辑是:从上层开始(具体层idx由树结构及TopK值决定)
        1、依次通过每一层分类器,得到当前层输入的指定节点的prob
        2、根据prob值大小,取topK的节点,取这些节点的孩子节点作为下一层的输入
        3、循环1、2步骤,遍历完所有层,得到每一层筛选结果的集合
        4、将筛选结果集合中的叶子节点,拿出来再做一次topK,得到最终的召回输出
        """
C
Chengmo 已提交
336
        input_emb = input[0]
C
chengmo 已提交
337 338 339 340 341
        node_score = []
        node_list = []

        current_layer_node = self.first_layer_node
        current_layer_node_mask = self.first_layer_node_mask
C
chengmo 已提交
342
        input_trans_emb = self.input_fc_infer(input_emb)
C
chengmo 已提交
343 344 345 346 347 348 349

        for layer_idx in range(self.first_layer_idx, self.max_layers):
            # 确定当前层的需要计算的节点数
            if layer_idx == self.first_layer_idx:
                current_layer_node_num = self.first_layer_node.shape[1]
            else:
                current_layer_node_num = current_layer_node.shape[1] * \
C
Chengmo 已提交
350
                    current_layer_node.shape[2]
C
chengmo 已提交
351 352 353 354 355 356 357

            current_layer_node = fluid.layers.reshape(
                current_layer_node, [-1, current_layer_node_num])
            current_layer_node_mask = fluid.layers.reshape(
                current_layer_node_mask, [-1, current_layer_node_num])
            node_emb = fluid.embedding(
                input=current_layer_node,
C
chengmo 已提交
358
                size=[self.node_nums, self.node_emb_size],
C
chengmo 已提交
359 360
                param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))

T
tangwei 已提交
361
            input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)
C
chengmo 已提交
362 363

            # 过每一层的分类器
T
tangwei 已提交
364 365
            layer_classifier_res = self.classifier_layer_infer(
                input_fc_out, node_emb, layer_idx)
C
chengmo 已提交
366 367

            # 过最终的判别分类器
T
tangwei 已提交
368 369 370 371 372 373 374
            tdm_fc = fluid.layers.fc(
                input=layer_classifier_res,
                size=2,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
                bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
C
chengmo 已提交
375 376 377 378

            prob = fluid.layers.softmax(tdm_fc)
            positive_prob = fluid.layers.slice(
                prob, axes=[2], starts=[1], ends=[2])
T
tangwei 已提交
379 380
            prob_re = fluid.layers.reshape(positive_prob,
                                           [-1, current_layer_node_num])
C
chengmo 已提交
381 382 383

            # 过滤掉padding产生的无效节点(node_id=0)
            node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
C
Chengmo 已提交
384
            node_zero_mask = fluid.layers.cast(node_zero_mask, 'float32')
C
chengmo 已提交
385 386 387 388 389 390 391 392 393 394
            prob_re = prob_re * node_zero_mask

            # 在当前层的分类结果中取topK,并将对应的score及node_id保存下来
            k = self.topK
            if current_layer_node_num < self.topK:
                k = current_layer_node_num
            _, topk_i = fluid.layers.topk(prob_re, k)

            # index_sample op根据下标索引tensor对应位置的值
            # 若paddle版本>2.0,调用方式为paddle.index_sample
T
tangwei 已提交
395 396
            top_node = fluid.contrib.layers.index_sample(current_layer_node,
                                                         topk_i)
C
chengmo 已提交
397
            prob_re_mask = prob_re * current_layer_node_mask  # 过滤掉非叶子节点
T
tangwei 已提交
398 399
            topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
                                                           topk_i)
C
chengmo 已提交
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
            node_score.append(topk_value)
            node_list.append(top_node)

            # 取当前层topK结果的孩子节点,作为下一层的输入
            if layer_idx < self.max_layers - 1:
                # tdm_child op 根据输入返回其 child 及 child_mask
                # 若child是叶子节点,则child_mask=1,否则为0
                current_layer_node, current_layer_node_mask = \
                    fluid.contrib.layers.tdm_child(x=top_node,
                                                   node_nums=self.node_nums,
                                                   child_nums=self.child_nums,
                                                   param_attr=fluid.ParamAttr(
                                                       name="TDM_Tree_Info"),
                                                   dtype='int64')

        total_node_score = fluid.layers.concat(node_score, axis=1)
        total_node = fluid.layers.concat(node_list, axis=1)

        # 考虑到树可能是不平衡的,计算所有层的叶子节点的topK
        res_score, res_i = fluid.layers.topk(total_node_score, self.topK)
        res_layer_node = fluid.contrib.layers.index_sample(total_node, res_i)
        res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])

        # 利用Tree_info信息,将node_id转换为item_id
T
tangwei 已提交
424 425
        tree_info = fluid.default_main_program().global_block().var(
            "TDM_Tree_Info")
C
chengmo 已提交
426 427 428 429 430
        res_node_emb = fluid.layers.gather_nd(tree_info, res_node)

        res_item = fluid.layers.slice(
            res_node_emb, axes=[2], starts=[0], ends=[1])
        self.res_item_re = fluid.layers.reshape(res_item, [-1, self.topK])
C
chengmo 已提交
431
        self._infer_results["item"] = self.res_item_re
C
chengmo 已提交
432 433 434 435 436 437 438 439 440 441 442

    def input_fc_infer(self, input_emb):
        """
        输入侧预测组网第一部分,将input转换为node同维度
        """
        # 组网与训练时保持一致
        input_fc_out = fluid.layers.fc(
            input=input_emb,
            size=self.node_emb_size,
            act=None,
            param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
T
tangwei 已提交
443
            bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
C
chengmo 已提交
444 445 446 447 448 449 450 451 452 453 454 455 456 457
        return input_fc_out

    def layer_fc_infer(self, input_fc_out, layer_idx):
        """
        输入侧预测组网第二部分,将input映射到不同层次的向量空间
        """
        # 组网与训练保持一致,通过layer_idx指定不同层的FC
        input_layer_fc_out = fluid.layers.fc(
            input=input_fc_out,
            size=self.node_emb_size,
            act=self.act,
            param_attr=fluid.ParamAttr(
                name="trans.layer_fc.weight." + str(layer_idx)),
            bias_attr=fluid.ParamAttr(
T
tangwei 已提交
458
                name="trans.layer_fc.bias." + str(layer_idx)), )
C
chengmo 已提交
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477
        return input_layer_fc_out

    def classifier_layer_infer(self, input, node, layer_idx):
        # 为infer组网提供的简化版classifier,通过给定layer_idx调用不同层的分类器

        # 同样需要保持input与node的维度匹配
        input_expand = self._expand_layer(input, node, layer_idx)

        # 与训练网络相同的concat逻辑
        input_node_concat = fluid.layers.concat(
            input=[input_expand, node], axis=2)

        # 根据参数名param_attr调用不同的层的FC
        hidden_states_fc = fluid.layers.fc(
            input=input_node_concat,
            size=self.node_emb_size,
            num_flatten_dims=2,
            act=self.act,
            param_attr=fluid.ParamAttr(
T
for mat  
tangwei 已提交
478
                name="cls.concat_fc.weight." + str(layer_idx)),
T
tangwei 已提交
479 480
            bias_attr=fluid.ParamAttr(
                name="cls.concat_fc.bias." + str(layer_idx)))
C
chengmo 已提交
481
        return hidden_states_fc