diff --git a/dssm/infer.py b/dssm/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..be3e84a9131904dea2224c02bf9df2f8d7e4e455
--- /dev/null
+++ b/dssm/network_conf.py
@@ -0,0 +1,217 @@
+from paddle import v2 as paddle
+from paddle.v2.attr import ParamAttr
+from utils import TaskType, logger
+
+
+class DSSM(object):
+    def __init__(self,
+                 dnn_dims=[],
+                 vocab_sizes=[],
+                 task_type=TaskType.CLASSFICATION,
+                 share_semantic_generator=False,
+                 class_num=None,
+                 share_embed=False):
+        '''
+        @dnn_dims: list of int
+            dimensions of each layer in the semantic vector generator.
+        @vocab_sizes: 2-element tuple
+            vocabulary sizes of the left and right inputs.
+        @task_type: str
+            type of task, should be 'rank', 'regression' or 'classification'.
+        @share_semantic_generator: bool
+            whether to share the semantic vector generator between the left
+            and right inputs.
+        @share_embed: bool
+            whether to share the embeddings between the left and right inputs.
+        @class_num: int
+            number of categories.
+        '''
+        assert len(vocab_sizes) == 2, (
+            "vocab_sizes specifies the vocabulary sizes of the left and "
+            "right inputs, so its length should be 2.")
+
+        self.dnn_dims = dnn_dims
+        self.vocab_sizes = vocab_sizes
+        self.share_semantic_generator = share_semantic_generator
+        self.share_embed = share_embed
+        self.task_type = task_type
+        self.class_num = class_num
+
+        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
+
+    def __call__(self):
+        if self.task_type == TaskType.CLASSFICATION:
+            return self._build_classification_model()
+        return self._build_rank_model()
+
+    def create_embedding(self, input, prefix=''):
+        '''
+        Create an embedding table whose name has a `prefix`.
+        '''
+        emb = paddle.layer.embedding(
+            input=input,
+            size=self.dnn_dims[0],
+            param_attr=ParamAttr(name='%s_emb.w' % prefix))
+        return emb
+
+    def create_fc(self, emb, prefix=''):
+        '''
+        A multi-layer fully connected neural network.
+
+        @emb: paddle.layer
+            output of the embedding layer
+        @prefix: str
+            prefix of the layers' names, used to share parameters between
+            more than one `fc` part.
+        '''
+        _input_layer = paddle.layer.pooling(
+            input=emb, pooling_type=paddle.pooling.Max())
+        for id, dim in enumerate(self.dnn_dims[1:]):
+            name = "%s_fc_%d_%d" % (prefix, id, dim)
+            fc = paddle.layer.fc(
+                name=name,
+                input=_input_layer,
+                size=dim,
+                act=paddle.activation.Relu(),
+                param_attr=ParamAttr(name='%s.w' % name),
+                bias_attr=None)
+            _input_layer = fc
+        return _input_layer
+
+    def create_cnn(self, emb, prefix=''):
+        '''
+        A multi-layer CNN.
+
+        @emb: paddle.layer
+            output of the embedding layer
+        @prefix: str
+            prefix of the layers' names, used to share parameters between
+            more than one `cnn` part.
+        '''
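+        # Sketch of a CNN semantic generator, assuming the
+        # paddle.networks.sequence_conv_pool helper (convolution over a word
+        # window followed by max pooling). The window widths (3 and 4) and
+        # the use of dnn_dims[1] as the hidden size are illustrative choices,
+        # not a fixed design.
+        def create_conv(context_len, hidden_size):
+            key = "%s_conv_%d_%d" % (prefix, context_len, hidden_size)
+            return paddle.networks.sequence_conv_pool(
+                input=emb,
+                context_len=context_len,
+                hidden_size=hidden_size,
+                # name the parameters so `prefix` can share them across parts.
+                context_proj_param_attr=ParamAttr(name=key + '_proj.w'),
+                fc_param_attr=ParamAttr(name=key + '_fc.w'),
+                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
+                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+
+        conv_3 = create_conv(3, self.dnn_dims[1])
+        conv_4 = create_conv(4, self.dnn_dims[1])
+        return paddle.layer.concat(input=[conv_3, conv_4])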
+
+    def _build_classification_model(self):
+        '''
+        Build a classification model; the cost is returned.
+
+        The classification task has three inputs:
+            - source sentence
+            - target sentence
+            - classification label
+        '''
+        # prepare inputs.
+        assert self.class_num
+
+        source = paddle.layer.data(
+            name='source_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+        target = paddle.layer.data(
+            name='target_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+        label = paddle.layer.data(
+            name='label_input',
+            type=paddle.data_type.integer_value(self.class_num))
+
+        prefixs = '_ _'.split() if self.share_semantic_generator \
+            else 'left right'.split()
+        embed_prefixs = '_ _'.split() if self.share_embed \
+            else 'left right'.split()
+
+        word_vecs = []
+        for id, input in enumerate([source, target]):
+            x = self.create_embedding(input, prefix=embed_prefixs[id])
+            word_vecs.append(x)
+
+        semantics = []
+        for id, input in enumerate(word_vecs):
+            x = self.create_fc(input, prefix=prefixs[id])
+            semantics.append(x)
+
+        concated_vector = paddle.layer.concat(input=semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(input=prediction, label=label)
+        return cost, prediction, label
+
+    def _build_rank_model(self):
+        '''
+        Build a pairwise rank model; the cost is returned.
+
+        The pairwise rank model has four inputs:
+            - source sentence
+            - left_target sentence
+            - right_target sentence
+            - label, 1 if left_target should be sorted in front of
+              right_target, otherwise 0.
+        '''
+        source = paddle.layer.data(
+            name='source_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+        left_target = paddle.layer.data(
+            name='left_target_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+        right_target = paddle.layer.data(
+            name='right_target_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+        label = paddle.layer.data(
+            name='label_input', type=paddle.data_type.integer_value(1))
+
+        prefixs = '_ _ _'.split() if self.share_semantic_generator \
+            else 'source left right'.split()
+        embed_prefixs = '_ _ _'.split() if self.share_embed \
+            else 'source target target'.split()
+
+        word_vecs = []
+        for id, input in enumerate([source, left_target, right_target]):
+            x = self.create_embedding(input, prefix=embed_prefixs[id])
+            word_vecs.append(x)
+
+        semantics = []
+        for id, input in enumerate(word_vecs):
+            x = self.create_fc(input, prefix=prefixs[id])
+            semantics.append(x)
+
+        # cosine similarity score of source and left_target.
+        left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
+        # cosine similarity score of source and right_target.
+        right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
+
+        # rank cost.
+        cost = paddle.layer.rank_cost(left_score, right_score, label=label)
+        # `prediction = left_score - right_score` would enable AUC evaluation,
+        # but the subtraction operator is not supported yet, so no prediction
+        # layer (and hence no AUC) is returned.
+        return cost, None, None
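+
+
+# A hypothetical usage sketch (the names and hyper-parameters below are
+# illustrative; the real training script is not part of this file). For the
+# classification task, the network would be built and wired into a v2
+# trainer roughly like this:
+#
+#   paddle.init(use_gpu=False, trainer_count=1)
+#   cost, prediction, label = DSSM(
+#       dnn_dims=[256, 128, 64, 32],
+#       vocab_sizes=[len(source_dic), len(target_dic)],
+#       task_type=TaskType.CLASSFICATION,
+#       class_num=2)()
+#   parameters = paddle.parameters.create(cost)
+#   trainer = paddle.trainer.SGD(
+#       cost=cost,
+#       parameters=parameters,
+#       update_equation=paddle.optimizer.Adam())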
+
+
+class RankMetrics(object):
+    '''
+    A custom metric that calculates AUC.
+
+    Paddle's rank model does not support the AUC evaluator directly; to get
+    AUC, run inference over all the records and compute the metric in Python.
+    '''
+
+    def __init__(self, model_parameters, left_score_layer, right_score_layer,
+                 label):
+        '''
+        @model_parameters: dict
+            model's parameters
+        @left_score_layer: paddle.layer
+            left part's score
+        @right_score_layer: paddle.layer
+            right part's score
+        @label: paddle.data_layer
+            label input
+        '''
+        self.inferer = paddle.inference.Inference(
+            output_layer=[left_score_layer, right_score_layer],
+            parameters=model_parameters)
+
+    def test(self, input):
+        scores = []
+        for id, rcd in enumerate(input()):
+            # output [left_score, right_score, label]; infer one record at a
+            # time rather than passing the whole reader in again.
+            res = self.inferer(input=[rcd])
+            scores.append(res)
+        print scores
diff --git a/dssm/reader.py b/dssm/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cea53726f39ecaf292890461c5a6ed0216e53026
--- /dev/null
+++ b/dssm/reader.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from utils import UNK, TaskType, load_dic, sent2ids, logger
+
+
+class Dataset(object):
+    def __init__(self,
+                 train_path,
+                 test_path,
+                 source_dic_path,
+                 target_dic_path,
+                 task_type=TaskType.RANK):
+        self.train_path = train_path
+        self.test_path = test_path
+        self.source_dic_path = source_dic_path
+        self.target_dic_path = target_dic_path
+        self.task_type = task_type
+
+        self.source_dic = load_dic(self.source_dic_path)
+        self.target_dic = load_dic(self.target_dic_path)
+
+        self.record_reader = self._read_classification_record \
+            if self.task_type == TaskType.CLASSFICATION \
+            else self._read_rank_record
+
+    def train(self):
+        logger.info("[reader] load trainset from %s" % self.train_path)
+        with open(self.train_path) as f:
+            for line_id, line in enumerate(f):
+                yield self.record_reader(line)
+
+    def test(self):
+        logger.info("[reader] load testset from %s" % self.test_path)
+        with open(self.test_path) as f:
+            for line_id, line in enumerate(f):
+                yield self.record_reader(line)
+
+    def _read_classification_record(self, line):
+        '''
+        data format:
+            <source words> [TAB] <target words> [TAB] <label>
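+
+        e.g. a hypothetical (made-up) record, two tokenized sentences and a
+        0/1 label:
+            mobile phone case [TAB] cover for mobile phone [TAB] 1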