diff --git a/dssm/README.md b/dssm/README.md
index d85b35ebd5b616423514243d9b32ec64a9fef198..b65c11df7d00f34b8378c92371858ca383827a1d 100644
--- a/dssm/README.md
+++ b/dssm/README.md
@@ -384,11 +384,13 @@ def _build_rank_model(self):
 ```
 usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
                 [-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
-                [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE --model_arch
-                MODEL_ARCH
+                [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
                 [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
                 [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
                 [--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
+                [--model_output_prefix MODEL_OUTPUT_PREFIX]
+                [-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
+                [-z NUM_BATCHES_TO_SAVE_MODEL]
 
 PaddlePaddle DSSM example
 
@@ -408,9 +410,9 @@ optional arguments:
   -p NUM_PASSES, --num_passes NUM_PASSES
                         number of passes to run(default:10)
   -y MODEL_TYPE, --model_type MODEL_TYPE
-                        model type, 0 for classification, 1 for pairwise rank
-                        (default: classification)
-  --model_arch MODEL_ARCH
+                        model type, 0 for classification, 1 for pairwise rank,
+                        2 for regression (default: classification)
+  -a MODEL_ARCH, --model_arch MODEL_ARCH
                         model architecture, 1 for CNN, 0 for FC, 2 for RNN
   --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
                         whether to share network parameters between source and
@@ -426,8 +428,73 @@ optional arguments:
   --use_gpu USE_GPU     whether to use GPU devices (default: False)
   -c CLASS_NUM, --class_num CLASS_NUM
                         number of categories for classification task.
+  --model_output_prefix MODEL_OUTPUT_PREFIX
+                        prefix of the path for model to store, (default: ./)
+  -g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
+                        number of batches to output train log, (default: 100)
+  -e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
+                        number of batches to test, (default: 200)
+  -z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
+                        number of batches to output model, (default: 400)
 ```
+
+The most important parameters are described below:
+
+- `train_data_path`: path of the training data
+- `test_data_path`: path of the test data; optional
+- `source_dic_path`: path of the source dictionary
+- `target_dic_path`: path of the target dictionary
+- `model_type`: type of loss the model is trained with: 0 for classification, 1 for pairwise rank, 2 for regression
+- `model_arch`: model architecture: 0 for FC, 1 for CNN, 2 for RNN
+- `dnn_dims`: dimensions of each DNN layer; the default `256,128,64,32` builds a 4-layer network with the listed widths
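+
+For example, a CNN-based classification model could be trained with a command along the following lines (the data, dictionary, and output paths below are placeholders for your own files):
+
+```
+python train.py \
+    -i path/to/train.txt \
+    -t path/to/test.txt \
+    -s path/to/source.dic \
+    --target_dic_path path/to/target.dic \
+    -y 0 -a 1 -c 2 -p 10 \
+    --model_output_prefix ./models/
+```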
+
+## Predicting with a trained model
+```
+usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
+                PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
+                [--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
+                [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
+                [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
+                [-c CLASS_NUM]
+
+PaddlePaddle DSSM infer
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --model_path MODEL_PATH
+                        path of model parameters file
+  -i DATA_PATH, --data_path DATA_PATH
+                        path of the dataset to infer
+  -o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
+                        path to output the prediction
+  -y MODEL_TYPE, --model_type MODEL_TYPE
+                        model type, 0 for classification, 1 for pairwise rank,
+                        2 for regression (default: classification)
+  -s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
+                        path of the source's word dic
+  --target_dic_path TARGET_DIC_PATH
+                        path of the target's word dic, if not set, the
+                        `source_dic_path` will be used
+  -a MODEL_ARCH, --model_arch MODEL_ARCH
+                        model architecture, 1 for CNN, 0 for FC, 2 for RNN
+  --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
+                        whether to share network parameters between source and
+                        target
+  --share_embed SHARE_EMBED
+                        whether to share word embedding between source and
+                        target
+  --dnn_dims DNN_DIMS   dimensions of dnn layers, default is '256,128,64,32',
+                        which means create a 4-layer dnn, dimension of each
+                        layer is 256, 128, 64 and 32
+  -c CLASS_NUM, --class_num CLASS_NUM
+                        number of categories for classification task.
+```
+
+Some of the parameters have the same meaning as in `train.py`; the most important ones for prediction are:
+
+- `data_path`: path of the data to predict on
+- `prediction_output_path`: path of the file the predictions are written to
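+
+A trained classification model such as the one above could then be used for prediction with a command similar to the following (the paths and the parameter file name are placeholders):
+
+```
+python infer.py \
+    --model_path path/to/model.tar \
+    -i path/to/infer.txt \
+    -o predictions.txt \
+    -s path/to/source.dic \
+    --target_dic_path path/to/target.dic \
+    -y 0 -a 1 -c 2
+```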
+
 ## References
 
 1. Huang P S, He X, Gao J, et al. Learning deep structured semantic models for web search using clickthrough data[C]//Proceedings of the 22nd ACM international conference on Conference on information & knowledge management. ACM, 2013: 2333-2338.
diff --git a/dssm/infer.py b/dssm/infer.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9192d7e19af10da9edba5353747e11ab6c786f0f 100644
--- a/dssm/infer.py
+++ b/dssm/infer.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+import itertools
+
+import reader
+import paddle.v2 as paddle
+from network_conf import DSSM
+from utils import logger, ModelType, ModelArch, load_dic
+
+parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
+parser.add_argument(
+    '--model_path',
+    type=str,
+    required=True,
+    help="path of model parameters file")
+parser.add_argument(
+    '-i',
+    '--data_path',
+    type=str,
+    required=True,
+    help="path of the dataset to infer")
+parser.add_argument(
+    '-o',
+    '--prediction_output_path',
+    type=str,
+    required=True,
+    help="path to output the prediction")
+parser.add_argument(
+    '-y',
+    '--model_type',
+    type=int,
+    required=True,
+    default=ModelType.CLASSIFICATION_MODE,
+    help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
+       ModelType.REGRESSION_MODE))
+parser.add_argument(
+    '-s',
+    '--source_dic_path',
+    type=str,
+    required=False,
+    help="path of the source's word dic")
+parser.add_argument(
+    '--target_dic_path',
+    type=str,
+    required=False,
+    help="path of the target's word dic, if not set, the `source_dic_path` will be used"
+)
+parser.add_argument(
+    '-a',
+    '--model_arch',
+    type=int,
+    required=True,
+    default=ModelArch.CNN_MODE,
+    help="model architecture, %d for CNN, %d for FC, %d for RNN" %
+    (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
+parser.add_argument(
+    '--share_network_between_source_target',
+    type=bool,
+    default=False,
+    help="whether to share network parameters between source and target")
+parser.add_argument(
+    '--share_embed',
+    type=bool,
+    default=False,
+    help="whether to share word embedding between source and target")
+parser.add_argument(
+    '--dnn_dims',
+    type=str,
+    default='256,128,64,32',
+    help="dimensions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, dimension of each layer is 256, 128, 64 and 32"
+)
+parser.add_argument(
+    '-c',
+    '--class_num',
+    type=int,
+    default=0,
+    help="number of categories for classification task.")
+
+args = parser.parse_args()
+args.model_type = ModelType(args.model_type)
+args.model_arch = ModelArch(args.model_arch)
+if args.model_type.is_classification():
+    assert args.class_num > 1, "--class_num should be set in classification task."
+
+layer_dims = map(int, args.dnn_dims.split(','))
+args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path
+
+paddle.init(use_gpu=False, trainer_count=1)
+
+
+class Inferer(object):
+    def __init__(self, param_path):
+        logger.info("create DSSM model")
+
+        cost, prediction, label = DSSM(
+            dnn_dims=layer_dims,
+            vocab_sizes=[
+                len(load_dic(path))
+                for path in [args.source_dic_path, args.target_dic_path]
+            ],
+            model_type=args.model_type,
+            model_arch=args.model_arch,
+            share_semantic_generator=args.share_network_between_source_target,
+            class_num=args.class_num,
+            share_embed=args.share_embed, is_infer=True)()
+
+        # load parameter
+        logger.info("load model parameters from %s" % param_path)
+        self.parameters = paddle.parameters.Parameters.from_tar(
+            open(param_path, 'r'))
+        self.inferer = paddle.inference.Inference(
+            output_layer=prediction, parameters=self.parameters)
+
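+    # Read the records with reader.Dataset, run them through the network in
+    # batches of 1000, and write one prediction per input record to
+    # `prediction_output_path` (values within a line are space separated).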
+    def infer(self, data_path):
+        logger.info("infer data...")
+        dataset = reader.Dataset(
+            train_path=data_path,
+            test_path=None,
+            source_dic_path=args.source_dic_path,
+            target_dic_path=args.target_dic_path,
+            model_type=args.model_type, )
+        infer_reader = paddle.batch(dataset.infer, batch_size=1000)
+        logger.warning('write predictions to %s' % args.prediction_output_path)
+
+        output_f = open(args.prediction_output_path, 'w')
+
+        for id, batch in enumerate(infer_reader()):
+            res = self.inferer.infer(input=batch)
+            predictions = [' '.join(map(str, x)) for x in res]
+            assert len(batch) == len(
+                predictions), "predict error, %d inputs, but %d predictions" % (
+                    len(batch), len(predictions))
+            output_f.write('\n'.join(map(str, predictions)) + '\n')
+
+
+if __name__ == '__main__':
+    inferer = Inferer(args.model_path)
+    inferer.infer(args.data_path)
diff --git a/dssm/network_conf.py b/dssm/network_conf.py
index 916079825010b253e33a8edb230980d8bd613841..f0f2e1a209debe6bfd3c826048c0afc0b5f9dd66 100644
--- a/dssm/network_conf.py
+++ b/dssm/network_conf.py
@@ -11,7 +11,8 @@ class DSSM(object):
                  model_arch=ModelArch.create_cnn(),
                  share_semantic_generator=False,
                  class_num=None,
-                 share_embed=False):
+                 share_embed=False,
+                 is_infer=False):
         '''
         @dnn_dims: list of int
             dimentions of each layer in semantic vector generator.
@@ -40,6 +41,7 @@ class DSSM(object):
         self.model_type = ModelType(model_type)
         self.model_arch = ModelArch(model_arch)
         self.class_num = class_num
+        self.is_infer = is_infer
         logger.warning("build DSSM model with config of %s, %s" %
                        (self.model_type, self.model_arch))
         logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
@@ -68,9 +70,6 @@ class DSSM(object):
         self.model_type_creater = _model_type[str(self.model_type)]
 
     def __call__(self):
-        # if self.model_type.is_classification():
-        #     return self._build_classification_model()
-        # return self._build_rank_model()
         return self.model_type_creater()
 
     def create_embedding(self, input, prefix=''):
@@ -189,8 +188,9 @@
         right_target = paddle.layer.data(
             name='right_target_input',
             type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
-        label = paddle.layer.data(
-            name='label_input', type=paddle.data_type.integer_value(1))
+        if not self.is_infer:
+            label = paddle.layer.data(
+                name='label_input', type=paddle.data_type.integer_value(1))
 
         prefixs = '_ _ _'.split(
         ) if self.share_semantic_generator else 'source left right'.split()
@@ -212,12 +212,14 @@
         # cossim score of source and right target
         right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
 
-        # rank cost
-        cost = paddle.layer.rank_cost(left_score, right_score, label=label)
-        # prediction = left_score - right_score
-        # but this operator is not supported currently.
-        # so AUC will not used.
-        return cost, None, None
+        if not self.is_infer:
+            # rank cost
+            cost = paddle.layer.rank_cost(left_score, right_score, label=label)
+            # prediction = left_score - right_score
+            # but this operator is not supported currently.
+            # so AUC will not be used.
+            return cost, None, label
+        return None, [left_score, right_score], None
 
     def _build_classification_or_regression_model(self, is_classification):
         '''
@@ -270,38 +272,7 @@
         else:
             prediction = paddle.layer.cos_sim(*semantics)
             cost = paddle.layer.mse_cost(prediction, label)
-        return cost, prediction, label
-
-
-class RankMetrics(object):
-    '''
-    A custom metrics to calculate AUC.
-
-    Paddle's rank model do not support auc evaluator directly,
-    to make it, infer all the outputs and use python to calculate
-    the metrics.
-    '''
-
-    def __init__(self, model_parameters, left_score_layer, right_score_layer,
-                 label):
-        '''
-        @model_parameters: dict
-            model's parameters
-        @left_score_layer: paddle.layer
-            left part's score
-        @right_score_laeyr: paddle.layer
-            right part's score
-        @label: paddle.data_layer
-            label input
-        '''
-        self.inferer = paddle.inference.Inference(
-            output_layer=[left_score_layer, right_score_layer],
-            parameters=model_parameters)
-
-    def test(self, input):
-        scores = []
-        for id, rcd in enumerate(input()):
-            # output [left_score, right_score, label]
-            res = self.inferer(input=input)
-            scores.append(res)
-        print scores
+        if not self.is_infer:
+            return cost, prediction, label
+        return None, prediction, label
diff --git a/dssm/reader.py b/dssm/reader.py
index 45cf7449eea631a0eceb9b4ff78c0b7f20cc9026..677072dae985980fab3da4dd09893721f84866fd 100644
--- a/dssm/reader.py
+++ b/dssm/reader.py
@@ -23,6 +23,7 @@ class Dataset(object):
         assert isinstance(model_type, ModelType)
 
         self.record_reader = _record_reader[model_type.mode]
+        self.is_infer = False
 
     def train(self):
         '''
@@ -37,11 +38,17 @@
         '''
         Load testset.
''' - logger.info("[reader] load testset from %s" % self.test_path) + # logger.info("[reader] load testset from %s" % self.test_path) with open(self.test_path) as f: for line_id, line in enumerate(f): yield self.record_reader(line) + def infer(self): + self.is_infer = True + with open(self.train_path) as f: + for line in f: + yield self.record_reader(line) + def _read_classification_record(self, line): ''' data format: @@ -56,8 +63,10 @@ class Dataset(object): " [TAB] [TAB]