提交 4012e62c 编写于 作者: S Superjom

init DSSM enhancement

上级 e88101bb
...@@ -384,11 +384,13 @@ def _build_rank_model(self): ...@@ -384,11 +384,13 @@ def _build_rank_model(self):
``` ```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH] usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH] [-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE --model_arch [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET] [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS] [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM] [--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example PaddlePaddle DSSM example
...@@ -408,9 +410,9 @@ optional arguments: ...@@ -408,9 +410,9 @@ optional arguments:
-p NUM_PASSES, --num_passes NUM_PASSES -p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10) number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE -y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank model type, 0 for classification, 1 for pairwise rank,
(default: classification) 2 for regression (default: classification)
--model_arch MODEL_ARCH -a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and whether to share network parameters between source and
...@@ -426,6 +428,56 @@ optional arguments: ...@@ -426,6 +428,56 @@ optional arguments:
--use_gpu USE_GPU whether to use GPU devices (default: False) --use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM -c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task. number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
## 用训练好的模型预测
```python
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
 --dnn_dims DNN_DIMS dimensions of dnn layers, default is '256,128,64,32',
 which means create a 4-layer dnn, dimension of each
 layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
``` ```
## 参考文献 ## 参考文献
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import itertools
import reader
import paddle.v2 as paddle
from network_conf import DSSM
from utils import logger, ModelType, ModelArch, load_dic
# ---------------------------------------------------------------------------
# Command-line interface for the DSSM inference script.
# The parsed `args` namespace is read by the module-level `Inferer` class.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
parser.add_argument(
    '--model_path',
    type=str,
    required=True,
    help="path of model parameters file")
parser.add_argument(
    '-i',
    '--data_path',
    type=str,
    required=True,
    help="path of the dataset to infer")
parser.add_argument(
    '-o',
    '--prediction_output_path',
    type=str,
    required=True,
    help="path to output the prediction")
# NOTE(review): `default` is redundant here since the option is required.
parser.add_argument(
    '-y',
    '--model_type',
    type=int,
    required=True,
    default=ModelType.CLASSIFICATION_MODE,
    help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
       ModelType.REGRESSION_MODE))
parser.add_argument(
    '-s',
    '--source_dic_path',
    type=str,
    required=False,
    help="path of the source's word dic")
parser.add_argument(
    '--target_dic_path',
    type=str,
    required=False,
    help="path of the target's word dic, if not set, the `source_dic_path` will be used"
)
parser.add_argument(
    '-a',
    '--model_arch',
    type=int,
    required=True,
    default=ModelArch.CNN_MODE,
    help="model architecture, %d for CNN, %d for FC, %d for RNN" %
    (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
# NOTE(review): argparse `type=bool` does not parse "False" to False — any
# non-empty string is truthy.  Only the defaults behave as documented.
parser.add_argument(
    '--share_network_between_source_target',
    type=bool,
    default=False,
    help="whether to share network parameters between source and target")
parser.add_argument(
    '--share_embed',
    type=bool,
    default=False,
    help="whether to share word embedding between source and target")
parser.add_argument(
    '--dnn_dims',
    type=str,
    default='256,128,64,32',
    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32"
)
parser.add_argument(
    '-c',
    '--class_num',
    type=int,
    default=0,
    help="number of categories for classification task.")

args = parser.parse_args()
# Wrap the raw integer flags in their helper types so downstream code can use
# predicates such as `is_classification()`.
args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification():
    assert args.class_num > 1, "--class_num should be set in classification task."

# Comma-separated layer sizes -> list of ints (Python 2 `map` returns a list).
layer_dims = map(int, args.dnn_dims.split(','))
# Fall back to the source dictionary when no target dictionary is given.
args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path

# CPU-only, single-trainer inference.
paddle.init(use_gpu=False, trainer_count=1)
class Inferer(object):
    '''
    Load a trained DSSM model and run batch inference over a dataset.

    The network topology is rebuilt from the module-level `args`
    (model type/arch, `layer_dims`, vocabulary sizes) and the trained
    parameters are restored from the tar archive at `param_path`.
    '''

    def __init__(self, param_path):
        '''
        @param_path: str
            path of the tar file holding the trained model parameters
        '''
        logger.info("create DSSM model")
        # Rebuild the same topology used at training time; only the
        # prediction output layer is needed for inference.
        cost, prediction, label = DSSM(
            dnn_dims=layer_dims,
            vocab_sizes=[
                len(load_dic(path))
                for path in [args.source_dic_path, args.target_dic_path]
            ],
            model_type=args.model_type,
            model_arch=args.model_arch,
            share_semantic_generator=args.share_network_between_source_target,
            class_num=args.class_num,
            share_embed=args.share_embed)()

        # Load parameters.  The tar must be opened in binary mode, and the
        # handle must be closed (the original leaked a text-mode handle).
        logger.info("load model parameters from %s" % param_path)
        with open(param_path, 'rb') as f:
            self.parameters = paddle.parameters.Parameters.from_tar(f)
        self.inferer = paddle.inference.Inference(
            output_layer=prediction, parameters=self.parameters)

    def infer(self, data_path):
        '''
        Run inference over `data_path` and write one prediction per line to
        `args.prediction_output_path`.

        @data_path: str
            path of the dataset to infer
        '''
        logger.info("infer data...")
        dataset = reader.Dataset(
            train_path=data_path,
            test_path=None,
            source_dic_path=args.source_dic_path,
            target_dic_path=args.target_dic_path,
            model_type=args.model_type, )
        infer_reader = paddle.batch(dataset.infer, batch_size=1000)
        logger.warning('write predictions to %s' % args.prediction_output_path)
        # Context manager guarantees the output file is flushed and closed
        # even if inference raises (the original never closed it).
        with open(args.prediction_output_path, 'w') as output_f:
            for batch_id, batch in enumerate(infer_reader()):
                res = self.inferer.infer(input=batch)
                # Each record's scores are joined into one space-separated line.
                predictions = [' '.join(map(str, x)) for x in res]
                assert len(batch) == len(predictions), (
                    "predict error, %d inputs, but %d predictions" %
                    (len(batch), len(predictions)))
                # `predictions` are already strings; no extra str() pass needed.
                output_f.write('\n'.join(predictions) + '\n')
if __name__ == '__main__':
    # Build the inferer from the trained parameters, then predict over the
    # requested dataset.
    Inferer(args.model_path).infer(args.data_path)
...@@ -11,7 +11,8 @@ class DSSM(object): ...@@ -11,7 +11,8 @@ class DSSM(object):
model_arch=ModelArch.create_cnn(), model_arch=ModelArch.create_cnn(),
share_semantic_generator=False, share_semantic_generator=False,
class_num=None, class_num=None,
share_embed=False): share_embed=False,
is_infer=False):
''' '''
@dnn_dims: list of int @dnn_dims: list of int
dimentions of each layer in semantic vector generator. dimentions of each layer in semantic vector generator.
...@@ -40,6 +41,7 @@ class DSSM(object): ...@@ -40,6 +41,7 @@ class DSSM(object):
self.model_type = ModelType(model_type) self.model_type = ModelType(model_type)
self.model_arch = ModelArch(model_arch) self.model_arch = ModelArch(model_arch)
self.class_num = class_num self.class_num = class_num
self.is_infer = is_infer
logger.warning("build DSSM model with config of %s, %s" % logger.warning("build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch)) (self.model_type, self.model_arch))
logger.info("vocabulary sizes: %s" % str(self.vocab_sizes)) logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
...@@ -68,9 +70,6 @@ class DSSM(object): ...@@ -68,9 +70,6 @@ class DSSM(object):
self.model_type_creater = _model_type[str(self.model_type)] self.model_type_creater = _model_type[str(self.model_type)]
def __call__(self): def __call__(self):
# if self.model_type.is_classification():
# return self._build_classification_model()
# return self._build_rank_model()
return self.model_type_creater() return self.model_type_creater()
def create_embedding(self, input, prefix=''): def create_embedding(self, input, prefix=''):
...@@ -189,6 +188,7 @@ class DSSM(object): ...@@ -189,6 +188,7 @@ class DSSM(object):
right_target = paddle.layer.data( right_target = paddle.layer.data(
name='right_target_input', name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
if not self.is_infer:
label = paddle.layer.data( label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1)) name='label_input', type=paddle.data_type.integer_value(1))
...@@ -212,12 +212,14 @@ class DSSM(object): ...@@ -212,12 +212,14 @@ class DSSM(object):
# cossim score of source and right target # cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2]) right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
if not self.is_infer:
# rank cost # rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label) cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score # prediction = left_score - right_score
# but this operator is not supported currently. # but this operator is not supported currently.
# so AUC will not used. # so AUC will not used.
return cost, None, None return cost, None, label
return None, [left_score, right_score], label
def _build_classification_or_regression_model(self, is_classification): def _build_classification_or_regression_model(self, is_classification):
''' '''
...@@ -270,38 +272,7 @@ class DSSM(object): ...@@ -270,38 +272,7 @@ class DSSM(object):
else: else:
prediction = paddle.layer.cos_sim(*semantics) prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.mse_cost(prediction, label) cost = paddle.layer.mse_cost(prediction, label)
return cost, prediction, label
class RankMetrics(object): if not self.is_infer:
''' return cost, prediction, label
A custom metrics to calculate AUC. return None, prediction, label
Paddle's rank model do not support auc evaluator directly,
to make it, infer all the outputs and use python to calculate
the metrics.
'''
def __init__(self, model_parameters, left_score_layer, right_score_layer,
label):
'''
@model_parameters: dict
model's parameters
@left_score_layer: paddle.layer
left part's score
@right_score_laeyr: paddle.layer
right part's score
@label: paddle.data_layer
label input
'''
self.inferer = paddle.inference.Inference(
output_layer=[left_score_layer, right_score_layer],
parameters=model_parameters)
def test(self, input):
scores = []
for id, rcd in enumerate(input()):
# output [left_score, right_score, label]
res = self.inferer(input=input)
scores.append(res)
print scores
...@@ -23,6 +23,7 @@ class Dataset(object): ...@@ -23,6 +23,7 @@ class Dataset(object):
assert isinstance(model_type, ModelType) assert isinstance(model_type, ModelType)
self.record_reader = _record_reader[model_type.mode] self.record_reader = _record_reader[model_type.mode]
self.is_infer = False
def train(self): def train(self):
''' '''
...@@ -37,11 +38,17 @@ class Dataset(object): ...@@ -37,11 +38,17 @@ class Dataset(object):
''' '''
Load testset. Load testset.
''' '''
logger.info("[reader] load testset from %s" % self.test_path) # logger.info("[reader] load testset from %s" % self.test_path)
with open(self.test_path) as f: with open(self.test_path) as f:
for line_id, line in enumerate(f): for line_id, line in enumerate(f):
yield self.record_reader(line) yield self.record_reader(line)
def infer(self):
self.is_infer = True
with open(self.train_path) as f:
for line in f:
yield self.record_reader(line)
def _read_classification_record(self, line): def _read_classification_record(self, line):
''' '''
data format: data format:
...@@ -56,8 +63,10 @@ class Dataset(object): ...@@ -56,8 +63,10 @@ class Dataset(object):
"<source words> [TAB] <target words> [TAB] <label>'" "<source words> [TAB] <target words> [TAB] <label>'"
source = sent2ids(fs[0], self.source_dic) source = sent2ids(fs[0], self.source_dic)
target = sent2ids(fs[1], self.target_dic) target = sent2ids(fs[1], self.target_dic)
if not self.is_infer:
label = int(fs[2]) label = int(fs[2])
return (source, target, label, ) return (source, target, label, )
return source, target
def _read_regression_record(self, line): def _read_regression_record(self, line):
''' '''
...@@ -73,8 +82,10 @@ class Dataset(object): ...@@ -73,8 +82,10 @@ class Dataset(object):
"<source words> [TAB] <target words> [TAB] <label>'" "<source words> [TAB] <target words> [TAB] <label>'"
source = sent2ids(fs[0], self.source_dic) source = sent2ids(fs[0], self.source_dic)
target = sent2ids(fs[1], self.target_dic) target = sent2ids(fs[1], self.target_dic)
if not self.is_infer:
label = float(fs[2]) label = float(fs[2])
return (source, target, [label], ) return (source, target, [label], )
return source, target
def _read_rank_record(self, line): def _read_rank_record(self, line):
''' '''
...@@ -89,9 +100,10 @@ class Dataset(object): ...@@ -89,9 +100,10 @@ class Dataset(object):
source = sent2ids(fs[0], self.source_dic) source = sent2ids(fs[0], self.source_dic)
left_target = sent2ids(fs[1], self.target_dic) left_target = sent2ids(fs[1], self.target_dic)
right_target = sent2ids(fs[2], self.target_dic) right_target = sent2ids(fs[2], self.target_dic)
if not self.is_infer:
label = int(fs[3]) label = int(fs[3])
return (source, left_target, right_target, label) return (source, left_target, right_target, label)
return source, left_target, right_target
if __name__ == '__main__': if __name__ == '__main__':
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import argparse import argparse
import gzip
import paddle.v2 as paddle import paddle.v2 as paddle
from network_conf import DSSM from network_conf import DSSM
import reader import reader
from utils import TaskType, load_dic, logger, ModelType, ModelArch from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example") parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")
...@@ -56,6 +55,7 @@ parser.add_argument( ...@@ -56,6 +55,7 @@ parser.add_argument(
% (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE, % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE)) ModelType.REGRESSION_MODE))
parser.add_argument( parser.add_argument(
'-a',
'--model_arch', '--model_arch',
type=int, type=int,
required=True, required=True,
...@@ -91,6 +91,29 @@ parser.add_argument( ...@@ -91,6 +91,29 @@ parser.add_argument(
type=int, type=int,
default=0, default=0,
help="number of categories for classification task.") help="number of categories for classification task.")
parser.add_argument(
'--model_output_prefix',
type=str,
default="./",
help="prefix of the path for model to store, (default: ./)")
parser.add_argument(
'-g',
'--num_batches_to_log',
type=int,
default=100,
help="number of batches to output train log, (default: 100)")
parser.add_argument(
'-e',
'--num_batches_to_test',
type=int,
default=200,
help="number of batches to test, (default: 200)")
parser.add_argument(
'-z',
'--num_batches_to_save_model',
type=int,
default=400,
help="number of batches to output model, (default: 400)")
# arguments check. # arguments check.
args = parser.parse_args() args = parser.parse_args()
...@@ -100,10 +123,7 @@ if args.model_type.is_classification(): ...@@ -100,10 +123,7 @@ if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task." assert args.class_num > 1, "--class_num should be set in classification task."
layer_dims = [int(i) for i in args.dnn_dims.split(',')] layer_dims = [int(i) for i in args.dnn_dims.split(',')]
target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path
model_save_name_prefix = "dssm_pass_%s_%s" % (args.model_type,
args.model_arch, )
def train(train_data_path=None, def train(train_data_path=None,
...@@ -174,15 +194,10 @@ def train(train_data_path=None, ...@@ -174,15 +194,10 @@ def train(train_data_path=None,
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, cost=cost,
extra_layers=None, extra_layers=paddle.evaluator.auc(input=prediction, label=label)
if not model_type.is_rank() else None,
parameters=parameters, parameters=parameters,
update_equation=adam_optimizer) update_equation=adam_optimizer)
# trainer = paddle.trainer.SGD(
# cost=cost,
# extra_layers=paddle.evaluator.auc(input=prediction, label=label)
# if prediction and model_type.is_classification() else None,
# parameters=parameters,
# update_equation=adam_optimizer)
feeding = {} feeding = {}
if model_type.is_classification() or model_type.is_regression(): if model_type.is_classification() or model_type.is_regression():
...@@ -200,20 +215,28 @@ def train(train_data_path=None, ...@@ -200,20 +215,28 @@ def train(train_data_path=None,
Define batch handler Define batch handler
''' '''
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0: # output train log
logger.info("Pass %d, Batch %d, Cost %f, %s\n" % ( if event.batch_id % args.num_batches_to_log == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)) event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass): # test model
if event.batch_id > 0 and event.batch_id % args.num_batches_to_test == 0:
if test_reader is not None: if test_reader is not None:
if model_type.is_classification(): if model_type.is_classification():
result = trainer.test(reader=test_reader, feeding=feeding) result = trainer.test(
logger.info("Test at Pass %d, %s \n" % (event.pass_id, reader=test_reader, feeding=feeding)
logger.info("Test at Pass %d, %s" % (event.pass_id,
result.metrics)) result.metrics))
else: else:
result = None result = None
with gzip.open("dssm_%s_pass_%05d.tar.gz" % # save model
(model_save_name_prefix, event.pass_id), "w") as f: if event.batch_id > 0 and event.batch_id % args.num_batches_to_save_model == 0:
model_desc = "{type}_{arch}".format(
type=str(args.model_type), arch=str(args.model_arch))
with open("%sdssm_%s_pass_%05d.tar" %
(args.model_output_prefix, model_desc,
event.pass_id), "w") as f:
parameters.to_tar(f) parameters.to_tar(f)
trainer.train( trainer.train(
...@@ -226,6 +249,7 @@ def train(train_data_path=None, ...@@ -226,6 +249,7 @@ def train(train_data_path=None,
if __name__ == '__main__': if __name__ == '__main__':
display_args(args)
train( train(
train_data_path=args.train_data_path, train_data_path=args.train_data_path,
test_data_path=args.test_data_path, test_data_path=args.test_data_path,
......
import logging
import paddle import paddle
UNK = 0 UNK = 0
...@@ -126,6 +127,12 @@ def load_dic(path): ...@@ -126,6 +127,12 @@ def load_dic(path):
return dic return dic
def display_args(args):
    '''
    Log every parsed command-line argument, one per line, sorted by name.

    @args: argparse.Namespace
        the parsed arguments to display
    '''
    logger.info("arguments passed by command line:")
    # `vars(args)` exposes the namespace as a dict; sorting the items gives a
    # deterministic log order.  (The original wrapped the items in a redundant
    # identity generator expression.)
    for k, v in sorted(vars(args).items()):
        logger.info("{}:\t{}".format(k, v))
if __name__ == '__main__': if __name__ == '__main__':
t = TaskType(1) t = TaskType(1)
t = TaskType.create_train() t = TaskType.create_train()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册