Commit df91e730 authored by Superjom

init dssm

Parent 0a27ca9d
from paddle import v2 as paddle
from paddle.v2.attr import ParamAttr
from utils import TaskType, logger
class DSSM(object):
def __init__(self,
dnn_dims=[],
vocab_sizes=[],
task_type=TaskType.CLASSFICATION,
share_semantic_generator=False,
class_num=None,
share_embed=False):
        '''
        @dnn_dims: list of int
            dimensions of each layer in the semantic vector generator.
        @vocab_sizes: 2-d tuple
            vocabulary sizes of the left and right inputs.
        @task_type: TaskType
            type of task, either TaskType.CLASSFICATION or TaskType.RANK.
        @share_semantic_generator: bool
            whether to share the semantic vector generator between left and right.
        @share_embed: bool
            whether to share the embeddings between left and right.
        @class_num: int
            number of categories.
        '''
        assert len(
            vocab_sizes
        ) == 2, "vocab_sizes specifies the vocabulary sizes of the left and right inputs, so its length should be 2."
self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes
self.share_semantic_generator = share_semantic_generator
self.share_embed = share_embed
self.task_type = task_type
self.class_num = class_num
logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
def __call__(self):
if self.task_type == TaskType.CLASSFICATION:
return self._build_classification_model()
return self._build_rank_model()
def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
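        # NOTE: `self.dnn_dims[0]` doubles as the embedding dimension; passing
        # the same `prefix` for both sides reuses one embedding matrix
        # (the parameter named '%s_emb.w').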
emb = paddle.layer.embedding(
input=input,
size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix))
return emb
def create_fc(self, emb, prefix=''):
        '''
        A multi-layer fully-connected neural network.
        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of the layers' names, used to share parameters between
            more than one `fc` part.
        '''
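        # Max-pool over the word embeddings to get a fixed-size sentence
        # vector before the fully-connected stack.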
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim)
fc = paddle.layer.fc(
name=name,
input=_input_layer,
size=dim,
act=paddle.activation.Relu(),
param_attr=ParamAttr(name='%s.w' % name),
bias_attr=None, )
_input_layer = fc
return _input_layer
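    # For example, with dnn_dims = [256, 128, 64, 32], each semantic generator
    # is: embedding(256) -> max-pooling -> fc(128) -> fc(64) -> fc(32).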
def create_cnn(self, emb, prefix=''):
        '''
        A multi-layer CNN semantic vector generator (not implemented yet).
        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            prefix of the layers' names, used to share parameters between
            more than one `cnn` part.
        '''
        raise NotImplementedError("create_cnn is not implemented yet")
def _build_classification_model(self):
        '''
        Build a classification model; the cost is returned.
        A classification model has 3 inputs:
            - source sentence
            - target sentence
            - classification label
        '''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num))
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'left right'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'left right'.split()
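        # Identical prefixes ('_') make both sides share one set of parameters;
        # distinct prefixes ('left'/'right') keep two separate sets.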
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.create_fc(input, prefix=prefixs[id])
semantics.append(x)
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
return cost, prediction, label
def _build_rank_model(self):
        '''
        Build a pairwise rank model; the cost is returned.
        A pairwise rank model has 4 inputs:
            - source sentence
            - left_target sentence
            - right_target sentence
            - label, 1 if left_target should be ranked higher than right_target, otherwise 0.
        '''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
        prefixs = '_ _ _'.split(
        ) if self.share_semantic_generator else 'source left right'.split()
        # NOTE: three embedding prefixes are needed here, one per input.
        embed_prefixs = '_ _ _'.split(
        ) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.create_fc(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
        # rank cost
        cost = paddle.layer.rank_cost(left_score, right_score, label=label)
        # prediction = left_score - right_score
        # but this operator is not supported currently,
        # so AUC will not be used.
        return cost, None, None
class RankMetrics(object):
    '''
    A custom metric to calculate AUC.
    Paddle's rank model does not support the AUC evaluator directly;
    to work around that, run inference to collect all the outputs and
    calculate the metric in Python.
    '''
def __init__(self, model_parameters, left_score_layer, right_score_layer,
label):
        '''
        @model_parameters: dict
            the model's parameters
        @left_score_layer: paddle.layer
            the left part's score
        @right_score_layer: paddle.layer
            the right part's score
        @label: paddle.data_layer
            the label input
        '''
self.inferer = paddle.inference.Inference(
output_layer=[left_score_layer, right_score_layer],
parameters=model_parameters)
    def test(self, input):
        scores = []
        for id, rcd in enumerate(input()):
            # each output is [left_score, right_score, label]
            res = self.inferer.infer(input=[rcd])
            scores.append(res)
        print scores
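
# A minimal sketch of turning the collected scores into an AUC value; the
# (score, label) pair format and the `compute_auc` helper are assumptions
# for illustration, not part of Paddle's API.
def compute_auc(score_label_pairs):
    '''
    @score_label_pairs: list of (score, label), where label is 0 or 1.
    Returns the AUC as the Wilcoxon-Mann-Whitney statistic.
    '''
    pos = [s for s, l in score_label_pairs if l == 1]
    neg = [s for s, l in score_label_pairs if l == 0]
    if not pos or not neg:
        return 0.0
    # count positive-negative pairs where the positive example outscores
    # the negative one; ties count as half
    num_hits = 0.0
    for p in pos:
        for n in neg:
            if p > n:
                num_hits += 1.0
            elif p == n:
                num_hits += 0.5
    return num_hits / (len(pos) * len(neg))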
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from utils import UNK, TaskType, load_dic, sent2ids, logger
class Dataset(object):
def __init__(self,
train_path,
test_path,
source_dic_path,
target_dic_path,
task_type=TaskType.RANK):
self.train_path = train_path
self.test_path = test_path
self.source_dic_path = source_dic_path
self.target_dic_path = target_dic_path
self.task_type = task_type
self.source_dic = load_dic(self.source_dic_path)
self.target_dic = load_dic(self.target_dic_path)
self.record_reader = self._read_classification_record \
if self.task_type == TaskType.CLASSFICATION \
else self._read_rank_record
def train(self):
logger.info("[reader] load trainset from %s" % self.train_path)
with open(self.train_path) as f:
for line_id, line in enumerate(f):
yield self.record_reader(line)
def test(self):
logger.info("[reader] load testset from %s" % self.test_path)
with open(self.test_path) as f:
for line_id, line in enumerate(f):
yield self.record_reader(line)
def _read_classification_record(self, line):
        '''
        data format:
        <source words> [TAB] <target words> [TAB] <label>
        @line: str
            a string line which represents a record.
        '''
        fs = line.strip().split('\t')
        assert len(fs) == 3, "wrong format for classification\n" + \
            "the format should be " +\
            "<source words> [TAB] <target words> [TAB] <label>"
source = sent2ids(fs[0], self.source_dic)
target = sent2ids(fs[1], self.target_dic)
label = int(fs[2])
return (source, target, label, )
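    # An example classification line (fields are tab-separated):
    #   how are you<TAB>fine thank you<TAB>1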
def _read_rank_record(self, line):
'''
data format:
<source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>
'''
fs = line.strip().split('\t')
assert len(fs) == 4, "wrong format for rank\n" + \
"the format should be " +\
"<source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>"
source = sent2ids(fs[0], self.source_dic)
left_target = sent2ids(fs[1], self.target_dic)
right_target = sent2ids(fs[2], self.target_dic)
label = int(fs[3])
return (source, left_target, right_target, label)
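    # An example rank line (fields are tab-separated):
    #   some query<TAB>better candidate<TAB>worse candidate<TAB>1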
if __name__ == '__main__':
path = './data/classification/train.txt'
test_path = './data/classification/test.txt'
source_dic = './data/vocab.txt'
dataset = Dataset(path, test_path, source_dic, source_dic,
TaskType.CLASSFICATION)
for rcd in dataset.train():
print rcd
# for i in range(10):
# print i, dataset.train().next()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import distutils.util
import gzip
import paddle.v2 as paddle
from network_conf import DSSM
import reader
from utils import TaskType, load_dic, logger
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")
parser.add_argument(
'--train_data_path',
type=str,
required=False,
help="path of training dataset")
parser.add_argument(
'--test_data_path',
type=str,
required=False,
help="path of testing dataset")
parser.add_argument(
'--source_dic_path',
type=str,
required=False,
help="path of the source's word dic")
parser.add_argument(
    '--target_dic_path',
    type=str,
    required=False,
    help="path of the target's word dic; if not set, `source_dic_path` will be used"
)
parser.add_argument(
    '--batch_size',
    type=int,
    default=10,
    help="size of mini-batch (default: 10)")
parser.add_argument(
    '--num_passes',
    type=int,
    default=10,
    help="number of passes to run (default: 10)")
parser.add_argument(
'--task_type',
type=int,
default=TaskType.CLASSFICATION,
help="task type, 0 for classification, 1 for pairwise rank")
# NOTE: argparse's `type=bool` treats any non-empty string as True, so use
# distutils.util.strtobool to parse the boolean flags correctly.
parser.add_argument(
    '--share_network_between_source_target',
    type=distutils.util.strtobool,
    default=False,
    help="whether to share network parameters between source and target")
parser.add_argument(
    '--share_embed',
    type=distutils.util.strtobool,
    default=False,
    help="whether to share word embedding between source and target")
parser.add_argument(
    '--dnn_dims',
    type=str,
    default='256,128,64,32',
    help="dimensions of dnn layers, default is '256,128,64,32', which means "
    "a 4-layer dnn is created with layer dimensions 256, 128, 64 and 32")
parser.add_argument(
'--num_workers', type=int, default=1, help="num worker threads, default 1")
args = parser.parse_args()
layer_dims = [int(i) for i in args.dnn_dims.split(',')]
target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path
def train(train_data_path=None,
test_data_path=None,
source_dic_path=None,
target_dic_path=None,
task_type=TaskType.CLASSFICATION,
batch_size=10,
num_passes=10,
share_semantic_generator=False,
share_embed=False,
class_num=None,
num_workers=1):
'''
Train the DSSM.
'''
default_train_path = './data/rank/train.txt'
default_test_path = './data/rank/test.txt'
default_dic_path = './data/vocab.txt'
if task_type == TaskType.CLASSFICATION:
default_train_path = './data/classification/train.txt'
default_test_path = './data/classification/test.txt'
use_default_data = not train_data_path
if use_default_data:
train_data_path = default_train_path
test_data_path = default_test_path
source_dic_path = default_dic_path
target_dic_path = default_dic_path
dataset = reader.Dataset(
train_path=train_data_path,
test_path=test_data_path,
source_dic_path=source_dic_path,
target_dic_path=target_dic_path,
task_type=task_type, )
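    # Wrap the dataset generators: shuffle within a 1000-record buffer, then
    # group the records into mini-batches.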
train_reader = paddle.batch(
paddle.reader.shuffle(dataset.train, buf_size=1000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(dataset.test, buf_size=1000),
batch_size=batch_size)
paddle.init(use_gpu=False, trainer_count=num_workers)
cost, prediction, label = DSSM(
dnn_dims=layer_dims,
vocab_sizes=[
len(load_dic(path)) for path in [source_dic_path, target_dic_path]
],
task_type=task_type,
share_semantic_generator=share_semantic_generator,
class_num=class_num,
share_embed=share_embed)()
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
trainer = paddle.trainer.SGD(
cost=cost,
extra_layers=paddle.evaluator.auc(input=prediction, label=label)
if prediction else None,
parameters=parameters,
update_equation=adam_optimizer)
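    # `feeding` maps each data layer's name to the index of the corresponding
    # field in the tuples yielded by the reader.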
feeding = {}
if task_type == TaskType.CLASSFICATION:
feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
else:
feeding = {
'source_input': 0,
'left_target_input': 1,
'right_target_input': 2,
'label_input': 3
}
def _event_handler(event):
        '''
        Event handler: log the cost every 100 batches; at the end of each
        pass, evaluate on the test set and save the model parameters.
        '''
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
if task_type == TaskType.CLASSFICATION:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("Test at Pass %d, %s \n" % (event.pass_id,
result.metrics))
else:
result = None
with gzip.open("dssm_pass_%05d.tar.gz" % event.pass_id, "w") as f:
parameters.to_tar(f)
trainer.train(
reader=train_reader,
event_handler=_event_handler,
feeding=feeding,
num_passes=num_passes)
logger.info("Training has finished.")
if __name__ == '__main__':
    # NOTE: the parsed command-line arguments are not wired into train() yet
    # in this initial commit; run the pairwise rank task with default data.
    # For classification, call e.g. train(task_type=TaskType.CLASSFICATION, class_num=2).
    train(task_type=TaskType.RANK)
import logging
UNK = 0
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
class TaskType:
'''
type of DSSM's task.
'''
# pairwise rank.
RANK = 0
# classification.
CLASSFICATION = 1
def sent2ids(sent, vocab):
    '''
    Transform a sentence into a list of word ids.
    @sent: str
        a sentence.
    @vocab: dict
        a word dictionary mapping each word to its id; out-of-vocabulary
        words map to UNK.
    '''
return [vocab.get(w, UNK) for w in sent.split()]
def load_dic(path):
    '''
    Load a word dictionary.
    Dictionary file format: one word per line; the line number (0-based)
    is used as the word's id.
    '''
dic = {}
with open(path) as f:
for id, line in enumerate(f):
w = line.strip()
dic[w] = id
return dic
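
# A minimal usage sketch (assuming ./data/vocab.txt exists, one word per line):
#   vocab = load_dic('./data/vocab.txt')
#   ids = sent2ids('hello world', vocab)  # out-of-vocabulary words map to UNK (0)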